aldepyde 0.0.0a32__tar.gz → 0.0.0a35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aldepyde might be problematic. Click here for more details.
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/PKG-INFO +1 -1
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/__init__.py +0 -19
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/biomolecule/__init__.py +3 -0
- aldepyde-0.0.0a35/aldepyde/databases/SCOPe_Astral.py +71 -0
- aldepyde-0.0.0a35/aldepyde/databases/UniRef.py +114 -0
- aldepyde-0.0.0a35/aldepyde/databases/_database.py +72 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/rand/RandomProtein.py +2 -0
- aldepyde-0.0.0a35/aldepyde/rand/__init__.py +6 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde.egg-info/PKG-INFO +1 -1
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde.egg-info/SOURCES.txt +1 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/pyproject.toml +1 -1
- aldepyde-0.0.0a32/aldepyde/databases/UniRef.py +0 -75
- aldepyde-0.0.0a32/aldepyde/databases/_database.py +0 -38
- aldepyde-0.0.0a32/aldepyde/rand/__init__.py +0 -3
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/LICENSE +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/README.md +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/Parsers/_mmcif_parser.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/Parsers/_pdb_parser.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/_config.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/biomolecule/Residue.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/biomolecule/_Atom.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/biomolecule/_AtomFactory.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/biomolecule/_amino_acid.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/biomolecule/_dna.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/biomolecule/_pdb.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/biomolecule/_rna.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/biomolecule/utils.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/cache/__init__.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/cache/_cache.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/cache/cachemanager.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/cache/downloader.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/cache/utils.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/configurable.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/data/RemoteFileHandler.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/data/__init__.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/data.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/databases/PDB.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/databases/RemoteFileHandler.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/databases/__init__.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/env.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/fetcher/__init__.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/fetcher/test.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/json/CHG.json +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/json/Swiss_Prot.json +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/json/chemistry.json +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/stats/ProteinStats.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/stats/__init__.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde/utils.py +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde.egg-info/dependency_links.txt +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/aldepyde.egg-info/top_level.txt +0 -0
- {aldepyde-0.0.0a32 → aldepyde-0.0.0a35}/setup.cfg +0 -0
|
@@ -17,25 +17,6 @@ def get_cache() -> CacheManager:
|
|
|
17
17
|
global _cache_manager
|
|
18
18
|
return _cache_manager
|
|
19
19
|
|
|
20
|
-
# def get_cache() -> _cache_handler:
|
|
21
|
-
# global _cache
|
|
22
|
-
# # if _cache.null:
|
|
23
|
-
# # return create_cache()
|
|
24
|
-
# return _cache
|
|
25
|
-
|
|
26
|
-
# def SaveConfig(path: str="config.json", indent: str = "") -> None:
|
|
27
|
-
# global _config
|
|
28
|
-
# get_config().Save(path=path, indent=indent)
|
|
29
|
-
#
|
|
30
|
-
#
|
|
31
|
-
# def LoadConfig(s: dict | str, ignore_missing=False) -> None:
|
|
32
|
-
# global _config
|
|
33
|
-
# get_config().Load(s, ignore_missing=ignore_missing)
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
# from . import rand
|
|
37
|
-
# from . import biomolecule
|
|
38
|
-
# from . import fetcher
|
|
39
20
|
|
|
40
21
|
from importlib import import_module
|
|
41
22
|
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from aldepyde.databases._database import local_database
|
|
2
|
+
import operator
|
|
3
|
+
from contextlib import nullcontext
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
class scop_parser(local_database):
    """Reader and filter for a local SCOPe/ASTRAL FASTA file.

    The open binary handle is expected on ``self.fp``; ``local_database``
    sets it when the parser is entered as a context manager, so call
    ``extract_all_astral`` / ``partition_astral`` inside a ``with`` block.
    """

    # Three-argument boolean combinators, keyed by the ``mode`` accepted by
    # ``partition_astral``.
    op = {
        "and": lambda a, b, c: a and b and c,
        "or": lambda a, b, c: a or b or c,
    }

    # Header prefix: '>' + identifier, optional spaces, then a class letter
    # (a-l) followed by up to three dot-separated numeric levels.
    # Raw bytes literal with escaped dots so that '.' only matches a real dot.
    _header_regex = re.compile(rb">[a-zA-Z0-9_.]* *[a-l](\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?")

    def fetch(self, url):
        """Placeholder; not implemented yet (returns None)."""

    def fetch_code(self, codes):
        """Placeholder; not implemented yet (returns None)."""

    def parse(self, text):
        """Placeholder; not implemented yet (returns None)."""

    def extract_all_scop(self):
        """Placeholder; not implemented yet (returns None)."""

    def partition_scope(self):
        """Placeholder; not implemented yet (returns None)."""

    def extract_all_astral(self):
        """Yield every FASTA entry of the open file as one ``bytes`` object.

        An entry is a header line starting with ``>`` plus all following
        lines up to (not including) the next header. Nothing is yielded for
        an empty file.

        Raises:
            RuntimeError: if no file is open (``self.fp`` is None).
        """
        if self.fp is None:
            raise RuntimeError("No open file; use the parser inside a 'with' block.")
        parts = []
        # Iterate the handle directly so the file is streamed, not loaded whole.
        for line in self.fp:
            if line.startswith(b">") and parts:
                yield b"".join(parts)
                parts = []
            parts.append(line)
        if parts:
            yield b"".join(parts)

    # TODO allow a list of search parameters. Big challenge to make efficient, but could be cute
    def partition_astral(self, destination: None | str = None, append=False, class_name: str | bytes = b'',
                         contains_id: str | bytes = b'', contains_desc: str | bytes = b'', mode="and") -> dict:
        """Select entries by class, identifier and/or description substring.

        Matching is case-insensitive. Filters left empty are ignored: in
        ``"and"`` mode every supplied filter must match, in ``"or"`` mode at
        least one supplied filter must match. With no filter at all, every
        entry is selected.

        Args:
            destination: Path of a file that receives the raw text of every
                selected entry, or None to write nothing.
            append: Append to ``destination`` instead of truncating it.
            class_name: Substring looked for in the class token (e.g. ``a.1``).
            contains_id: Substring looked for in the identifier token.
            contains_desc: Substring looked for in the rest of the header line.
            mode: ``"and"`` or ``"or"`` (case-insensitive).

        Returns:
            dict mapping each selected identifier (bytes, including the
            leading ``>``) to a dict with the keys ``"class"``,
            ``"description"`` and ``"sequence"`` (all bytes).

        Raises:
            ValueError: if ``mode`` is neither ``"and"`` nor ``"or"``.
        """
        mode = mode.lower()
        if mode not in scop_parser.op:
            raise ValueError("mode must be \"and\" or \"or\".")
        logic = scop_parser.op[mode]

        # Everything is a byte string in order to play nicely with future parent methods
        def _as_needle(term):
            if isinstance(term, str):
                term = term.encode('utf-8')
            return term.lower()

        cls_term = _as_needle(class_name)
        id_term = _as_needle(contains_id)
        desc_term = _as_needle(contains_desc)
        no_filter = not (cls_term or id_term or desc_term)
        # An unset filter must not influence the result: it contributes the
        # identity element of the chosen combinator (True for and, False for or).
        neutral = mode == "and"

        def _hit(needle, haystack):
            return neutral if not needle else needle in haystack.lower()

        if destination is None:
            file_context = nullcontext()
        else:
            file_context = open(destination, 'ab' if append else 'wb')

        ret_dict = {}
        with file_context as fp:
            for entry in self.extract_all_astral():
                header = scop_parser._header_regex.search(entry)
                if header is None:
                    # Not a recognisable ASTRAL header; it cannot match any filter.
                    continue
                identifiers = header.group().split()
                if len(identifiers) < 2:
                    # Identifier and class are not separated; skip rather than crash.
                    continue
                sid, cls = identifiers[0], identifiers[1]
                # Strip only the header prefix; the rest of the first line is
                # the description and the remaining lines are the sequence.
                remainder = scop_parser._header_regex.sub(b'', entry, count=1).split(b'\n')
                desc = remainder[0]
                if no_filter or logic(_hit(cls_term, cls), _hit(id_term, sid), _hit(desc_term, desc)):
                    ret_dict[sid] = {  # Yes, I know '>' isn't part of the FASTA identifier. This keeps things more consistent
                        "class": cls,
                        "description": desc,
                        "sequence": b"".join(remainder[1:]),
                    }
                    # nullcontext() yields None when no destination was requested.
                    if fp is not None:
                        fp.write(entry)
        return ret_dict
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
import zlib
|
|
2
|
+
from aldepyde.databases._database import streamable_database
|
|
3
|
+
from aldepyde.utils import ProgressBar
|
|
4
|
+
|
|
5
|
+
class uniref_parser(streamable_database):
    """Streaming and download helpers for gzipped UniRef FASTA files.

    All functionality is exposed through static methods; sources are opened
    with ``streamable_database.open_stream`` and may be URLs or local paths.
    """

    # Location of the gzipped FASTA release for a clustering level (50/90/100).
    _URL_TEMPLATE = 'https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref{level}/uniref{level}.fasta.gz'

    def __init__(self):
        super().__init__()

    # TODO single entry parsing
    # TODO store metadata upon request
    # TODO implement abstract methods

    @staticmethod
    def _release_url(level):
        """Return the download URL of the UniRef release for ``level``."""
        return uniref_parser._URL_TEMPLATE.format(level=level)

    @staticmethod
    def _progress_bar(size, chunk_size, enabled):
        """Return a ProgressBar sized in chunks, or None when disabled.

        The bar is also disabled when the total size is unknown (``size`` is
        None), since no chunk count can be computed from it.
        """
        if not enabled or size is None:
            return None
        return ProgressBar(size // chunk_size)

    @staticmethod
    def stream_uniref_gz(filepath, chunk_size=8192, use_progress_bar=False, stitch=False):
        """Stream-decompress a gzipped FASTA from a URL or local path.

        Args:
            filepath: URL or local path handed to ``open_stream``.
            chunk_size: Number of compressed bytes read per step.
            use_progress_bar: Advance a ProgressBar once per compressed chunk.
            stitch: If False, yield raw decompressed ``bytes`` chunks. If
                True, yield one sequence per FASTA record instead (see
                ``stitch_streamed_sequences``).
        """
        if stitch:
            # Delegate before opening anything, so only one stream is opened.
            yield from uniref_parser.stitch_streamed_sequences(
                uniref_parser.stream_uniref_gz(filepath=filepath, chunk_size=chunk_size,
                                               use_progress_bar=use_progress_bar, stitch=False))
            return

        raw_stream, size = streamable_database.open_stream(filepath)
        try:
            pbar = uniref_parser._progress_bar(size, chunk_size, use_progress_bar)
            # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
            decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
            while True:
                comp_chunk = raw_stream.read(chunk_size)
                if not comp_chunk:
                    break
                if pbar is not None:
                    pbar.update()
                decomp_chunk = decompressor.decompress(comp_chunk)
                if decomp_chunk:
                    yield decomp_chunk
            final = decompressor.flush()
            if final:
                yield final
        finally:
            raw_stream.close()

    @staticmethod
    def download_file(url, destination, chunk_size=8192, use_progress_bar=False):
        """Copy ``url`` (or a local path) to ``destination`` chunk by chunk.

        The data is written as received; nothing is decompressed.
        """
        raw_stream, size = streamable_database.open_stream(url)
        try:
            pbar = uniref_parser._progress_bar(size, chunk_size, use_progress_bar)
            with open(destination, 'wb') as fp:
                while True:
                    chunk = raw_stream.read(chunk_size)
                    if not chunk:
                        break
                    if pbar is not None:
                        pbar.update()
                    fp.write(chunk)
        finally:
            # Release the connection / file handle even if writing fails.
            raw_stream.close()

    @staticmethod
    def stitch_streamed_sequences(stream, as_str=True):
        """Reassemble FASTA records from arbitrarily chunked bytes.

        Args:
            stream: Iterable of ``bytes`` chunks of FASTA text.
            as_str: Yield ``str`` (decoded) instead of ``bytes``.

        Yields:
            The sequence of each record, i.e. every line after the header
            joined without newlines. Headers are discarded. Nothing is
            yielded for an empty stream.
        """
        buffer = b''
        for chunk in stream:
            buffer += chunk
            # A record is complete once the next header has started. Headers
            # begin with '>' at the start of a line, so split on "\n>" and not
            # on a bare '>' that may occur inside a header's description.
            *complete, buffer = buffer.split(b'\n>')
            for record in complete:
                sequence = uniref_parser._final_sequence(record)
                yield sequence.decode() if as_str else sequence
        if buffer:
            sequence = uniref_parser._final_sequence(buffer)
            yield sequence.decode() if as_str else sequence

    @staticmethod
    def _final_sequence(buffer):
        """Drop the first (header) line of a record and join the rest."""
        lines = buffer.split(b'\n')
        return b"".join(lines[1:])

    @staticmethod
    def stream_uniref50(chunk_size=8192, use_progress_bar=False, stitch=False):
        """Stream the UniRef50 release; see ``stream_uniref_gz``."""
        yield from uniref_parser.stream_uniref_gz(uniref_parser._release_url(50), chunk_size=chunk_size,
                                                  use_progress_bar=use_progress_bar, stitch=stitch)

    @staticmethod
    def stream_uniref90(chunk_size=8192, use_progress_bar=False, stitch=False):
        """Stream the UniRef90 release; see ``stream_uniref_gz``."""
        yield from uniref_parser.stream_uniref_gz(uniref_parser._release_url(90), chunk_size=chunk_size,
                                                  use_progress_bar=use_progress_bar, stitch=stitch)

    @staticmethod
    def stream_uniref100(chunk_size=8192, use_progress_bar=False, stitch=False):
        """Stream the UniRef100 release; see ``stream_uniref_gz``."""
        yield from uniref_parser.stream_uniref_gz(uniref_parser._release_url(100), chunk_size=chunk_size,
                                                  use_progress_bar=use_progress_bar, stitch=stitch)

    @staticmethod
    def download_uniref50(destination='uniref50.fasta.gz', chunk_size=8192, use_progress_bar=False):
        """Download the UniRef50 release to ``destination``."""
        uniref_parser.download_file(uniref_parser._release_url(50), destination=destination,
                                    chunk_size=chunk_size, use_progress_bar=use_progress_bar)

    @staticmethod
    def download_uniref90(destination='uniref90.fasta.gz', chunk_size=8192, use_progress_bar=False):
        """Download the UniRef90 release to ``destination``."""
        uniref_parser.download_file(uniref_parser._release_url(90), destination=destination,
                                    chunk_size=chunk_size, use_progress_bar=use_progress_bar)

    @staticmethod
    def download_uniref100(destination='uniref100.fasta.gz', chunk_size=8192, use_progress_bar=False):
        """Download the UniRef100 release to ``destination``."""
        uniref_parser.download_file(uniref_parser._release_url(100), destination=destination,
                                    chunk_size=chunk_size, use_progress_bar=use_progress_bar)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
import gzip
|
|
3
|
+
import requests
|
|
4
|
+
import os
|
|
5
|
+
from typing import Tuple, BinaryIO
|
|
6
|
+
from io import TextIOWrapper
|
|
7
|
+
|
|
8
|
+
class streamable_database(ABC):
|
|
9
|
+
|
|
10
|
+
def __init__(self):
|
|
11
|
+
pass
|
|
12
|
+
|
|
13
|
+
@abstractmethod
|
|
14
|
+
def fetch(self, url):
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
@abstractmethod
|
|
18
|
+
def fetch_code(self, codes):
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
@abstractmethod
|
|
22
|
+
def parse(self, text):
|
|
23
|
+
pass
|
|
24
|
+
|
|
25
|
+
@staticmethod
|
|
26
|
+
def open_stream(source:str) -> Tuple[BinaryIO, int] | None:
|
|
27
|
+
if source.startswith('http://') or source.startswith('https://'):
|
|
28
|
+
resp = requests.get(source, stream=True)
|
|
29
|
+
resp.raise_for_status()
|
|
30
|
+
length = resp.headers.get("Content-Length")
|
|
31
|
+
return resp.raw, int(length) if length else None
|
|
32
|
+
else:
|
|
33
|
+
size = os.path.getsize(source)
|
|
34
|
+
return open(source, 'rb'), size
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Yes, I know the first conditionals do the same thing
|
|
39
|
+
|
|
40
|
+
def __call__(self):
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
class local_database(ABC):
    """Context manager around a database stored in a local file.

    Entering opens ``filepath`` in binary mode and records its size;
    leaving closes the handle again.
    """

    def __init__(self, filepath=None, as_fp=False):
        """Remember the file to open.

        Args:
            filepath: Path of the database file; may be set later with
                ``load_path``.
            as_fp: If True, ``with`` yields the open file object instead of
                this instance.
        """
        self.fp = None  # open binary handle while inside the context, else None
        self.as_fp = as_fp
        self.size = None  # file size in bytes, filled in by __enter__
        self.load_path(filepath)

    def load_path(self, filepath):
        """Set the path that the next ``__enter__`` will open."""
        self.filepath = filepath

    def get_pointer(self):
        """Return the currently open file object, or None if not open."""
        return self.fp

    def __enter__(self):
        """Open the file and return the instance (or the handle if ``as_fp``).

        Raises:
            TypeError: if no filepath has been set.
            OSError: if the file cannot be inspected or opened.
        """
        if self.filepath is None:
            # Same exception type os.path.getsize(None) would raise, but with
            # a message that says what is actually wrong.
            raise TypeError("No filepath set; pass one to the constructor or call load_path() first.")
        if self.fp is not None:
            # Re-entered without leaving: close the old handle so it is not leaked.
            self.fp.close()
            self.fp = None
        self.fp, self.size = local_database.open_stream(self.filepath)
        return self.fp if self.as_fp else self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the file if it is open; exceptions are not suppressed."""
        if self.fp is not None:
            self.fp.close()
            self.fp = None

    @staticmethod
    def open_stream(source: str) -> Tuple[BinaryIO, int]:
        """Open the local file ``source`` for binary reading.

        Returns:
            ``(stream, size)`` where ``size`` is the file size in bytes. The
            caller must close ``stream``.
        """
        size = os.path.getsize(source)
        return open(source, 'rb'), size
|
|
@@ -11,6 +11,8 @@ class InvalidDistribution(Exception):
|
|
|
11
11
|
class ImpossibleSetting(Exception):
|
|
12
12
|
pass
|
|
13
13
|
|
|
14
|
+
|
|
15
|
+
# TODO: Clean up this whole module so that it better aligns with modern Python.
|
|
14
16
|
class RandomProtein:
|
|
15
17
|
# Hardcode data for now
|
|
16
18
|
def __init__(self, His_Is_Charged=True, Cys_Is_Polar=True, Charged_Is_Polar=True, Distribution="Swiss"):
|
|
@@ -31,6 +31,7 @@ aldepyde/data/RemoteFileHandler.py
|
|
|
31
31
|
aldepyde/data/__init__.py
|
|
32
32
|
aldepyde/databases/PDB.py
|
|
33
33
|
aldepyde/databases/RemoteFileHandler.py
|
|
34
|
+
aldepyde/databases/SCOPe_Astral.py
|
|
34
35
|
aldepyde/databases/UniRef.py
|
|
35
36
|
aldepyde/databases/__init__.py
|
|
36
37
|
aldepyde/databases/_database.py
|
|
@@ -1,75 +0,0 @@
|
|
|
1
|
-
import zlib
|
|
2
|
-
|
|
3
|
-
from aldepyde.databases.RemoteFileHandler import RemoteFileHandler
|
|
4
|
-
from aldepyde.databases._database import _database
|
|
5
|
-
from aldepyde.utils import ProgressBar
|
|
6
|
-
import os
|
|
7
|
-
import gzip
|
|
8
|
-
|
|
9
|
-
class uniref_parser(_database):
|
|
10
|
-
def __init__(self):
|
|
11
|
-
pass
|
|
12
|
-
|
|
13
|
-
#TODO Fix the total calculation
|
|
14
|
-
@staticmethod
|
|
15
|
-
def stream_uniref_gz(filepath, chunk_size=8192, use_progress_bar=False):
|
|
16
|
-
if use_progress_bar is not None:
|
|
17
|
-
raw_stream, size = _database.open_stream(filepath)
|
|
18
|
-
pbar = ProgressBar(size//chunk_size) if use_progress_bar else None
|
|
19
|
-
decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
|
|
20
|
-
try:
|
|
21
|
-
while True:
|
|
22
|
-
comp_chunk = raw_stream.read(chunk_size)
|
|
23
|
-
if not comp_chunk:
|
|
24
|
-
break
|
|
25
|
-
if pbar is not None:
|
|
26
|
-
pbar.update()
|
|
27
|
-
decomp_chunk = decompressor.decompress(comp_chunk)
|
|
28
|
-
if decomp_chunk:
|
|
29
|
-
yield decomp_chunk
|
|
30
|
-
final = decompressor.flush()
|
|
31
|
-
if final:
|
|
32
|
-
yield final
|
|
33
|
-
finally:
|
|
34
|
-
raw_stream.close()
|
|
35
|
-
|
|
36
|
-
@staticmethod
|
|
37
|
-
def stitch_streamed_sequences(stream, as_str=True):
|
|
38
|
-
buffer = b''
|
|
39
|
-
for chunk in stream:
|
|
40
|
-
buffer += chunk
|
|
41
|
-
while buffer.count(b'>') >= 2:
|
|
42
|
-
sequences = [b">" + seq for seq in buffer.split(b">") if seq != b""]
|
|
43
|
-
buffer = buffer[buffer.rfind(b">"):]
|
|
44
|
-
ret_l = [b"".join(sequence.split(b'\n')[1:]).replace(b"\n", b"") for sequence in sequences[:-1]]
|
|
45
|
-
for s in ret_l:
|
|
46
|
-
yield s if not as_str else s.decode()
|
|
47
|
-
yield uniref_parser._final_sequence(buffer) if not as_str else uniref_parser._final_sequence(buffer).decode()
|
|
48
|
-
|
|
49
|
-
@staticmethod
|
|
50
|
-
def _final_sequence(buffer):
|
|
51
|
-
lines = buffer.split(b'\n')
|
|
52
|
-
return b"".join(lines[1:])
|
|
53
|
-
|
|
54
|
-
@staticmethod
|
|
55
|
-
def stream_uniref50(chunk_size=8192, use_progress_bar=False):
|
|
56
|
-
yield from uniref_parser.stream_uniref_gz('https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz',
|
|
57
|
-
chunk_size=chunk_size, use_progress_bar=use_progress_bar)
|
|
58
|
-
# yield from RemoteFileHandler.stream_url(
|
|
59
|
-
# 'https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz', chunk_size=chunk_size)
|
|
60
|
-
|
|
61
|
-
@staticmethod
|
|
62
|
-
def stream_uniref90(chunk_size=8192, use_progress_bar=False):
|
|
63
|
-
yield from uniref_parser.stream_uniref_gz('https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz',
|
|
64
|
-
chunk_size=chunk_size, use_progress_bar=use_progress_bar)
|
|
65
|
-
|
|
66
|
-
# yield from RemoteFileHandler.stream_url(
|
|
67
|
-
# 'https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz', chunk_size=chunk_size)
|
|
68
|
-
|
|
69
|
-
@staticmethod
|
|
70
|
-
def stream_uniref100(chunk_size=8192, use_progress_bar=False):
|
|
71
|
-
yield from uniref_parser.stream_uniref_gz('https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz',
|
|
72
|
-
chunk_size=chunk_size, use_progress_bar=use_progress_bar)
|
|
73
|
-
|
|
74
|
-
# yield from RemoteFileHandler.stream_url(
|
|
75
|
-
# 'https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz', chunk_size=chunk_size)
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
import gzip
|
|
3
|
-
import requests
|
|
4
|
-
import os
|
|
5
|
-
from typing import Tuple, BinaryIO
|
|
6
|
-
from io import TextIOWrapper
|
|
7
|
-
|
|
8
|
-
class _database(ABC):
|
|
9
|
-
|
|
10
|
-
@abstractmethod
|
|
11
|
-
def fetch(self, url):
|
|
12
|
-
pass
|
|
13
|
-
|
|
14
|
-
@abstractmethod
|
|
15
|
-
def fetch_code(self, codes):
|
|
16
|
-
pass
|
|
17
|
-
|
|
18
|
-
@abstractmethod
|
|
19
|
-
def parse(self, text):
|
|
20
|
-
pass
|
|
21
|
-
|
|
22
|
-
@staticmethod
|
|
23
|
-
def open_stream(source:str) -> Tuple[BinaryIO, int] | None:
|
|
24
|
-
if source.startswith('http://') or source.startswith('https://'):
|
|
25
|
-
resp = requests.get(source, stream=True)
|
|
26
|
-
resp.raise_for_status()
|
|
27
|
-
length = resp.headers.get("Content-Length")
|
|
28
|
-
return resp.raw, int(length) if length else None
|
|
29
|
-
else:
|
|
30
|
-
size = os.path.getsize(source)
|
|
31
|
-
return open(source, 'rb'), size
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# Yes, I know the first conditionals do the same thing
|
|
36
|
-
|
|
37
|
-
def __call__(self):
|
|
38
|
-
pass
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|