seqsearch 2.2.1__tar.gz → 2.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. {seqsearch-2.2.1 → seqsearch-2.2.4}/LICENSE.txt +1 -1
  2. {seqsearch-2.2.1 → seqsearch-2.2.4}/PKG-INFO +26 -9
  3. {seqsearch-2.2.1 → seqsearch-2.2.4}/README.md +2 -2
  4. seqsearch-2.2.4/pyproject.toml +47 -0
  5. seqsearch-2.2.4/seqsearch/__init__.py +11 -0
  6. seqsearch-2.2.4/seqsearch/databases/__init__.py +223 -0
  7. seqsearch-2.2.4/seqsearch/databases/human.py +51 -0
  8. seqsearch-2.2.4/seqsearch/databases/humgut.py +238 -0
  9. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/mothur/greengenes.py +0 -2
  10. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/mothur/rdp.py +0 -2
  11. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/mothur/silva.py +0 -2
  12. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/ncbi_16s.py +2 -2
  13. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/nr.py +1 -8
  14. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/nt.py +2 -4
  15. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/pfam.py +7 -9
  16. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/pr_two.py +2 -4
  17. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/refseq.py +3 -5
  18. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/silva.py +2 -4
  19. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/string.py +2 -9
  20. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/tigrfam.py +2 -4
  21. {seqsearch-2.2.1/seqsearch/databases → seqsearch-2.2.4/seqsearch/examples}/download.py +0 -6
  22. seqsearch-2.2.4/seqsearch/mapping/bam.py +35 -0
  23. seqsearch-2.2.4/seqsearch/mapping/bam_and_sam.py +186 -0
  24. seqsearch-2.2.4/seqsearch/mapping/bwa.py +103 -0
  25. seqsearch-2.2.4/seqsearch/mapping/sam.py +27 -0
  26. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/blast.py +12 -3
  27. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/hmmer.py +1 -1
  28. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/vsearch.py +1 -3
  29. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch.egg-info/PKG-INFO +26 -9
  30. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch.egg-info/SOURCES.txt +8 -2
  31. seqsearch-2.2.4/seqsearch.egg-info/requires.txt +12 -0
  32. seqsearch-2.2.1/seqsearch/__init__.py +0 -16
  33. seqsearch-2.2.1/seqsearch/databases/__init__.py +0 -139
  34. seqsearch-2.2.1/seqsearch.egg-info/requires.txt +0 -12
  35. seqsearch-2.2.1/setup.py +0 -38
  36. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/examples/blast_object/simple_blast.py +0 -0
  37. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/examples/seqsearch_object/simple_query.py +0 -0
  38. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/__init__.py +0 -0
  39. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/core.py +0 -0
  40. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/parallel.py +0 -0
  41. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch.egg-info/dependency_links.txt +0 -0
  42. {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch.egg-info/top_level.txt +0 -0
  43. {seqsearch-2.2.1 → seqsearch-2.2.4}/setup.cfg +0 -0
@@ -1,4 +1,4 @@
1
- Copyright (c) 2021 Lucas Sinclair
1
+ Copyright (c) 2026 Lucas Sinclair
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
4
 
@@ -1,21 +1,38 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: seqsearch
3
- Version: 2.2.1
3
+ Version: 2.2.4
4
4
  Summary: Sequence similarity searches (e.g. BLAST) made easy.
5
- Home-page: https://github.com/xapple/seqsearch
6
- Author: Lucas Sinclair
7
- Author-email: lucas.sinclair@me.com
8
- License: MIT
5
+ Author-email: Lucas Sinclair <lucas.sinclair@me.com>
6
+ License: Copyright (c) 2026 Lucas Sinclair
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
11
+
12
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
13
+ Project-URL: Homepage, https://github.com/xapple/seqsearch
14
+ Keywords: bioinformatics,dna,sequences,search
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
9
17
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
10
18
  Requires-Python: >=3.8
11
19
  Description-Content-Type: text/markdown
20
+ License-File: LICENSE.txt
21
+ Requires-Dist: autopaths>=1.6.2
22
+ Requires-Dist: plumbing>=2.11.3
23
+ Requires-Dist: fasta>=2.3.6
24
+ Requires-Dist: runps>=4.1.1
25
+ Requires-Dist: biopython
26
+ Requires-Dist: tqdm
12
27
  Provides-Extra: ftp
28
+ Requires-Dist: ftputil; extra == "ftp"
13
29
  Provides-Extra: downloads
14
- License-File: LICENSE.txt
30
+ Requires-Dist: wget; extra == "downloads"
31
+ Dynamic: license-file
15
32
 
16
33
  [![PyPI version](https://badge.fury.io/py/seqsearch.svg)](https://badge.fury.io/py/seqsearch)
17
34
 
18
- # `seqsearch` version 2.2.1
35
+ # `seqsearch` version 2.2.4
19
36
 
20
37
  `seqsearch` is a python package for dealing sequence similarity searches (e.g. BLAST on DNA sequences) and automation.
21
38
 
@@ -53,7 +70,7 @@ Bellow are some examples to illustrate the various ways there are to use this pa
53
70
 
54
71
  ### Searches
55
72
 
56
- You can parallelize BLAST searches by splitting the input into several files. It's easier to chop-up the input, because database chopping requires message passing across the nodes like mpiblast does (when and if it works).
73
+ You can parallelize BLAST searches by splitting the input into several files. It's easier to chop-up the input, because database chopping requires message passing across the nodes like `mpiblast` does (when and if it works).
57
74
 
58
75
  Input chopping is fine as long as the database to search against fits in the RAM of the nodes. If the input is small and the database is large you can always switch them one for the other (in most cases).
59
76
 
@@ -1,6 +1,6 @@
1
1
  [![PyPI version](https://badge.fury.io/py/seqsearch.svg)](https://badge.fury.io/py/seqsearch)
2
2
 
3
- # `seqsearch` version 2.2.1
3
+ # `seqsearch` version 2.2.4
4
4
 
5
5
  `seqsearch` is a python package for dealing sequence similarity searches (e.g. BLAST on DNA sequences) and automation.
6
6
 
@@ -38,7 +38,7 @@ Bellow are some examples to illustrate the various ways there are to use this pa
38
38
 
39
39
  ### Searches
40
40
 
41
- You can parallelize BLAST searches by splitting the input into several files. It's easier to chop-up the input, because database chopping requires message passing across the nodes like mpiblast does (when and if it works).
41
+ You can parallelize BLAST searches by splitting the input into several files. It's easier to chop-up the input, because database chopping requires message passing across the nodes like `mpiblast` does (when and if it works).
42
42
 
43
43
  Input chopping is fine as long as the database to search against fits in the RAM of the nodes. If the input is small and the database is large you can always switch them one for the other (in most cases).
44
44
 
@@ -0,0 +1,47 @@
1
# Build configuration and package metadata (PEP 621) for seqsearch.
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "seqsearch"
version = "2.2.4"

description = "Sequence similarity searches (e.g. BLAST) made easy."
readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE.txt" }
requires-python = ">=3.8"

authors = [
    { name = "Lucas Sinclair", email = "lucas.sinclair@me.com" },
]

keywords = ["bioinformatics", "dna", "sequences", "search"]

classifiers = [
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Topic :: Scientific/Engineering :: Bio-Informatics",
]

# Hard runtime requirements.
dependencies = [
    "autopaths>=1.6.2",
    "plumbing>=2.11.3",
    "fasta>=2.3.6",
    "runps>=4.1.1",
    "biopython",
    "tqdm",
]

# Optional extras: 'ftp' enables FTP-backed databases (ftputil),
# 'downloads' enables wget-style retrieval.
[project.optional-dependencies]
ftp = ["ftputil"]
downloads = ["wget"]

[project.urls]
Homepage = "https://github.com/xapple/seqsearch"

[tool.setuptools]
include-package-data = true

# Auto-discover packages (keeping namespace packages), excluding examples.
[tool.setuptools.packages.find]
namespaces = true
exclude = ["example"]
@@ -0,0 +1,11 @@
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Written by Lucas Sinclair.
MIT Licensed.
Contact at www.sinclair.bio
"""

# Special variables #
# Package version — keep in sync with the `version` field of pyproject.toml #
__version__ = '2.2.4'
@@ -0,0 +1,223 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Written by Lucas Sinclair.
6
+ MIT Licensed.
7
+ Contact at www.sinclair.bio
8
+ """
9
+
10
+ # Built-in modules #
11
+ import os, fnmatch
12
+ from collections import Counter
13
+ import urllib.request
14
+
15
+ # First party modules #
16
+ from fasta import FASTA
17
+ from autopaths.auto_paths import AutoPaths
18
+ from autopaths.dir_path import DirectoryPath
19
+ from autopaths.file_path import FilePath
20
+ from plumbing.cache import property_cached
21
+ from plumbing.common import natural_sort
22
+ from plumbing.scraping import download_from_url
23
+
24
+ # Third party modules #
25
+ from tqdm import tqdm
26
+
27
# Constants #
home = os.environ.get('HOME', '~') + '/'
base_directory = DirectoryPath(home + "databases/")

###############################################################################
class Database:
    """General database object to inherit from.

    Subclasses must define at least a `short_name` class attribute, plus
    whatever the chosen download mechanism needs (e.g. a `files` list).
    Raw downloads are stored under `~/databases/<short_name>/raw/` and
    decompressed into the `unzipped/` subdirectory.
    """

    all_paths = """
    /raw/
    /unzipped/
    /blast_db/
    """

    def __init__(self, seq_type='nucl', base_dir=None):
        """
        seq_type: either 'prot' or 'nucl' (used when making a BLAST db).
        base_dir: where to store the database; defaults to `~/databases/`.
        """
        # The sequence type is either 'prot' or 'nucl' #
        self.seq_type = seq_type
        # The default base directory #
        if base_dir is None: base_dir = base_directory
        # Make base_dir object #
        self.base_dir = base_dir + self.short_name + '/'
        # Make autopaths object #
        self.autopaths = AutoPaths(self.base_dir, self.all_paths)

    def __repr__(self):
        # Get the name of this class #
        name = self.__class__.__name__
        # Return a user-friendly string #
        return '<%s at "%s">' % (name, self.base_dir)

    def __bool__(self):
        """
        Return True if the database was already downloaded and the
        results are stored on the filesystem. Return False otherwise.
        """
        return not self.autopaths.unzipped_dir.empty

    @property_cached
    def files_to_retrieve(self):
        """The files we want to download with their destinations."""
        return {f: FilePath(self.autopaths.raw_dir + f)
                for f in self.files}

    #---------------------------- GZIP compression ---------------------------#
    def ungzip(self):
        """Decompress every gzipped raw file into the unzipped directory."""
        # Only process files that actually carry a gzip extension.
        # NB: this check used to be inverted — it skipped exactly the
        # '.gz'/'.gzip' files and tried to ungzip everything else.
        for f in tqdm(self.raw_files):
            if not (f.endswith('.gz') or f.endswith('.gzip')): continue
            destination = self.autopaths.unzipped_dir + f.prefix
            f.ungzip_to(destination)
        # Make them only readable #
        for f in self.autopaths.unzipped_dir:
            f.permissions.only_readable()

    def untargz(self):
        """Expand every tarball found in the raw directory."""
        # Only process files that actually carry a tarball extension.
        # NB: this check used to be inverted, same as in `ungzip`.
        for f in tqdm(self.raw_files):
            if not (f.endswith('.tar.gz') or f.endswith('.tgz')): continue
            f.untargz_to(self.autopaths.unzipped_dir)
        # Make them only readable #
        for f in self.autopaths.unzipped_dir:
            f.permissions.only_readable()

    @property
    def raw_files(self):
        """The files we have downloaded."""
        return self.autopaths.raw_dir.contents

    @property
    def unzipped_files(self):
        """The files we have decompressed."""
        return self.autopaths.unzipped_dir.contents

    #------------------------ Only for FASTA databases -----------------------#
    @property
    def fasta_files(self):
        """The downloaded files wrapped as FASTA objects."""
        return map(FASTA, self.raw_files)

    @property
    def sequences(self):
        """All the sequences from all the raw files."""
        # Iterate over the FASTA wrappers, not bare file paths, so that
        # we actually yield sequence records (previously this looped on
        # `raw_files`, which are plain paths).
        for fasta in self.fasta_files:
            for seq in fasta: yield seq

    #------------------ Only for preformatted BLAST databases ----------------#
    @property_cached
    def blast_db(self):
        """A BLASTable version of the sequences."""
        # Import here to avoid a circular dependency at module load #
        from seqsearch.search.blast import BLASTdb
        # Create object #
        db = BLASTdb(self.autopaths.unzipped_dir + self.db_name,
                     self.seq_type)
        # Return #
        return db

    #--------------------- Only for taxonomic databases ----------------------#
    @property_cached
    def tax_depth_freq(self):
        """Frequency of each classification depth in the taxonomy file."""
        def depths():
            # The taxonomy file is tab-separated: OTU name, then a
            # semicolon-delimited lineage string #
            with open(self.taxonomy, 'r') as handle:
                for line in handle:
                    line = line.strip('\n')
                    otu_name, species = line.split('\t')
                    yield len(species.split(';'))
        return Counter(depths())

    #-------------------- Only for BWA indexed databases ---------------------#
    @property
    def bwa_index(self):
        """Path prefix (without extension) of the BWA index files."""
        # Get the first file #
        first = next(self.unzipped_files)
        # Get only the prefix without the extension #
        return first.prefix_path
144
+
145
###############################################################################
class DatabaseHTTP(Database):
    """A database that is stored on an HTTP server.

    The mapping of remote file names to local destinations comes from the
    inherited `Database.files_to_retrieve` property (the override that used
    to live here was a byte-for-byte duplicate and has been removed).
    """

    @property
    def files_remaining(self):
        """The files we haven't downloaded yet based on size checks."""
        # Ask the server for the size of a remote file #
        def get_size_http(url):
            response = urllib.request.urlopen(url)
            return int(response.getheader("Content-Length"))
        # Keep a file if it is absent locally or its size differs from
        # the server's Content-Length (i.e. partial/corrupt download) #
        return {source: dest
                for source, dest in self.files_to_retrieve.items()
                if dest.count_bytes == 0 or
                   dest.count_bytes != get_size_http(self.base_url + source)}

    def download(self):
        """Retrieve all files from the website."""
        for source, dest in self.files_remaining.items():
            # Get the full URL #
            url = self.base_url + source
            # Similar to wget #
            download_from_url(url, dest,
                              stream   = True,
                              progress = True,
                              desc     = source,
                              cleanup  = True,
                              )
            # Make it readable only #
            dest.permissions.only_readable()
182
+
183
###############################################################################
class DatabaseFTP(Database):
    """A database that is stored on an FTP server."""

    @property_cached
    def ftp(self):
        """If the data is to be obtained by FTP, here is the ftputil object."""
        from ftputil import FTPHost
        host = FTPHost(self.ftp_url, "anonymous")
        host.chdir(self.ftp_dir)
        return host

    @property_cached
    def files_to_retrieve(self):
        """The files we want to download with their destinations."""
        # Case one: the subclass specified a glob pattern #
        if hasattr(self, "pattern"):
            listing = self.ftp.listdir(self.ftp.curdir)
            listing.sort(key=natural_sort)
            matches = (name for name in listing
                       if fnmatch.fnmatch(name, self.pattern))
            return {name: FilePath(self.autopaths.raw_dir + name)
                    for name in matches}
        # Case two: the subclass specified an explicit list of files #
        if hasattr(self, "files"):
            return {name: FilePath(self.autopaths.raw_dir + name)
                    for name in self.files}

    @property
    def files_remaining(self):
        """The files we haven't downloaded yet based on size checks."""
        remaining = {}
        for source, dest in self.files_to_retrieve.items():
            if dest.count_bytes != self.ftp.path.getsize(source):
                remaining[source] = dest
        return remaining

    def download(self):
        """Retrieve all files from the FTP site."""
        # Create the directory #
        self.base_dir.create_if_not_exists()
        # Loop over files #
        for source, dest in tqdm(self.files_remaining.items()):
            dest.remove()
            self.ftp.download(source, dest)
            dest.permissions.only_readable()
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Written by Lucas Sinclair.
6
+ MIT Licensed.
7
+ Contact at www.sinclair.bio
8
+ """
9
+
10
+ # Built-in modules #
11
+ import os
12
+
13
+ # First party modules #
14
+ from seqsearch.databases import DatabaseFTP
15
+
16
+ # Third party modules #
17
+
18
# Constants #
home = os.environ.get('HOME', '~') + '/'

###############################################################################
class HumanGenome(DatabaseFTP):
    """
    The NCBI provides the latest version of the human genome, as well
    as preformatted files here:

    https://www.ncbi.nlm.nih.gov/genome/guide/human/

    To install:

    >>> from seqsearch.databases.human import hg38
    >>> hg38.download()
    >>> hg38.untargz()
    >>> hg38.autopaths.raw_dir.remove()

    It will place the resulting files in "~/databases/grc_h38/"
    (the directory name comes from `short_name` via the base
    `Database` class).
    """

    # Identifiers for this database #
    tag = "hg38"
    short_name = "grc_h38"
    long_name = 'Human Genome v38 at NCBI'

    # Location on the NCBI FTP server #
    ftp_url = "ftp.ncbi.nlm.nih.gov"
    ftp_dir = "genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/" \
              "seqs_for_alignment_pipelines.ucsc_ids/"

    # A single tarball: the full analysis set already indexed for BWA #
    files = ['GCA_000001405.15_GRCh38_full_analysis_set.fna.bwa_index.tar.gz']

###############################################################################
# Create a singleton #
hg38 = HumanGenome()
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Written by Lucas Sinclair.
6
+ MIT Licensed.
7
+ Contact at www.sinclair.bio
8
+ """
9
+
10
+ # Built-in modules #
11
+ import os, re, tarfile, shutil, functools
12
+
13
+ # First party modules #
14
+ from seqsearch.databases import DatabaseHTTP
15
+ from fasta import FASTA
16
+ from plumbing.timer import Timer
17
+
18
+ # Third party modules #
19
+ import pandas
20
+ from tqdm import tqdm
21
+
22
+ # Constants #
23
+ home = os.environ.get('HOME', '~') + '/'
24
+
25
###############################################################################
class HumGutGenome:
    """A genome found in the database, with associated counts."""

    def __init__(self, tax_id, count):
        """Record the HumGut taxonomy ID along with its count."""
        self.tax_id = tax_id
        self.count = count

    def __repr__(self):
        return f'<{self.__class__.__name__} object ID {self.tax_id}>'

    @functools.cached_property
    def metadata(self):
        """
        The row of the HumGut metadata table matching this genome's
        taxonomy ID. Fields include, among others: 'HumGut_name',
        'cluster975', 'cluster95', 'gtdbtk_tax_id', 'gtdbtk_taxonomy',
        'ncbi_tax_id', 'ncbi_organism_name', 'prevalence_score',
        'completeness', 'contamination', 'GC', 'genome_size',
        'genome_file' and 'ftp_download'.
        """
        return humgut.id_to_metadata(self.tax_id)

    @functools.cached_property
    def tax(self):
        """
        The GTDB taxonomy of this genome as a list of names, e.g.
        ['Bacteria', 'Firmicutes', 'Bacilli', 'Lactobacillales',
        'Enterococcaceae', 'Enterococcus_D']. When the classification
        does not go all the way down, 'Unclassified' entries pad the
        list so it matches the length of `humgut.rank_names`.
        """
        # Pull every ';'-terminated rank out of a string shaped like
        # 'd__Bacteria;p__Firmicutes;...' #
        ranks = re.findall('__(.+?);', self.metadata['gtdbtk_taxonomy'])
        # Pad with 'Unclassified' down to the species level #
        padding = len(humgut.rank_names) - len(ranks)
        return ranks + ['Unclassified'] * padding
89
+
90
###############################################################################
class HumGut(DatabaseHTTP):
    """
    Quote from the paper:

    "We screened > 5,700 healthy human gut metagenomes for the containment of
    > 490,000 publicly available prokaryotic genomes sourced from RefSeq and
    the recently announced UHGG collection. This resulted in a pool of
    > 381,000 genomes that were subsequently scored and ranked based on their
    prevalence in the healthy human metagenomes. The genomes were then
    clustered at a 97.5% sequence identity resolution, and cluster
    representatives (30,691 in total) were retained to comprise the HumGut
    collection."

    The publication is here:

    * https://doi.org/10.1186/s40168-021-01114-w

    The download website is here:

    * http://arken.nmbu.no/~larssn/humgut/

    To install:

    >>> from seqsearch.databases.humgut import humgut
    >>> humgut.download()
    >>> humgut.untargz()
    >>> humgut.get_95_cluster()
    >>> humgut.autopaths.tar.remove()
    >>> humgut.make_bwa_database()

    It will place the resulting files in "~/databases/humgut/".
    """

    tag = "humgut"
    short_name = "humgut"
    long_name = 'HumGut: a comprehensive human gut prokaryotic genomes' \
                ' collection'

    base_url = "http://arken.nmbu.no/~larssn/humgut/"

    files = ['HumGut.tar.gz',
             'HumGut.tsv',
             'ncbi_names.dmp',
             'ncbi_nodes.dmp']

    all_paths = """
    /raw/HumGut.tar.gz
    /raw/HumGut.tsv
    /raw/ncbi_names.dmp
    /raw/ncbi_nodes.dmp
    /cluster95/humgut95.fasta.gz
    /bwa_db/
    """

    #------------------------------ Properties -------------------------------#
    @functools.cached_property
    def metadata(self):
        """Parse the TSV metadata with pandas"""
        # Load the file in memory #
        df = pandas.read_csv(self.autopaths.HumGut_tsv, sep='\t')
        # Change index #
        df = df.set_index('HumGut_tax_id')
        # Return #
        return df

    @property
    def bwa_index(self):
        """Path prefix of the BWA index inside the bwa_db directory."""
        return self.autopaths.bwa_db_dir + 'humgut95'

    #--------------------------- Extra information ----------------------------#
    @property
    def rank_names(self):
        """The seven taxonomic ranks used throughout this database."""
        return ['Domain',   # 1 (This is Bacteria, Archaea or Eucarya)
                'Phylum',   # 2 (This is for instance 'Firmicutes')
                'Class',    # 3
                'Order',    # 4
                'Family',   # 5
                'Genus',    # 6
                'Species']  # 7

    #------------------------------- Methods ---------------------------------#
    def id_to_metadata(self, tax_id):
        """
        Like a dictionary for quick look up of a genome based on its
        taxonomy ID.
        """
        return self.metadata.loc[tax_id]

    def get_95_cluster(self):
        """
        Will write a large compressed FASTA file with all genomes of
        interest concatenated together. To do this, we will pick single files
        out of the large TAR archive provided and append them one by one.
        See the documentation on GitHub here:
        * https://github.com/larssnip/HumGut#the-humgut-library
        """
        # Keep the first representative of every cluster #
        df = self.metadata.drop_duplicates(subset = 'cluster95')
        # Get the list of genome names to retrieve #
        genomes_to_get = list(df['genome_file'])
        # Function to locate a member inside the archive by name #
        def find_entry(name, all_entries):
            for entry in all_entries:
                if entry.name == name:
                    return entry
            raise Exception("Entry '%s' not found." % name)
        # Open the tar file with all genomes for reading.
        # Fetch every one of the separate FASTA files (one per genome) from
        # within the archive and concatenate them all into one big FASTA file.
        with tarfile.open(self.autopaths.tar, "r:gz") as tar:
            # Message #
            msg = "Reading file list from '%s'..."
            print(msg % self.autopaths.tar)
            # Get all the member files #
            members = tar.getmembers()
            # Message #
            msg = "Extracting %i genomes..."
            print(msg % len(genomes_to_get))
            # Iterate #
            with open(self.autopaths.fasta, "wb") as out_file:
                for genome in tqdm(genomes_to_get):
                    info = find_entry('fna/' + genome, members)
                    handle = tar.extractfile(info)
                    shutil.copyfileobj(handle, out_file)
                    # NB: a stray `break` here previously aborted the loop
                    # after the very first genome, leaving a truncated
                    # FASTA file. All genomes are now concatenated.

    def make_bwa_database(self, verbose=True, print_time=True):
        """
        Using the 95% clustered FASTA file, create a BWA compatible database.
        On a typical single threaded Intel process this takes about:
        [main] Real time: 22335.125 sec; CPU: 22106.508 sec
        """
        # The big fasta with all the genomes #
        fasta = FASTA(self.autopaths.fasta)
        # Create a timer #
        if print_time:
            timer = Timer()
            timer.print_start()
        # Make a BWA index with the 'bwtsw' algorithm #
        fasta.index_bwa(self.bwa_index, verbose=verbose)
        # End message #
        if print_time:
            timer.print_end()
            timer.print_total_elapsed()

###############################################################################
# Create a singleton #
humgut = HumGut()
@@ -16,8 +16,6 @@ from autopaths.auto_paths import AutoPaths
16
16
  from autopaths.file_path import FilePath
17
17
  from autopaths.dir_path import DirectoryPath
18
18
 
19
- # Third party modules #
20
-
21
19
  # Constants #
22
20
  home = os.environ.get('HOME', '~') + '/'
23
21
 
@@ -15,8 +15,6 @@ from seqsearch.databases import Database
15
15
  from autopaths.auto_paths import AutoPaths
16
16
  from autopaths.file_path import FilePath
17
17
 
18
- # Third party modules #
19
-
20
18
  # Constants #
21
19
  home = os.environ.get('HOME', '~') + '/'
22
20
 
@@ -15,8 +15,6 @@ from seqsearch.databases import Database
15
15
  from autopaths.auto_paths import AutoPaths
16
16
  from autopaths.file_path import FilePath
17
17
 
18
- # Third party modules #
19
-
20
18
  # Constants #
21
19
  home = os.environ.get('HOME', '~') + '/'
22
20