seqsearch 2.2.1__tar.gz → 2.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {seqsearch-2.2.1 → seqsearch-2.2.4}/LICENSE.txt +1 -1
- {seqsearch-2.2.1 → seqsearch-2.2.4}/PKG-INFO +26 -9
- {seqsearch-2.2.1 → seqsearch-2.2.4}/README.md +2 -2
- seqsearch-2.2.4/pyproject.toml +47 -0
- seqsearch-2.2.4/seqsearch/__init__.py +11 -0
- seqsearch-2.2.4/seqsearch/databases/__init__.py +223 -0
- seqsearch-2.2.4/seqsearch/databases/human.py +51 -0
- seqsearch-2.2.4/seqsearch/databases/humgut.py +238 -0
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/mothur/greengenes.py +0 -2
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/mothur/rdp.py +0 -2
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/mothur/silva.py +0 -2
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/ncbi_16s.py +2 -2
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/nr.py +1 -8
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/nt.py +2 -4
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/pfam.py +7 -9
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/pr_two.py +2 -4
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/refseq.py +3 -5
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/silva.py +2 -4
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/string.py +2 -9
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/databases/tigrfam.py +2 -4
- {seqsearch-2.2.1/seqsearch/databases → seqsearch-2.2.4/seqsearch/examples}/download.py +0 -6
- seqsearch-2.2.4/seqsearch/mapping/bam.py +35 -0
- seqsearch-2.2.4/seqsearch/mapping/bam_and_sam.py +186 -0
- seqsearch-2.2.4/seqsearch/mapping/bwa.py +103 -0
- seqsearch-2.2.4/seqsearch/mapping/sam.py +27 -0
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/blast.py +12 -3
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/hmmer.py +1 -1
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/vsearch.py +1 -3
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch.egg-info/PKG-INFO +26 -9
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch.egg-info/SOURCES.txt +8 -2
- seqsearch-2.2.4/seqsearch.egg-info/requires.txt +12 -0
- seqsearch-2.2.1/seqsearch/__init__.py +0 -16
- seqsearch-2.2.1/seqsearch/databases/__init__.py +0 -139
- seqsearch-2.2.1/seqsearch.egg-info/requires.txt +0 -12
- seqsearch-2.2.1/setup.py +0 -38
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/examples/blast_object/simple_blast.py +0 -0
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/examples/seqsearch_object/simple_query.py +0 -0
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/__init__.py +0 -0
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/core.py +0 -0
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch/search/parallel.py +0 -0
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch.egg-info/dependency_links.txt +0 -0
- {seqsearch-2.2.1 → seqsearch-2.2.4}/seqsearch.egg-info/top_level.txt +0 -0
- {seqsearch-2.2.1 → seqsearch-2.2.4}/setup.cfg +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
Copyright (c)
|
|
1
|
+
Copyright (c) 2026 Lucas Sinclair
|
|
2
2
|
|
|
3
3
|
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
4
|
|
|
@@ -1,21 +1,38 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: seqsearch
|
|
3
|
-
Version: 2.2.
|
|
3
|
+
Version: 2.2.4
|
|
4
4
|
Summary: Sequence similarity searches (e.g. BLAST) made easy.
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
5
|
+
Author-email: Lucas Sinclair <lucas.sinclair@me.com>
|
|
6
|
+
License: Copyright (c) 2026 Lucas Sinclair
|
|
7
|
+
|
|
8
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
11
|
+
|
|
12
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
13
|
+
Project-URL: Homepage, https://github.com/xapple/seqsearch
|
|
14
|
+
Keywords: bioinformatics,dna,sequences,search
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
9
17
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
10
18
|
Requires-Python: >=3.8
|
|
11
19
|
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE.txt
|
|
21
|
+
Requires-Dist: autopaths>=1.6.2
|
|
22
|
+
Requires-Dist: plumbing>=2.11.3
|
|
23
|
+
Requires-Dist: fasta>=2.3.6
|
|
24
|
+
Requires-Dist: runps>=4.1.1
|
|
25
|
+
Requires-Dist: biopython
|
|
26
|
+
Requires-Dist: tqdm
|
|
12
27
|
Provides-Extra: ftp
|
|
28
|
+
Requires-Dist: ftputil; extra == "ftp"
|
|
13
29
|
Provides-Extra: downloads
|
|
14
|
-
|
|
30
|
+
Requires-Dist: wget; extra == "downloads"
|
|
31
|
+
Dynamic: license-file
|
|
15
32
|
|
|
16
33
|
[](https://badge.fury.io/py/seqsearch)
|
|
17
34
|
|
|
18
|
-
# `seqsearch` version 2.2.
|
|
35
|
+
# `seqsearch` version 2.2.4
|
|
19
36
|
|
|
20
37
|
`seqsearch` is a python package for dealing with sequence similarity searches (e.g. BLAST on DNA sequences) and automation.
|
|
21
38
|
|
|
@@ -53,7 +70,7 @@ Below are some examples to illustrate the various ways there are to use this pa
|
|
|
53
70
|
|
|
54
71
|
### Searches
|
|
55
72
|
|
|
56
|
-
You can parallelize BLAST searches by splitting the input into several files. It's easier to chop-up the input, because database chopping requires message passing across the nodes like mpiblast does (when and if it works).
|
|
73
|
+
You can parallelize BLAST searches by splitting the input into several files. It's easier to chop-up the input, because database chopping requires message passing across the nodes like `mpiblast` does (when and if it works).
|
|
57
74
|
|
|
58
75
|
Input chopping is fine as long as the database to search against fits in the RAM of the nodes. If the input is small and the database is large you can always switch them one for the other (in most cases).
|
|
59
76
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[](https://badge.fury.io/py/seqsearch)
|
|
2
2
|
|
|
3
|
-
# `seqsearch` version 2.2.
|
|
3
|
+
# `seqsearch` version 2.2.4
|
|
4
4
|
|
|
5
5
|
`seqsearch` is a python package for dealing with sequence similarity searches (e.g. BLAST on DNA sequences) and automation.
|
|
6
6
|
|
|
@@ -38,7 +38,7 @@ Below are some examples to illustrate the various ways there are to use this pa
|
|
|
38
38
|
|
|
39
39
|
### Searches
|
|
40
40
|
|
|
41
|
-
You can parallelize BLAST searches by splitting the input into several files. It's easier to chop-up the input, because database chopping requires message passing across the nodes like mpiblast does (when and if it works).
|
|
41
|
+
You can parallelize BLAST searches by splitting the input into several files. It's easier to chop-up the input, because database chopping requires message passing across the nodes like `mpiblast` does (when and if it works).
|
|
42
42
|
|
|
43
43
|
Input chopping is fine as long as the database to search against fits in the RAM of the nodes. If the input is small and the database is large you can always switch them one for the other (in most cases).
|
|
44
44
|
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "seqsearch"
|
|
7
|
+
version = "2.2.4"
|
|
8
|
+
|
|
9
|
+
description = "Sequence similarity searches (e.g. BLAST) made easy."
|
|
10
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
11
|
+
license = { file = "LICENSE.txt" }
|
|
12
|
+
requires-python = ">=3.8"
|
|
13
|
+
|
|
14
|
+
authors = [
|
|
15
|
+
{ name = "Lucas Sinclair", email = "lucas.sinclair@me.com" },
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
keywords = ["bioinformatics", "dna", "sequences", "search"]
|
|
19
|
+
|
|
20
|
+
classifiers = [
|
|
21
|
+
"License :: OSI Approved :: MIT License",
|
|
22
|
+
"Operating System :: OS Independent",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Bio-Informatics",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
dependencies = [
|
|
27
|
+
"autopaths>=1.6.2",
|
|
28
|
+
"plumbing>=2.11.3",
|
|
29
|
+
"fasta>=2.3.6",
|
|
30
|
+
"runps>=4.1.1",
|
|
31
|
+
"biopython",
|
|
32
|
+
"tqdm",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
ftp = ["ftputil"]
|
|
37
|
+
downloads = ["wget"]
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
Homepage = "https://github.com/xapple/seqsearch"
|
|
41
|
+
|
|
42
|
+
[tool.setuptools]
|
|
43
|
+
include-package-data = true
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
namespaces = true
|
|
47
|
+
exclude = ["example"]
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Written by Lucas Sinclair.
|
|
6
|
+
MIT Licensed.
|
|
7
|
+
Contact at www.sinclair.bio
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Built-in modules #
|
|
11
|
+
import os, fnmatch
|
|
12
|
+
from collections import Counter
|
|
13
|
+
import urllib.request
|
|
14
|
+
|
|
15
|
+
# First party modules #
|
|
16
|
+
from fasta import FASTA
|
|
17
|
+
from autopaths.auto_paths import AutoPaths
|
|
18
|
+
from autopaths.dir_path import DirectoryPath
|
|
19
|
+
from autopaths.file_path import FilePath
|
|
20
|
+
from plumbing.cache import property_cached
|
|
21
|
+
from plumbing.common import natural_sort
|
|
22
|
+
from plumbing.scraping import download_from_url
|
|
23
|
+
|
|
24
|
+
# Third party modules #
|
|
25
|
+
from tqdm import tqdm
|
|
26
|
+
|
|
27
|
+
# Constants #
|
|
28
|
+
# When $HOME is unset (or empty) fall back on the platform's own idea of the
# home directory instead of the literal string '~', which is never expanded
# by the path concatenations that follow #
home = (os.environ.get('HOME') or os.path.expanduser('~')) + '/'
|
|
29
|
+
# Default root directory for all databases; `Database.__init__` appends the
# `short_name` of each database to it when no `base_dir` is given #
base_directory = DirectoryPath(home + "databases/")
|
|
30
|
+
|
|
31
|
+
###############################################################################
|
|
32
|
+
class Database:
    """
    General database object to inherit from.

    Concrete databases subclass this and supply the class attributes that
    the methods below read: `short_name` (used to build the base directory),
    `files` (names of the files to fetch) and, for the optional features,
    `db_name` (BLAST databases) and `taxonomy` (taxonomic databases).
    """

    # Directory layout handed over to the AutoPaths object #
    # NOTE(review): the original indentation inside this string was lost in
    # the diff rendering; assumes AutoPaths ignores leading whitespace.
    all_paths = """
                /raw/
                /unzipped/
                /blast_db/
                """

    def __init__(self, seq_type='nucl', base_dir=None):
        """
        :param seq_type: Either 'prot' or 'nucl'.
        :param base_dir: Directory in which the database directory is
                         created. Defaults to the module `base_directory`.
        """
        # The sequence type is either 'prot' or 'nucl' #
        self.seq_type = seq_type
        # The default base directory #
        if base_dir is None: base_dir = base_directory
        # Make base_dir object #
        self.base_dir = base_dir + self.short_name + '/'
        # Make autopaths object #
        self.autopaths = AutoPaths(self.base_dir, self.all_paths)

    def __repr__(self):
        # Get the name of this class #
        name = self.__class__.__name__
        # Return a user-friendly string #
        return '<%s at "%s">' % (name, self.base_dir)

    def __bool__(self):
        """
        Return True if the database was already downloaded and the
        results are stored on the filesystem. Return False otherwise.
        """
        return not self.autopaths.unzipped_dir.empty

    @property_cached
    def files_to_retrieve(self):
        """The files we want to download with their destinations."""
        return {f: FilePath(self.autopaths.raw_dir + f)
                for f in self.files}

    #---------------------------- GZIP compression ---------------------------#
    def ungzip(self):
        """Decompress every gzipped raw file into the unzipped directory."""
        for f in tqdm(self.raw_files):
            # Only gzipped files can be decompressed, skip all the others #
            if not (f.endswith('.gz') or f.endswith('.gzip')): continue
            destination = self.autopaths.unzipped_dir + f.prefix
            f.ungzip_to(destination)
        # Make them only readable #
        for f in self.autopaths.unzipped_dir:
            f.permissions.only_readable()

    def untargz(self):
        """Extract every tar-gzipped raw file into the unzipped directory."""
        for f in tqdm(self.raw_files):
            # Only tar archives can be extracted, skip all the others #
            if not (f.endswith('.tar.gz') or f.endswith('.tgz')): continue
            f.untargz_to(self.autopaths.unzipped_dir)
        # Make them only readable #
        for f in self.autopaths.unzipped_dir:
            f.permissions.only_readable()

    @property
    def raw_files(self):
        """The files we have downloaded."""
        return self.autopaths.raw_dir.contents

    @property
    def unzipped_files(self):
        """The files we have uncompressed."""
        return self.autopaths.unzipped_dir.contents

    #------------------------ Only for FASTA databases -----------------------#
    @property
    def fasta_files(self):
        """The downloaded files, each wrapped in a FASTA object (lazily)."""
        return map(FASTA, self.raw_files)

    @property
    def sequences(self):
        """All the sequences from all the raw files."""
        # Iterate over the FASTA objects and not over the bare paths, so
        # that what is yielded are the records of each file #
        for fasta in self.fasta_files:
            for seq in fasta: yield seq

    #------------------ Only for preformatted BLAST databases ----------------#
    @property_cached
    def blast_db(self):
        """A BLASTable version of the sequences."""
        # Import here rather than at the top of the module #
        from seqsearch.search.blast import BLASTdb
        # Create object #
        db = BLASTdb(self.autopaths.unzipped_dir + self.db_name,
                     self.seq_type)
        # Return #
        return db

    #--------------------- Only for taxonomic databases ----------------------#
    @property_cached
    def tax_depth_freq(self):
        """
        Count, for every depth, how many lines of the `self.taxonomy` file
        have an assignment of that depth. Each line is expected to hold
        exactly two tab-separated fields, the second one being a
        semicolon-separated lineage.
        """
        def depths():
            with open(self.taxonomy, 'r') as handle:
                for line in handle:
                    line = line.strip('\n')
                    otu_name, species = line.split('\t')
                    yield len(species.split(';'))
        return Counter(depths())

    #-------------------- Only for BWA indexed databases ---------------------#
    @property
    def bwa_index(self):
        """The path of the first unzipped file, without its extension."""
        # Get the first file, `iter` makes this work whether the directory
        # contents come as a generator or as a list #
        first = next(iter(self.unzipped_files))
        # Get only the prefix without the extension #
        return first.prefix_path
|
|
144
|
+
|
|
145
|
+
###############################################################################
|
|
146
|
+
class DatabaseHTTP(Database):
    """
    A database that is stored on an HTTP server.

    Subclasses supply `base_url` (to which each file name is appended to
    form the download address) and `files` (the list of file names).
    """

    @property_cached
    def files_to_retrieve(self):
        """The files we want to download with their destinations."""
        return {f: FilePath(self.autopaths.raw_dir + f)
                for f in self.files}

    @property
    def files_remaining(self):
        """
        The files we haven't downloaded yet based on size checks.

        A file is considered remaining when its local copy is empty or when
        its size differs from the `Content-Length` announced by the server.
        If the server announces no size at all, the file is considered
        remaining as well.
        """
        # Function to get the size of a file #
        def get_size_http(url):
            # Release the connection as soon as the header has been read #
            with urllib.request.urlopen(url) as response:
                length = response.getheader("Content-Length")
            # Not every server sends this header #
            if length is None: return None
            return int(length)
        # Check each file #
        return {source: dest
                for source, dest in self.files_to_retrieve.items()
                if dest.count_bytes == 0 or
                   dest.count_bytes != get_size_http(self.base_url + source)}

    def download(self):
        """Retrieve all files from the website."""
        for source, dest in self.files_remaining.items():
            # Get the full URL #
            url = self.base_url + source
            # Similar to wget #
            download_from_url(url, dest,
                              stream   = True,
                              progress = True,
                              desc     = source,
                              cleanup  = True,
                              )
            # Make it readable only #
            dest.permissions.only_readable()
|
|
182
|
+
|
|
183
|
+
###############################################################################
|
|
184
|
+
class DatabaseFTP(Database):
    """
    A database whose files live on an FTP server.

    Subclasses supply `ftp_url` and `ftp_dir`, together with either a
    `pattern` (shell-style wildcard applied to the remote listing) or an
    explicit list of `files`.
    """

    @property_cached
    def ftp(self):
        """An anonymous ftputil connection, already moved to `ftp_dir`."""
        from ftputil import FTPHost
        connection = FTPHost(self.ftp_url, "anonymous")
        connection.chdir(self.ftp_dir)
        return connection

    @property_cached
    def files_to_retrieve(self):
        """
        Map every remote file name we want to its local destination.
        When both `pattern` and `files` are defined the pattern wins.
        When neither is defined the result is None.
        """
        if hasattr(self, "pattern"):
            # Use the remote directory listing, filtered by the pattern #
            listing = self.ftp.listdir(self.ftp.curdir)
            listing.sort(key=natural_sort)
            names = [name for name in listing
                     if fnmatch.fnmatch(name, self.pattern)]
        elif hasattr(self, "files"):
            # Use the explicit list of names #
            names = self.files
        else:
            return None
        # Every file lands in the raw directory under the same name #
        return {name: FilePath(self.autopaths.raw_dir + name)
                for name in names}

    @property
    def files_remaining(self):
        """The wanted files whose local size differs from the remote size."""
        remaining = {}
        for remote, local in self.files_to_retrieve.items():
            if local.count_bytes != self.ftp.path.getsize(remote):
                remaining[remote] = local
        return remaining

    def download(self):
        """Fetch from the FTP site every file that is still remaining."""
        # The target directory has to exist first #
        self.base_dir.create_if_not_exists()
        # Replace any stale local copy, then write-protect the new one #
        for remote, local in tqdm(self.files_remaining.items()):
            local.remove()
            self.ftp.download(remote, local)
            local.permissions.only_readable()
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Written by Lucas Sinclair.
|
|
6
|
+
MIT Licensed.
|
|
7
|
+
Contact at www.sinclair.bio
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Built-in modules #
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
# First party modules #
|
|
14
|
+
from seqsearch.databases import DatabaseFTP
|
|
15
|
+
|
|
16
|
+
# Third party modules #
|
|
17
|
+
|
|
18
|
+
# Constants #
|
|
19
|
+
# When $HOME is unset (or empty) fall back on the platform's own idea of the
# home directory instead of the literal string '~', which is never expanded #
home = (os.environ.get('HOME') or os.path.expanduser('~')) + '/'
|
|
20
|
+
|
|
21
|
+
###############################################################################
|
|
22
|
+
class HumanGenome(DatabaseFTP):
    """
    The latest version of the human genome is provided by the NCBI,
    along with preformatted files, at this address:

    https://www.ncbi.nlm.nih.gov/genome/guide/human/

    To install:

        >>> from seqsearch.databases.human import hg38
        >>> hg38.download()
        >>> hg38.untargz()
        >>> hg38.autopaths.raw_dir.remove()

    With the default base directory, the resulting files are placed
    in "~/databases/grc_h38/" (the `short_name` below).
    """

    # Identification #
    tag        = "hg38"
    short_name = "grc_h38"
    long_name  = 'Human Genome v38 at NCBI'

    # Location on the FTP server #
    ftp_url    = "ftp.ncbi.nlm.nih.gov"
    ftp_dir    = ("genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/"
                  "seqs_for_alignment_pipelines.ucsc_ids/")

    # The single archive to fetch (a BWA index, going by its name) #
    files = ['GCA_000001405.15_GRCh38_full_analysis_set.fna.bwa_index.tar.gz']
|
|
48
|
+
|
|
49
|
+
###############################################################################
|
|
50
|
+
# Create a singleton #
|
|
51
|
+
# Shared instance built with the default arguments when the module is imported #
hg38 = HumanGenome()
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Written by Lucas Sinclair.
|
|
6
|
+
MIT Licensed.
|
|
7
|
+
Contact at www.sinclair.bio
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Built-in modules #
|
|
11
|
+
import os, re, tarfile, shutil, functools
|
|
12
|
+
|
|
13
|
+
# First party modules #
|
|
14
|
+
from seqsearch.databases import DatabaseHTTP
|
|
15
|
+
from fasta import FASTA
|
|
16
|
+
from plumbing.timer import Timer
|
|
17
|
+
|
|
18
|
+
# Third party modules #
|
|
19
|
+
import pandas
|
|
20
|
+
from tqdm import tqdm
|
|
21
|
+
|
|
22
|
+
# Constants #
|
|
23
|
+
# When $HOME is unset (or empty) fall back on the platform's own idea of the
# home directory instead of the literal string '~', which is never expanded #
home = (os.environ.get('HOME') or os.path.expanduser('~')) + '/'
|
|
24
|
+
|
|
25
|
+
###############################################################################
|
|
26
|
+
class HumGutGenome:
    """One genome of the HumGut database along with an associated count."""

    def __init__(self, tax_id, count):
        """
        :param tax_id: The HumGut ID of the genome, used as the look-up key
                       in the metadata table of the `humgut` singleton.
        :param count:  The count to associate with this genome.
        """
        self.count  = count
        self.tax_id = tax_id

    def __repr__(self):
        values = (self.__class__.__name__, self.tax_id)
        return '<%s object ID %s>' % values

    @functools.cached_property
    def metadata(self):
        """
        The metadata record of this genome, as returned by
        `humgut.id_to_metadata`. An example of the fields and values is:

            {'HumGut_name': 'HumGut_20705',
             'cluster975': 20705,
             'cluster95': 3214,
             'gtdbtk_tax_id': 4030631,
             'gtdbtk_organism_name': 's__Enterococcus_D casseliflavus',
             'gtdbtk_taxonomy': 'd__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Enterococcaceae;g__Enterococcus_D;s__Enterococcus_D casseliflavus',
             'ncbi_tax_id': 1218087,
             'ncbi_organism_name': 'Enterococcus casseliflavus NBRC 100478',
             'ncbi_rank': 'strain',
             'prevalence_score': 0.7849960826259196,
             'metagenomes_present': 22,
             'completeness': 99.24528301886792,
             'contamination': 0.389531345100426,
             'GC': 0.4235326316891364,
             'genome_size': 3668336,
             'source': 'RefSeq',
             'genome_type': 'Complete Genome',
             'cluster975_size': 68,
             'cluster95_size': 125,
             'genome_file': 'GCF_003641225.1_ASM364122v1_genomic.fna.gz',
             'ftp_download': 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/641/225/GCF_003641225.1_ASM364122v1/GCF_003641225.1_ASM364122v1_genomic.fna.gz'}
        """
        record = humgut.id_to_metadata(self.tax_id)
        return record

    @functools.cached_property
    def tax(self):
        """
        The 'gtdbtk_taxonomy' string split into a list of names, padded
        with 'Unclassified' up to the number of ranks in
        `humgut.rank_names`. For the example taxonomy string
        'd__Bacteria;p__Firmicutes;...;g__Enterococcus_D;s__...' the names
        extracted before padding are:

            ['Bacteria',
             'Firmicutes',
             'Bacilli',
             'Lactobacillales',
             'Enterococcaceae',
             'Enterococcus_D']

        NOTE(review): the pattern only captures names that are followed by
        a ';', so the last entry of the string is never captured and its
        slot is filled by the padding. Confirm this is intended.
        """
        # Every name sits between a '__' and the next ';' #
        taxonomy = self.metadata['gtdbtk_taxonomy']
        names    = re.findall('__(.+?);', taxonomy)
        # Fill up the ranks that were not extracted #
        missing = len(humgut.rank_names) - len(names)
        names.extend(['Unclassified'] * missing)
        return names
|
|
89
|
+
|
|
90
|
+
###############################################################################
|
|
91
|
+
class HumGut(DatabaseHTTP):
    """
    Quote from the paper:

    "We screened > 5,700 healthy human gut metagenomes for the containment of
    > 490,000 publicly available prokaryotic genomes sourced from RefSeq and
    the recently announced UHGG collection. This resulted in a pool of
    > 381,000 genomes that were subsequently scored and ranked based on their
    prevalence in the healthy human metagenomes. The genomes were then
    clustered at a 97.5% sequence identity resolution, and cluster
    representatives (30,691 in total) were retained to comprise the HumGut
    collection."

    The publication is here:

    * https://doi.org/10.1186/s40168-021-01114-w

    The download website is here:

    * http://arken.nmbu.no/~larssn/humgut/

    To install:

        >>> from seqsearch.databases.humgut import humgut
        >>> humgut.download()
        >>> humgut.untargz()
        >>> humgut.get_95_cluster()
        >>> humgut.autopaths.tar.remove()
        >>> humgut.make_bwa_database()

    With the default base directory, the resulting files are placed
    in "~/databases/humgut/" (the `short_name` below).
    """

    tag        = "humgut"
    short_name = "humgut"
    long_name  = 'HumGut: a comprehensive human gut prokaryotic genomes' \
                 ' collection'

    base_url   = "http://arken.nmbu.no/~larssn/humgut/"

    files = ['HumGut.tar.gz',
             'HumGut.tsv',
             'ncbi_names.dmp',
             'ncbi_nodes.dmp']

    # NOTE(review): the original indentation inside this string was lost in
    # the diff rendering; assumes AutoPaths ignores leading whitespace.
    all_paths = """
                /raw/HumGut.tar.gz
                /raw/HumGut.tsv
                /raw/ncbi_names.dmp
                /raw/ncbi_nodes.dmp
                /cluster95/humgut95.fasta.gz
                /bwa_db/
                """

    #------------------------------ Properties -------------------------------#
    @functools.cached_property
    def metadata(self):
        """The TSV metadata as a pandas DataFrame indexed by HumGut_tax_id."""
        # Load the file in memory #
        df = pandas.read_csv(self.autopaths.HumGut_tsv, sep='\t')
        # Change index #
        df = df.set_index('HumGut_tax_id')
        # Return #
        return df

    @property
    def bwa_index(self):
        """The path prefix of the BWA index inside the bwa_db directory."""
        return self.autopaths.bwa_db_dir + 'humgut95'

    #--------------------------- Extra information ----------------------------#
    @property
    def rank_names(self):
        """The names of the seven taxonomic ranks, from broad to narrow."""
        return ['Domain',   # 1 (This is Bacteria, Archaea or Eucarya)
                'Phylum',   # 2 (This is for instance 'Firmicutes')
                'Class',    # 3
                'Order',    # 4
                'Family',   # 5
                'Genus',    # 6
                'Species']  # 7

    #------------------------------- Methods ---------------------------------#
    def id_to_metadata(self, tax_id):
        """
        Like a dictionary for quick look up of a genome based on its
        taxonomy ID.
        """
        return self.metadata.loc[tax_id]

    def get_95_cluster(self):
        """
        Will write a large compressed FASTA file with all genomes of
        interest concatenated together. To do this, we will pick single files
        out of the large TAR archive provided and append them one by one.
        See the documentation on GitHub here:
        * https://github.com/larssnip/HumGut#the-humgut-library

        Raises an Exception if one of the genomes is absent from the archive.
        """
        # Keep the first representative of every cluster #
        df = self.metadata.drop_duplicates(subset = 'cluster95')
        # Get the list of genome names to retrieve #
        genomes_to_get = list(df['genome_file'])
        # Open the tar file with all genomes for reading.
        # Fetch every one of the separate FASTA files (one per genome) from
        # within the archive and concatenate them all into one big FASTA file.
        with tarfile.open(self.autopaths.tar, "r:gz") as tar:
            # Message #
            msg = "Reading file list from '%s'..."
            print(msg % self.autopaths.tar)
            # Index all the member files by name only once, so that every
            # genome is found without scanning the whole list again. In case
            # of duplicated names the first member is the one that is kept #
            members = {}
            for member in tar.getmembers():
                members.setdefault(member.name, member)
            # Message #
            msg = "Extracting %i genomes..."
            print(msg % len(genomes_to_get))
            # Iterate over every genome (and not only over the first one) #
            with open(self.autopaths.fasta, "wb") as out_file:
                for genome in tqdm(genomes_to_get):
                    name = 'fna/' + genome
                    if name not in members:
                        raise Exception("Entry '%s' not found." % name)
                    handle = tar.extractfile(members[name])
                    shutil.copyfileobj(handle, out_file)

    def make_bwa_database(self, verbose=True, print_time=True):
        """
        Using the 95% clustered FASTA file, create a BWA compatible database.
        On a typical single threaded Intel process this takes about:
        [main] Real time: 22335.125 sec; CPU: 22106.508 sec

        :param verbose:    Passed along to the `index_bwa` call.
        :param print_time: Print the start, the end and the elapsed time.
        """
        # The big fasta with all the genomes #
        fasta = FASTA(self.autopaths.fasta)
        # Create a timer #
        if print_time:
            timer = Timer()
            timer.print_start()
        # Make a BWA index with the 'bwtsw' algorithm #
        fasta.index_bwa(self.bwa_index, verbose=verbose)
        # End message #
        if print_time:
            timer.print_end()
            timer.print_total_elapsed()
|
|
235
|
+
|
|
236
|
+
###############################################################################
|
|
237
|
+
# Create a singleton #
|
|
238
|
+
# Shared instance built with the default arguments when the module is
# imported; `HumGutGenome` looks up its metadata through this name #
humgut = HumGut()
|