oxymetag 1.0.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oxymetag/__init__.py +1 -1
- oxymetag/cli.py +15 -13
- oxymetag/core.py +114 -15
- oxymetag/data/VTML20.out +33 -0
- oxymetag/data/nucleotide.out +9 -0
- oxymetag/data/oxygen_model.rds +0 -0
- oxymetag/data/oxymetag_pfams_n117.dmnd +0 -0
- oxymetag/data/oxymetag_pfams_n117_db +0 -0
- oxymetag/data/oxymetag_pfams_n117_db.dbtype +0 -0
- oxymetag/data/oxymetag_pfams_n117_db.index +23972 -0
- oxymetag/data/oxymetag_pfams_n117_db.lookup +23972 -0
- oxymetag/data/oxymetag_pfams_n117_db.source +1 -0
- oxymetag/data/oxymetag_pfams_n117_db_h +0 -0
- oxymetag/data/oxymetag_pfams_n117_db_h.dbtype +0 -0
- oxymetag/data/oxymetag_pfams_n117_db_h.index +23972 -0
- oxymetag/scripts/predict_oxygen.R +86 -38
- oxymetag/utils.py +32 -14
- {oxymetag-1.0.0.dist-info → oxymetag-1.1.1.dist-info}/METADATA +116 -52
- oxymetag-1.1.1.dist-info/RECORD +29 -0
- oxymetag-1.0.0.dist-info/RECORD +0 -18
- {oxymetag-1.0.0.dist-info → oxymetag-1.1.1.dist-info}/LICENSE +0 -0
- {oxymetag-1.0.0.dist-info → oxymetag-1.1.1.dist-info}/WHEEL +0 -0
- {oxymetag-1.0.0.dist-info → oxymetag-1.1.1.dist-info}/entry_points.txt +0 -0
- {oxymetag-1.0.0.dist-info → oxymetag-1.1.1.dist-info}/top_level.txt +0 -0
oxymetag/__init__.py
CHANGED
oxymetag/cli.py
CHANGED
|
@@ -11,7 +11,6 @@ from . import __version__
|
|
|
11
11
|
from .core import extract_reads, profile_samples, predict_aerobes
|
|
12
12
|
from .utils import check_dependencies, run_kraken2_setup, OxyMetaGError
|
|
13
13
|
|
|
14
|
-
# Set up logging
|
|
15
14
|
logging.basicConfig(
|
|
16
15
|
level=logging.INFO,
|
|
17
16
|
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
@@ -29,10 +28,8 @@ def main():
|
|
|
29
28
|
|
|
30
29
|
subparsers = parser.add_subparsers(dest='command', help='Available commands')
|
|
31
30
|
|
|
32
|
-
# Setup command
|
|
33
31
|
setup_parser = subparsers.add_parser('setup', help='Setup Kraken2 database')
|
|
34
32
|
|
|
35
|
-
# Extract command
|
|
36
33
|
extract_parser = subparsers.add_parser('extract', help='Extract bacterial reads')
|
|
37
34
|
extract_parser.add_argument('-i', '--input', nargs='+', required=True,
|
|
38
35
|
help='Input fastq.gz files')
|
|
@@ -43,27 +40,31 @@ def main():
|
|
|
43
40
|
extract_parser.add_argument('--kraken-db', default='kraken2_db',
|
|
44
41
|
help='Kraken2 database path (default: kraken2_db)')
|
|
45
42
|
|
|
46
|
-
|
|
47
|
-
profile_parser = subparsers.add_parser('profile', help='Profile samples with DIAMOND')
|
|
43
|
+
profile_parser = subparsers.add_parser('profile', help='Profile samples with DIAMOND or MMseqs2')
|
|
48
44
|
profile_parser.add_argument('-i', '--input', default='BactReads',
|
|
49
45
|
help='Input directory (default: BactReads)')
|
|
50
|
-
profile_parser.add_argument('-o', '--output', default=
|
|
51
|
-
help='Output directory (default: diamond_output)')
|
|
46
|
+
profile_parser.add_argument('-o', '--output', default=None,
|
|
47
|
+
help='Output directory (default: diamond_output or mmseqs_output)')
|
|
52
48
|
profile_parser.add_argument('-t', '--threads', type=int, default=4,
|
|
53
49
|
help='Number of threads (default: 4)')
|
|
50
|
+
profile_parser.add_argument('-m', '--method', choices=['diamond', 'mmseqs2'],
|
|
51
|
+
default='diamond',
|
|
52
|
+
help='Profiling method (default: diamond)')
|
|
54
53
|
profile_parser.add_argument('--diamond-db',
|
|
55
54
|
help='DIAMOND database path (default: package database)')
|
|
55
|
+
profile_parser.add_argument('--mmseqs-db',
|
|
56
|
+
help='MMseqs2 database path (default: package database)')
|
|
56
57
|
|
|
57
|
-
# Predict command
|
|
58
58
|
predict_parser = subparsers.add_parser('predict', help='Predict aerobe levels')
|
|
59
|
-
predict_parser.add_argument('-i', '--input', default=
|
|
60
|
-
help='Input directory (default: diamond_output)')
|
|
59
|
+
predict_parser.add_argument('-i', '--input', default=None,
|
|
60
|
+
help='Input directory (default: diamond_output for modern, mmseqs_output for ancient)')
|
|
61
61
|
predict_parser.add_argument('-o', '--output', default='per_aerobe_predictions.tsv',
|
|
62
62
|
help='Output file (default: per_aerobe_predictions.tsv)')
|
|
63
63
|
predict_parser.add_argument('-t', '--threads', type=int, default=4,
|
|
64
64
|
help='Number of threads (default: 4)')
|
|
65
65
|
predict_parser.add_argument('-m', '--mode', choices=['modern', 'ancient', 'custom'],
|
|
66
|
-
default='modern',
|
|
66
|
+
default='modern',
|
|
67
|
+
help='Filtering mode: modern=DIAMOND, ancient=MMseqs2 (default: modern)')
|
|
67
68
|
predict_parser.add_argument('--idcut', type=float,
|
|
68
69
|
help='Custom identity cutoff (for custom mode)')
|
|
69
70
|
predict_parser.add_argument('--bitcut', type=float,
|
|
@@ -87,11 +88,12 @@ def main():
|
|
|
87
88
|
extract_reads(args.input, args.output, args.threads, args.kraken_db)
|
|
88
89
|
|
|
89
90
|
elif args.command == 'profile':
|
|
90
|
-
profile_samples(args.input, args.output, args.threads, args.
|
|
91
|
+
profile_samples(args.input, args.output, args.threads, args.method,
|
|
92
|
+
args.diamond_db, args.mmseqs_db)
|
|
91
93
|
|
|
92
94
|
elif args.command == 'predict':
|
|
93
95
|
predict_aerobes(args.input, args.output, args.mode,
|
|
94
|
-
|
|
96
|
+
args.idcut, args.bitcut, args.ecut, args.threads)
|
|
95
97
|
|
|
96
98
|
logger.info("Command completed successfully")
|
|
97
99
|
|
oxymetag/core.py
CHANGED
|
@@ -108,20 +108,18 @@ def extract_reads(input_files: List[str], output_dir: str = "BactReads",
|
|
|
108
108
|
continue
|
|
109
109
|
|
|
110
110
|
|
|
111
|
-
def profile_samples(input_dir: str = "BactReads", output_dir: str =
|
|
112
|
-
threads: int = 4,
|
|
111
|
+
def profile_samples(input_dir: str = "BactReads", output_dir: str = None,
|
|
112
|
+
threads: int = 4, method: str = "diamond",
|
|
113
|
+
diamond_db: str = None, mmseqs_db: str = None):
|
|
113
114
|
"""
|
|
114
|
-
Profile samples using DIAMOND
|
|
115
|
+
Profile samples using DIAMOND or MMseqs2 against Pfam database
|
|
115
116
|
"""
|
|
116
117
|
from .utils import get_package_data_path
|
|
117
118
|
|
|
118
|
-
logger.info(f"Starting sample profiling with {threads} threads")
|
|
119
|
+
logger.info(f"Starting sample profiling with {method} using {threads} threads")
|
|
119
120
|
|
|
120
|
-
if
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
if not Path(diamond_db).exists():
|
|
124
|
-
raise OxyMetaGError(f"DIAMOND database not found: {diamond_db}")
|
|
121
|
+
if output_dir is None:
|
|
122
|
+
output_dir = 'diamond_output' if method == 'diamond' else 'mmseqs_output'
|
|
125
123
|
|
|
126
124
|
output_path = Path(output_dir)
|
|
127
125
|
output_path.mkdir(exist_ok=True)
|
|
@@ -132,6 +130,8 @@ def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_out
|
|
|
132
130
|
patterns = [
|
|
133
131
|
'*_R1_bacterial.fastq.gz',
|
|
134
132
|
'*_1_bacterial.fastq.gz',
|
|
133
|
+
'*_bacterial_R1.fastq.gz',
|
|
134
|
+
'*_bacterial_1.fastq.gz',
|
|
135
135
|
'*_bacterial.fastq.gz'
|
|
136
136
|
]
|
|
137
137
|
|
|
@@ -147,9 +147,32 @@ def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_out
|
|
|
147
147
|
logger.error(f"FASTQ files in {input_dir}: {[f.name for f in all_files[:5]]}")
|
|
148
148
|
raise OxyMetaGError(f"No bacterial read files found in {input_dir}")
|
|
149
149
|
|
|
150
|
+
if method == 'diamond':
|
|
151
|
+
_profile_with_diamond(input_files, output_path, threads, diamond_db)
|
|
152
|
+
elif method == 'mmseqs2':
|
|
153
|
+
_profile_with_mmseqs(input_files, output_path, threads, mmseqs_db)
|
|
154
|
+
else:
|
|
155
|
+
raise OxyMetaGError(f"Unknown method: {method}")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _profile_with_diamond(input_files: List[Path], output_path: Path,
|
|
159
|
+
threads: int, diamond_db: str = None):
|
|
160
|
+
"""Profile samples using DIAMOND blastx"""
|
|
161
|
+
from .utils import get_package_data_path
|
|
162
|
+
|
|
163
|
+
if diamond_db is None:
|
|
164
|
+
diamond_db = get_package_data_path("oxymetag_pfams.dmnd")
|
|
165
|
+
|
|
166
|
+
if not Path(diamond_db).exists():
|
|
167
|
+
raise OxyMetaGError(f"DIAMOND database not found: {diamond_db}")
|
|
168
|
+
|
|
150
169
|
for input_file in input_files:
|
|
151
170
|
base_name = input_file.stem.replace('.fastq', '').replace('.gz', '')
|
|
152
|
-
base_name = base_name.replace('_R1_bacterial', '')
|
|
171
|
+
base_name = (base_name.replace('_R1_bacterial', '')
|
|
172
|
+
.replace('_1_bacterial', '')
|
|
173
|
+
.replace('_bacterial_R1', '')
|
|
174
|
+
.replace('_bacterial_1', '')
|
|
175
|
+
.replace('_bacterial', ''))
|
|
153
176
|
|
|
154
177
|
logger.info(f"Processing {input_file}")
|
|
155
178
|
|
|
@@ -172,20 +195,95 @@ def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_out
|
|
|
172
195
|
continue
|
|
173
196
|
|
|
174
197
|
|
|
175
|
-
def
|
|
198
|
+
def _profile_with_mmseqs(input_files: List[Path], output_path: Path,
|
|
199
|
+
threads: int, mmseqs_db: str = None):
|
|
200
|
+
"""Profile samples using MMseqs2 easy-search"""
|
|
201
|
+
from .utils import get_package_data_path
|
|
202
|
+
|
|
203
|
+
if mmseqs_db is None:
|
|
204
|
+
mmseqs_db = get_package_data_path("oxymetag_pfams_n117_db")
|
|
205
|
+
|
|
206
|
+
if not Path(mmseqs_db).exists():
|
|
207
|
+
raise OxyMetaGError(f"MMseqs2 database not found: {mmseqs_db}")
|
|
208
|
+
|
|
209
|
+
data_dir = Path(get_package_data_path(""))
|
|
210
|
+
vtml_matrix = data_dir / "VTML20.out"
|
|
211
|
+
nucl_matrix = data_dir / "nucleotide.out"
|
|
212
|
+
|
|
213
|
+
if not vtml_matrix.exists():
|
|
214
|
+
raise OxyMetaGError(f"VTML20.out matrix not found: {vtml_matrix}")
|
|
215
|
+
if not nucl_matrix.exists():
|
|
216
|
+
raise OxyMetaGError(f"nucleotide.out matrix not found: {nucl_matrix}")
|
|
217
|
+
|
|
218
|
+
for input_file in input_files:
|
|
219
|
+
base_name = input_file.stem.replace('.fastq', '').replace('.gz', '')
|
|
220
|
+
base_name = (base_name.replace('_R1_bacterial', '')
|
|
221
|
+
.replace('_1_bacterial', '')
|
|
222
|
+
.replace('_bacterial_R1', '')
|
|
223
|
+
.replace('_bacterial_1', '')
|
|
224
|
+
.replace('_bacterial', ''))
|
|
225
|
+
|
|
226
|
+
logger.info(f"Processing {input_file} with MMseqs2")
|
|
227
|
+
|
|
228
|
+
output_file = output_path / f"{base_name}_mmseqs.tsv"
|
|
229
|
+
tmp_dir = output_path / f"{base_name}_tmp"
|
|
230
|
+
tmp_dir.mkdir(exist_ok=True)
|
|
231
|
+
|
|
232
|
+
cmd = [
|
|
233
|
+
'mmseqs', 'easy-search',
|
|
234
|
+
str(input_file),
|
|
235
|
+
str(mmseqs_db),
|
|
236
|
+
str(output_file),
|
|
237
|
+
str(tmp_dir),
|
|
238
|
+
'--min-length', '12',
|
|
239
|
+
'-e', '10.0',
|
|
240
|
+
'--min-seq-id', '0.86',
|
|
241
|
+
'-c', '0.65',
|
|
242
|
+
'--cov-mode', '2',
|
|
243
|
+
'--format-mode', '0',
|
|
244
|
+
'--format-output', 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen,cigar,qaln,taln',
|
|
245
|
+
'--comp-bias-corr', '0',
|
|
246
|
+
'--mask', '0',
|
|
247
|
+
'--exact-kmer-matching', '1',
|
|
248
|
+
'--sub-mat', f'aa:{vtml_matrix},nucl:{nucl_matrix}',
|
|
249
|
+
'--seed-sub-mat', f'aa:{vtml_matrix},nucl:{nucl_matrix}',
|
|
250
|
+
'-s', '2',
|
|
251
|
+
'-k', '6',
|
|
252
|
+
'--spaced-kmer-pattern', '11011101',
|
|
253
|
+
'--max-seqs', '10000',
|
|
254
|
+
'--max-rejected', '10',
|
|
255
|
+
'--threads', str(threads),
|
|
256
|
+
'--remove-tmp-files', '0',
|
|
257
|
+
'--use-all-table-starts', '1'
|
|
258
|
+
]
|
|
259
|
+
|
|
260
|
+
try:
|
|
261
|
+
subprocess.run(cmd, check=True)
|
|
262
|
+
logger.info(f"MMseqs2 profiling completed for {input_file}")
|
|
263
|
+
|
|
264
|
+
except subprocess.CalledProcessError as e:
|
|
265
|
+
logger.error(f"Failed to process {input_file}: {e}")
|
|
266
|
+
continue
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def predict_aerobes(input_dir: str = None, output_file: str = "per_aerobe_predictions.tsv",
|
|
176
270
|
mode: str = "modern", id_cut: float = None, bit_cut: float = None,
|
|
177
271
|
e_cut: float = None, threads: int = 4):
|
|
178
272
|
"""
|
|
179
|
-
Predict aerobe levels from DIAMOND results
|
|
273
|
+
Predict aerobe levels from DIAMOND or MMseqs2 results
|
|
274
|
+
Mode determines method: modern=DIAMOND, ancient=MMseqs2
|
|
180
275
|
"""
|
|
181
276
|
from .utils import get_package_data_path
|
|
182
277
|
|
|
183
278
|
logger.info(f"Starting aerobe level prediction in {mode} mode")
|
|
184
279
|
|
|
280
|
+
if input_dir is None:
|
|
281
|
+
input_dir = 'diamond_output' if mode == 'modern' else 'mmseqs_output'
|
|
282
|
+
|
|
185
283
|
if mode == "modern":
|
|
186
284
|
identity_cutoff, bitscore_cutoff, evalue_cutoff = 60.0, 50.0, 0.001
|
|
187
285
|
elif mode == "ancient":
|
|
188
|
-
identity_cutoff, bitscore_cutoff, evalue_cutoff =
|
|
286
|
+
identity_cutoff, bitscore_cutoff, evalue_cutoff = 86.0, 50.0, 0.001
|
|
189
287
|
elif mode == "custom":
|
|
190
288
|
if any(x is None for x in [id_cut, bit_cut, e_cut]):
|
|
191
289
|
raise OxyMetaGError("Custom mode requires id_cut, bit_cut, and e_cut parameters")
|
|
@@ -194,7 +292,8 @@ def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_a
|
|
|
194
292
|
raise OxyMetaGError("Mode must be 'modern', 'ancient', or 'custom'")
|
|
195
293
|
|
|
196
294
|
package_data_dir = str(Path(get_package_data_path("")).parent / "data")
|
|
197
|
-
|
|
295
|
+
package_base = Path(__file__).parent
|
|
296
|
+
r_script_path = str(package_base / "scripts" / "predict_oxygen.R")
|
|
198
297
|
|
|
199
298
|
if not Path(input_dir).exists():
|
|
200
299
|
raise OxyMetaGError(f"Input directory not found: {input_dir}")
|
|
@@ -223,4 +322,4 @@ def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_a
|
|
|
223
322
|
logger.info(f"Results saved to {output_file}")
|
|
224
323
|
return results_df
|
|
225
324
|
else:
|
|
226
|
-
raise OxyMetaGError(f"Output file was not created: {output_file}")
|
|
325
|
+
raise OxyMetaGError(f"Output file was not created: {output_file}")
|
oxymetag/data/VTML20.out
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# VTML_20
|
|
2
|
+
#
|
|
3
|
+
# This matrix was produced from: vtml_20qij.mat using vtml_P.mat background frequencies
|
|
4
|
+
#
|
|
5
|
+
# VTML_20 substitution matrix, Units = bits/2.0
|
|
6
|
+
# Expected score = -2.916179 bits; Entropy = 2.912514 bits
|
|
7
|
+
# Target fraction identity = 0.8307
|
|
8
|
+
# Lowest Score = -16, Highest Score= 12
|
|
9
|
+
#
|
|
10
|
+
# Background (precomputed optional): 0.0721 0.0135 0.0522 0.0728 0.038 0.0804 0.0256 0.0699 0.0703 0.107 0.0232 0.0503 0.0393 0.0339 0.0523 0.0698 0.05 0.0683 0.0143 0.0384 0.00001
|
|
11
|
+
# Lambda (precomputed optional): 0.34657
|
|
12
|
+
A C D E F G H I K L M N P Q R S T V W Y X
|
|
13
|
+
A 7 -3 -6 -5 -8 -4 -7 -7 -6 -7 -5 -6 -4 -5 -7 -2 -3 -3 -9 -8 0
|
|
14
|
+
C -3 12 -14 -14 -13 -7 -6 -5 -13 -12 -4 -8 -9 -13 -7 -3 -5 -3 -15 -4 0
|
|
15
|
+
D -6 -14 8 -1 -16 -6 -4 -12 -5 -15 -9 -1 -6 -4 -12 -5 -6 -9 -10 -14 0
|
|
16
|
+
E -5 -14 -1 7 -14 -6 -6 -10 -2 -8 -8 -5 -6 -1 -10 -5 -6 -7 -16 -7 0
|
|
17
|
+
F -8 -13 -16 -14 9 -11 -5 -5 -14 -3 -3 -10 -9 -8 -10 -7 -8 -6 -3 0 0
|
|
18
|
+
G -4 -7 -6 -6 -11 7 -7 -15 -7 -11 -10 -5 -8 -8 -7 -4 -8 -10 -9 -10 0
|
|
19
|
+
H -7 -6 -4 -6 -5 -7 10 -9 -5 -7 -12 -3 -6 -2 -3 -5 -5 -8 -6 -1 0
|
|
20
|
+
I -7 -5 -12 -10 -5 -15 -9 7 -9 -2 -2 -9 -10 -9 -8 -9 -5 1 -6 -8 0
|
|
21
|
+
K -6 -13 -5 -2 -14 -7 -5 -9 7 -8 -5 -3 -6 -2 0 -5 -4 -8 -9 -8 0
|
|
22
|
+
L -7 -12 -15 -8 -3 -11 -7 -2 -8 6 0 -9 -7 -6 -8 -8 -7 -3 -6 -6 0
|
|
23
|
+
M -5 -4 -9 -8 -3 -10 -12 -2 -5 0 10 -7 -10 -4 -6 -8 -4 -3 -13 -11 0
|
|
24
|
+
N -6 -8 -1 -5 -10 -5 -3 -9 -3 -9 -7 8 -8 -4 -5 -2 -4 -9 -10 -6 0
|
|
25
|
+
P -4 -9 -6 -6 -9 -8 -6 -10 -6 -7 -10 -8 9 -5 -7 -4 -6 -7 -9 -15 0
|
|
26
|
+
Q -5 -13 -4 -1 -8 -8 -2 -9 -2 -6 -4 -4 -5 9 -2 -4 -5 -7 -15 -12 0
|
|
27
|
+
R -7 -7 -12 -10 -10 -7 -3 -8 0 -8 -6 -5 -7 -2 8 -6 -6 -9 -8 -7 0
|
|
28
|
+
S -2 -3 -5 -5 -7 -4 -5 -9 -5 -8 -8 -2 -4 -4 -6 7 -1 -8 -8 -6 0
|
|
29
|
+
T -3 -5 -6 -6 -8 -8 -5 -5 -4 -7 -4 -4 -6 -5 -6 -1 8 -4 -15 -8 0
|
|
30
|
+
V -3 -3 -9 -7 -6 -10 -8 1 -8 -3 -3 -9 -7 -7 -9 -8 -4 7 -13 -8 0
|
|
31
|
+
W -9 -15 -10 -16 -3 -9 -6 -6 -9 -6 -13 -10 -9 -15 -8 -8 -15 -13 12 -2 0
|
|
32
|
+
Y -8 -4 -14 -7 0 -10 -1 -8 -8 -6 -11 -6 -15 -12 -7 -6 -8 -8 -2 9 0
|
|
33
|
+
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# NUCL in 1/2 Bit
|
|
2
|
+
# Background (precomputed optional): 0.2499975 0.2499975 0.2499975 0.2499975 0.00001
|
|
3
|
+
# Lambda (precomputed optional): 0.6337314
|
|
4
|
+
A C T G X
|
|
5
|
+
A 2.0000 -3.0000 -3.0000 -3.0000 -3.0000
|
|
6
|
+
C -3.0000 2.0000 -3.0000 -3.0000 -3.0000
|
|
7
|
+
T -3.0000 -3.0000 2.0000 -3.0000 -3.0000
|
|
8
|
+
G -3.0000 -3.0000 -3.0000 2.0000 -3.0000
|
|
9
|
+
X -3.0000 -3.0000 -3.0000 -3.0000 -3.0000
|
oxymetag/data/oxygen_model.rds
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|