oxymetag 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
oxymetag/__init__.py CHANGED
@@ -2,7 +2,7 @@
2
2
  OxyMetaG: Oxygen metabolism profiling from metagenomic data
3
3
  """
4
4
 
5
- __version__ = "1.0.0"
5
+ __version__ = "1.1.0"
6
6
  __author__ = "Clifton P. Bueno de Mesquita"
7
7
  __email__ = "cliff.buenodemesquita@colorado.edu"
8
8
 
oxymetag/cli.py CHANGED
@@ -11,7 +11,6 @@ from . import __version__
11
11
  from .core import extract_reads, profile_samples, predict_aerobes
12
12
  from .utils import check_dependencies, run_kraken2_setup, OxyMetaGError
13
13
 
14
- # Set up logging
15
14
  logging.basicConfig(
16
15
  level=logging.INFO,
17
16
  format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
@@ -29,10 +28,8 @@ def main():
29
28
 
30
29
  subparsers = parser.add_subparsers(dest='command', help='Available commands')
31
30
 
32
- # Setup command
33
31
  setup_parser = subparsers.add_parser('setup', help='Setup Kraken2 database')
34
32
 
35
- # Extract command
36
33
  extract_parser = subparsers.add_parser('extract', help='Extract bacterial reads')
37
34
  extract_parser.add_argument('-i', '--input', nargs='+', required=True,
38
35
  help='Input fastq.gz files')
@@ -43,27 +40,31 @@ def main():
43
40
  extract_parser.add_argument('--kraken-db', default='kraken2_db',
44
41
  help='Kraken2 database path (default: kraken2_db)')
45
42
 
46
- # Profile command
47
- profile_parser = subparsers.add_parser('profile', help='Profile samples with DIAMOND')
43
+ profile_parser = subparsers.add_parser('profile', help='Profile samples with DIAMOND or MMseqs2')
48
44
  profile_parser.add_argument('-i', '--input', default='BactReads',
49
45
  help='Input directory (default: BactReads)')
50
- profile_parser.add_argument('-o', '--output', default='diamond_output',
51
- help='Output directory (default: diamond_output)')
46
+ profile_parser.add_argument('-o', '--output', default=None,
47
+ help='Output directory (default: diamond_output or mmseqs_output)')
52
48
  profile_parser.add_argument('-t', '--threads', type=int, default=4,
53
49
  help='Number of threads (default: 4)')
50
+ profile_parser.add_argument('-m', '--method', choices=['diamond', 'mmseqs2'],
51
+ default='diamond',
52
+ help='Profiling method (default: diamond)')
54
53
  profile_parser.add_argument('--diamond-db',
55
54
  help='DIAMOND database path (default: package database)')
55
+ profile_parser.add_argument('--mmseqs-db',
56
+ help='MMseqs2 database path (default: package database)')
56
57
 
57
- # Predict command
58
58
  predict_parser = subparsers.add_parser('predict', help='Predict aerobe levels')
59
- predict_parser.add_argument('-i', '--input', default='diamond_output',
60
- help='Input directory (default: diamond_output)')
59
+ predict_parser.add_argument('-i', '--input', default=None,
60
+ help='Input directory (default: diamond_output for modern, mmseqs_output for ancient)')
61
61
  predict_parser.add_argument('-o', '--output', default='per_aerobe_predictions.tsv',
62
62
  help='Output file (default: per_aerobe_predictions.tsv)')
63
63
  predict_parser.add_argument('-t', '--threads', type=int, default=4,
64
64
  help='Number of threads (default: 4)')
65
65
  predict_parser.add_argument('-m', '--mode', choices=['modern', 'ancient', 'custom'],
66
- default='modern', help='Filtering mode (default: modern)')
66
+ default='modern',
67
+ help='Filtering mode: modern=DIAMOND, ancient=MMseqs2 (default: modern)')
67
68
  predict_parser.add_argument('--idcut', type=float,
68
69
  help='Custom identity cutoff (for custom mode)')
69
70
  predict_parser.add_argument('--bitcut', type=float,
@@ -87,11 +88,12 @@ def main():
87
88
  extract_reads(args.input, args.output, args.threads, args.kraken_db)
88
89
 
89
90
  elif args.command == 'profile':
90
- profile_samples(args.input, args.output, args.threads, args.diamond_db)
91
+ profile_samples(args.input, args.output, args.threads, args.method,
92
+ args.diamond_db, args.mmseqs_db)
91
93
 
92
94
  elif args.command == 'predict':
93
95
  predict_aerobes(args.input, args.output, args.mode,
94
- args.idcut, args.bitcut, args.ecut, args.threads)
96
+ args.idcut, args.bitcut, args.ecut, args.threads)
95
97
 
96
98
  logger.info("Command completed successfully")
97
99
 
oxymetag/core.py CHANGED
@@ -108,20 +108,18 @@ def extract_reads(input_files: List[str], output_dir: str = "BactReads",
108
108
  continue
109
109
 
110
110
 
111
- def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_output",
112
- threads: int = 4, diamond_db: str = None):
111
+ def profile_samples(input_dir: str = "BactReads", output_dir: str = None,
112
+ threads: int = 4, method: str = "diamond",
113
+ diamond_db: str = None, mmseqs_db: str = None):
113
114
  """
114
- Profile samples using DIAMOND blastx against Pfam database
115
+ Profile samples using DIAMOND or MMseqs2 against Pfam database
115
116
  """
116
117
  from .utils import get_package_data_path
117
118
 
118
- logger.info(f"Starting sample profiling with {threads} threads")
119
+ logger.info(f"Starting sample profiling with {method} using {threads} threads")
119
120
 
120
- if diamond_db is None:
121
- diamond_db = get_package_data_path("oxymetag_pfams.dmnd")
122
-
123
- if not Path(diamond_db).exists():
124
- raise OxyMetaGError(f"DIAMOND database not found: {diamond_db}")
121
+ if output_dir is None:
122
+ output_dir = 'diamond_output' if method == 'diamond' else 'mmseqs_output'
125
123
 
126
124
  output_path = Path(output_dir)
127
125
  output_path.mkdir(exist_ok=True)
@@ -132,6 +130,8 @@ def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_out
132
130
  patterns = [
133
131
  '*_R1_bacterial.fastq.gz',
134
132
  '*_1_bacterial.fastq.gz',
133
+ '*_bacterial_R1.fastq.gz',
134
+ '*_bacterial_1.fastq.gz',
135
135
  '*_bacterial.fastq.gz'
136
136
  ]
137
137
 
@@ -147,9 +147,32 @@ def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_out
147
147
  logger.error(f"FASTQ files in {input_dir}: {[f.name for f in all_files[:5]]}")
148
148
  raise OxyMetaGError(f"No bacterial read files found in {input_dir}")
149
149
 
150
+ if method == 'diamond':
151
+ _profile_with_diamond(input_files, output_path, threads, diamond_db)
152
+ elif method == 'mmseqs2':
153
+ _profile_with_mmseqs(input_files, output_path, threads, mmseqs_db)
154
+ else:
155
+ raise OxyMetaGError(f"Unknown method: {method}")
156
+
157
+
158
+ def _profile_with_diamond(input_files: List[Path], output_path: Path,
159
+ threads: int, diamond_db: str = None):
160
+ """Profile samples using DIAMOND blastx"""
161
+ from .utils import get_package_data_path
162
+
163
+ if diamond_db is None:
164
+ diamond_db = get_package_data_path("oxymetag_pfams.dmnd")
165
+
166
+ if not Path(diamond_db).exists():
167
+ raise OxyMetaGError(f"DIAMOND database not found: {diamond_db}")
168
+
150
169
  for input_file in input_files:
151
170
  base_name = input_file.stem.replace('.fastq', '').replace('.gz', '')
152
- base_name = base_name.replace('_R1_bacterial', '').replace('_1_bacterial', '').replace('_bacterial', '')
171
+ base_name = (base_name.replace('_R1_bacterial', '')
172
+ .replace('_1_bacterial', '')
173
+ .replace('_bacterial_R1', '')
174
+ .replace('_bacterial_1', '')
175
+ .replace('_bacterial', ''))
153
176
 
154
177
  logger.info(f"Processing {input_file}")
155
178
 
@@ -172,20 +195,95 @@ def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_out
172
195
  continue
173
196
 
174
197
 
175
- def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_aerobe_predictions.tsv",
198
+ def _profile_with_mmseqs(input_files: List[Path], output_path: Path,
199
+ threads: int, mmseqs_db: str = None):
200
+ """Profile samples using MMseqs2 easy-search"""
201
+ from .utils import get_package_data_path
202
+
203
+ if mmseqs_db is None:
204
+ mmseqs_db = get_package_data_path("oxymetag_pfams_n117_db")
205
+
206
+ if not Path(mmseqs_db).exists():
207
+ raise OxyMetaGError(f"MMseqs2 database not found: {mmseqs_db}")
208
+
209
+ data_dir = Path(get_package_data_path(""))
210
+ vtml_matrix = data_dir / "VTML20.out"
211
+ nucl_matrix = data_dir / "nucleotide.out"
212
+
213
+ if not vtml_matrix.exists():
214
+ raise OxyMetaGError(f"VTML20.out matrix not found: {vtml_matrix}")
215
+ if not nucl_matrix.exists():
216
+ raise OxyMetaGError(f"nucleotide.out matrix not found: {nucl_matrix}")
217
+
218
+ for input_file in input_files:
219
+ base_name = input_file.stem.replace('.fastq', '').replace('.gz', '')
220
+ base_name = (base_name.replace('_R1_bacterial', '')
221
+ .replace('_1_bacterial', '')
222
+ .replace('_bacterial_R1', '')
223
+ .replace('_bacterial_1', '')
224
+ .replace('_bacterial', ''))
225
+
226
+ logger.info(f"Processing {input_file} with MMseqs2")
227
+
228
+ output_file = output_path / f"{base_name}_mmseqs.tsv"
229
+ tmp_dir = output_path / f"{base_name}_tmp"
230
+ tmp_dir.mkdir(exist_ok=True)
231
+
232
+ cmd = [
233
+ 'mmseqs', 'easy-search',
234
+ str(input_file),
235
+ str(mmseqs_db),
236
+ str(output_file),
237
+ str(tmp_dir),
238
+ '--min-length', '12',
239
+ '-e', '10.0',
240
+ '--min-seq-id', '0.86',
241
+ '-c', '0.65',
242
+ '--cov-mode', '2',
243
+ '--format-mode', '0',
244
+ '--format-output', 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen,cigar,qaln,taln',
245
+ '--comp-bias-corr', '0',
246
+ '--mask', '0',
247
+ '--exact-kmer-matching', '1',
248
+ '--sub-mat', f'aa:{vtml_matrix},nucl:{nucl_matrix}',
249
+ '--seed-sub-mat', f'aa:{vtml_matrix},nucl:{nucl_matrix}',
250
+ '-s', '2',
251
+ '-k', '6',
252
+ '--spaced-kmer-pattern', '11011101',
253
+ '--max-seqs', '10000',
254
+ '--max-rejected', '10',
255
+ '--threads', str(threads),
256
+ '--remove-tmp-files', '0',
257
+ '--use-all-table-starts', '1'
258
+ ]
259
+
260
+ try:
261
+ subprocess.run(cmd, check=True)
262
+ logger.info(f"MMseqs2 profiling completed for {input_file}")
263
+
264
+ except subprocess.CalledProcessError as e:
265
+ logger.error(f"Failed to process {input_file}: {e}")
266
+ continue
267
+
268
+
269
+ def predict_aerobes(input_dir: str = None, output_file: str = "per_aerobe_predictions.tsv",
176
270
  mode: str = "modern", id_cut: float = None, bit_cut: float = None,
177
271
  e_cut: float = None, threads: int = 4):
178
272
  """
179
- Predict aerobe levels from DIAMOND results
273
+ Predict aerobe levels from DIAMOND or MMseqs2 results
274
+ Mode determines method: modern=DIAMOND, ancient=MMseqs2
180
275
  """
181
276
  from .utils import get_package_data_path
182
277
 
183
278
  logger.info(f"Starting aerobe level prediction in {mode} mode")
184
279
 
280
+ if input_dir is None:
281
+ input_dir = 'diamond_output' if mode == 'modern' else 'mmseqs_output'
282
+
185
283
  if mode == "modern":
186
284
  identity_cutoff, bitscore_cutoff, evalue_cutoff = 60.0, 50.0, 0.001
187
285
  elif mode == "ancient":
188
- identity_cutoff, bitscore_cutoff, evalue_cutoff = 45.0, 25.0, 0.1
286
+ identity_cutoff, bitscore_cutoff, evalue_cutoff = 86.0, 50.0, 0.001
189
287
  elif mode == "custom":
190
288
  if any(x is None for x in [id_cut, bit_cut, e_cut]):
191
289
  raise OxyMetaGError("Custom mode requires id_cut, bit_cut, and e_cut parameters")
@@ -194,7 +292,8 @@ def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_a
194
292
  raise OxyMetaGError("Mode must be 'modern', 'ancient', or 'custom'")
195
293
 
196
294
  package_data_dir = str(Path(get_package_data_path("")).parent / "data")
197
- r_script_path = get_package_data_path("../scripts/predict_oxygen.R")
295
+ package_base = Path(__file__).parent
296
+ r_script_path = str(package_base / "scripts" / "predict_oxygen.R")
198
297
 
199
298
  if not Path(input_dir).exists():
200
299
  raise OxyMetaGError(f"Input directory not found: {input_dir}")
@@ -223,4 +322,4 @@ def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_a
223
322
  logger.info(f"Results saved to {output_file}")
224
323
  return results_df
225
324
  else:
226
- raise OxyMetaGError(f"Output file was not created: {output_file}")
325
+ raise OxyMetaGError(f"Output file was not created: {output_file}")
@@ -0,0 +1,33 @@
1
+ # VTML_20
2
+ #
3
+ # This matrix was produced from: vtml_20qij.mat using vtml_P.mat background frequencies
4
+ #
5
+ # VTML_20 substitution matrix, Units = bits/2.0
6
+ # Expected score = -2.916179 bits; Entropy = 2.912514 bits
7
+ # Target fraction identity = 0.8307
8
+ # Lowest Score = -16, Highest Score= 12
9
+ #
10
+ # Background (precomputed optional): 0.0721 0.0135 0.0522 0.0728 0.038 0.0804 0.0256 0.0699 0.0703 0.107 0.0232 0.0503 0.0393 0.0339 0.0523 0.0698 0.05 0.0683 0.0143 0.0384 0.00001
11
+ # Lambda (precomputed optional): 0.34657
12
+ A C D E F G H I K L M N P Q R S T V W Y X
13
+ A 7 -3 -6 -5 -8 -4 -7 -7 -6 -7 -5 -6 -4 -5 -7 -2 -3 -3 -9 -8 0
14
+ C -3 12 -14 -14 -13 -7 -6 -5 -13 -12 -4 -8 -9 -13 -7 -3 -5 -3 -15 -4 0
15
+ D -6 -14 8 -1 -16 -6 -4 -12 -5 -15 -9 -1 -6 -4 -12 -5 -6 -9 -10 -14 0
16
+ E -5 -14 -1 7 -14 -6 -6 -10 -2 -8 -8 -5 -6 -1 -10 -5 -6 -7 -16 -7 0
17
+ F -8 -13 -16 -14 9 -11 -5 -5 -14 -3 -3 -10 -9 -8 -10 -7 -8 -6 -3 0 0
18
+ G -4 -7 -6 -6 -11 7 -7 -15 -7 -11 -10 -5 -8 -8 -7 -4 -8 -10 -9 -10 0
19
+ H -7 -6 -4 -6 -5 -7 10 -9 -5 -7 -12 -3 -6 -2 -3 -5 -5 -8 -6 -1 0
20
+ I -7 -5 -12 -10 -5 -15 -9 7 -9 -2 -2 -9 -10 -9 -8 -9 -5 1 -6 -8 0
21
+ K -6 -13 -5 -2 -14 -7 -5 -9 7 -8 -5 -3 -6 -2 0 -5 -4 -8 -9 -8 0
22
+ L -7 -12 -15 -8 -3 -11 -7 -2 -8 6 0 -9 -7 -6 -8 -8 -7 -3 -6 -6 0
23
+ M -5 -4 -9 -8 -3 -10 -12 -2 -5 0 10 -7 -10 -4 -6 -8 -4 -3 -13 -11 0
24
+ N -6 -8 -1 -5 -10 -5 -3 -9 -3 -9 -7 8 -8 -4 -5 -2 -4 -9 -10 -6 0
25
+ P -4 -9 -6 -6 -9 -8 -6 -10 -6 -7 -10 -8 9 -5 -7 -4 -6 -7 -9 -15 0
26
+ Q -5 -13 -4 -1 -8 -8 -2 -9 -2 -6 -4 -4 -5 9 -2 -4 -5 -7 -15 -12 0
27
+ R -7 -7 -12 -10 -10 -7 -3 -8 0 -8 -6 -5 -7 -2 8 -6 -6 -9 -8 -7 0
28
+ S -2 -3 -5 -5 -7 -4 -5 -9 -5 -8 -8 -2 -4 -4 -6 7 -1 -8 -8 -6 0
29
+ T -3 -5 -6 -6 -8 -8 -5 -5 -4 -7 -4 -4 -6 -5 -6 -1 8 -4 -15 -8 0
30
+ V -3 -3 -9 -7 -6 -10 -8 1 -8 -3 -3 -9 -7 -7 -9 -8 -4 7 -13 -8 0
31
+ W -9 -15 -10 -16 -3 -9 -6 -6 -9 -6 -13 -10 -9 -15 -8 -8 -15 -13 12 -2 0
32
+ Y -8 -4 -14 -7 0 -10 -1 -8 -8 -6 -11 -6 -15 -12 -7 -6 -8 -8 -2 9 0
33
+ X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
@@ -0,0 +1,9 @@
1
+ # NUCL in 1/2 Bit
2
+ # Background (precomputed optional): 0.2499975 0.2499975 0.2499975 0.2499975 0.00001
3
+ # Lambda (precomputed optional): 0.6337314
4
+ A C T G X
5
+ A 2.0000 -3.0000 -3.0000 -3.0000 -3.0000
6
+ C -3.0000 2.0000 -3.0000 -3.0000 -3.0000
7
+ T -3.0000 -3.0000 2.0000 -3.0000 -3.0000
8
+ G -3.0000 -3.0000 -3.0000 2.0000 -3.0000
9
+ X -3.0000 -3.0000 -3.0000 -3.0000 -3.0000
Binary file
Binary file
Binary file