oxymetag 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oxymetag/__init__.py +18 -0
- oxymetag/cli.py +107 -0
- oxymetag/core.py +226 -0
- oxymetag/data/.DS_Store +0 -0
- oxymetag/data/Oxygen_pfams.csv +21 -0
- oxymetag/data/oxygen_model.rds +0 -0
- oxymetag/data/oxymetag_pfams.dmnd +0 -0
- oxymetag/data/pfam_headers_table.txt +3870 -0
- oxymetag/data/pfam_lengths.tsv +21 -0
- oxymetag/scripts/predict_oxygen.R +160 -0
- oxymetag/utils.py +73 -0
- oxymetag-1.0.0.dist-info/LICENSE +674 -0
- oxymetag-1.0.0.dist-info/METADATA +235 -0
- oxymetag-1.0.0.dist-info/RECORD +18 -0
- oxymetag-1.0.0.dist-info/WHEEL +5 -0
- oxymetag-1.0.0.dist-info/entry_points.txt +2 -0
- oxymetag-1.0.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
oxymetag/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""OxyMetaG: Oxygen metabolism profiling from metagenomic data.

Package entry point: defines the package metadata and re-exports the
pipeline functions from ``core`` and the helper functions from ``utils``.
"""

# Package metadata.
__version__ = "1.0.0"
__author__ = "Clifton P. Bueno de Mesquita"
__email__ = "cliff.buenodemesquita@colorado.edu"

from .core import extract_reads, profile_samples, predict_aerobes
from .utils import check_dependencies, run_kraken2_setup

# Names exported by ``from oxymetag import *``.
__all__ = [
    "extract_reads",
    "profile_samples",
    "predict_aerobes",
    "check_dependencies",
    "run_kraken2_setup",
]
|
oxymetag/cli.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Command line interface for OxyMetaG
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import sys
|
|
7
|
+
import argparse
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
from . import __version__
|
|
11
|
+
from .core import extract_reads, profile_samples, predict_aerobes
|
|
12
|
+
from .utils import check_dependencies, run_kraken2_setup, OxyMetaGError
|
|
13
|
+
|
|
14
|
+
# Configure the root logger when the CLI module is imported and create the
# 'oxymetag' logger used by the command line interface.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger('oxymetag')
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _build_parser():
    """Create the ``oxymetag`` argument parser with its four sub-commands."""
    parser = argparse.ArgumentParser(
        description="OxyMetaG: Oxygen metabolism profiling from metagenomic data",
        prog="oxymetag"
    )
    parser.add_argument('--version', action='version', version=f'OxyMetaG {__version__}')

    subparsers = parser.add_subparsers(dest='command', help='Available commands')

    # Setup command (takes no options)
    subparsers.add_parser('setup', help='Setup Kraken2 database')

    # Extract command
    extract_parser = subparsers.add_parser('extract', help='Extract bacterial reads')
    extract_parser.add_argument('-i', '--input', nargs='+', required=True,
                                help='Input fastq.gz files')
    extract_parser.add_argument('-o', '--output', default='BactReads',
                                help='Output directory (default: BactReads)')
    extract_parser.add_argument('-t', '--threads', type=int, default=48,
                                help='Number of threads (default: 48)')
    extract_parser.add_argument('--kraken-db', default='kraken2_db',
                                help='Kraken2 database path (default: kraken2_db)')

    # Profile command
    profile_parser = subparsers.add_parser('profile', help='Profile samples with DIAMOND')
    profile_parser.add_argument('-i', '--input', default='BactReads',
                                help='Input directory (default: BactReads)')
    profile_parser.add_argument('-o', '--output', default='diamond_output',
                                help='Output directory (default: diamond_output)')
    profile_parser.add_argument('-t', '--threads', type=int, default=4,
                                help='Number of threads (default: 4)')
    profile_parser.add_argument('--diamond-db',
                                help='DIAMOND database path (default: package database)')

    # Predict command
    predict_parser = subparsers.add_parser('predict', help='Predict aerobe levels')
    predict_parser.add_argument('-i', '--input', default='diamond_output',
                                help='Input directory (default: diamond_output)')
    predict_parser.add_argument('-o', '--output', default='per_aerobe_predictions.tsv',
                                help='Output file (default: per_aerobe_predictions.tsv)')
    predict_parser.add_argument('-t', '--threads', type=int, default=4,
                                help='Number of threads (default: 4)')
    predict_parser.add_argument('-m', '--mode', choices=['modern', 'ancient', 'custom'],
                                default='modern', help='Filtering mode (default: modern)')
    predict_parser.add_argument('--idcut', type=float,
                                help='Custom identity cutoff (for custom mode)')
    predict_parser.add_argument('--bitcut', type=float,
                                help='Custom bitscore cutoff (for custom mode)')
    predict_parser.add_argument('--ecut', type=float,
                                help='Custom e-value cutoff (for custom mode)')

    return parser


def main(argv=None):
    """Run the OxyMetaG command line interface.

    Args:
        argv: Optional list of command line arguments, without the program
            name.  When ``None`` (the default, used by the console entry
            point) argparse reads ``sys.argv[1:]``.

    The process exits with status 1 when no sub-command is given, when a
    pipeline step raises ``OxyMetaGError``, or when any other exception
    escapes the selected command.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # The sub-command is optional for argparse, so reject a bare invocation here.
    if not args.command:
        parser.print_help()
        sys.exit(1)

    try:
        # Dependencies are checked before every command, including 'setup'.
        check_dependencies()

        if args.command == 'setup':
            run_kraken2_setup()

        elif args.command == 'extract':
            extract_reads(args.input, args.output, args.threads, args.kraken_db)

        elif args.command == 'profile':
            profile_samples(args.input, args.output, args.threads, args.diamond_db)

        elif args.command == 'predict':
            predict_aerobes(args.input, args.output, args.mode,
                            args.idcut, args.bitcut, args.ecut, args.threads)

        logger.info("Command completed successfully")

    except OxyMetaGError as e:
        logger.error(f"Error: {e}")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything unforeseen and fail the process.
        logger.error(f"Unexpected error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
oxymetag/core.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Core functions for OxyMetaG
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
import subprocess
|
|
9
|
+
import glob
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
import pandas as pd
|
|
12
|
+
import logging
|
|
13
|
+
from typing import List, Optional
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger('oxymetag')
|
|
16
|
+
|
|
17
|
+
class OxyMetaGError(Exception):
    """Error raised when an OxyMetaG pipeline step cannot run or fails."""
|
|
20
|
+
|
|
21
|
+
def _swap_mate_tag(text: str, mate: str) -> Optional[str]:
    """Rewrite the last forward-read tag (``_R1`` or ``_1``) found in *text*.

    A tag only counts when it is a complete token, i.e. it is followed by
    ``_``, ``.`` or the end of the string, so a name such as ``sample_10`` is
    not mistaken for a forward read.  ``_R1`` takes precedence over ``_1``.

    Args:
        text: File name or base name to inspect.
        mate: Digit that replaces the ``1`` (e.g. ``"2"``); an empty string
            removes the tag altogether.

    Returns:
        The rewritten string, or ``None`` when *text* carries no tag.
    """
    import re  # local import: ``re`` is not among this module's top-level imports

    for prefix in ("_R", "_"):
        hits = list(re.finditer(re.escape(prefix) + r"1(?=[_.]|$)", text))
        if hits:
            last = hits[-1]
            replacement = f"{prefix}{mate}" if mate else ""
            return text[:last.start()] + replacement + text[last.end():]
    return None


def extract_reads(input_files: List[str], output_dir: str = "BactReads",
                  threads: int = 48, kraken_db: str = "kraken2_db"):
    """
    Extract bacterial reads from metagenomic samples using Kraken2

    Each input file is classified with ``kraken2``; reads assigned to taxid 2
    (including children) are then pulled out with ``extract_kraken_reads.py``
    and the resulting files are gzipped.  A file whose name carries a
    forward-read tag (``_R1`` or ``_1``) is run in paired mode when the
    matching ``_R2``/``_2`` file exists next to it, otherwise as single-end.

    Args:
        input_files: Paths of the FASTQ files to process.
        output_dir: Directory receiving Kraken2 outputs and extracted reads;
            created (with parents) if missing.
        threads: Thread count passed to ``kraken2``.
        kraken_db: Path of the Kraken2 database.

    Raises:
        OxyMetaGError: If ``kraken_db`` does not exist.

    Missing input files are skipped with a warning, and a failing external
    command is logged and the file skipped; neither aborts the run.
    """
    logger.info(f"Starting bacterial read extraction with {threads} threads")

    if not Path(kraken_db).exists():
        raise OxyMetaGError(f"Kraken2 database not found: {kraken_db}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for input_file in input_files:
        input_path = Path(input_file)
        if not input_path.exists():
            logger.warning(f"Input file not found: {input_file}")
            continue

        logger.info(f"Processing {input_file}")

        base_name = input_path.stem.replace('.fastq', '').replace('.gz', '')

        # Kraken2 outputs are named after the sample, i.e. without the mate tag.
        kraken_base = _swap_mate_tag(base_name, "")
        is_forward = kraken_base is not None
        if not is_forward:
            kraken_base = base_name

        kraken_output = output_path / f"{kraken_base}_kraken.out"
        kraken_report = output_path / f"{kraken_base}_report.txt"

        # Locate the reverse-read file once.  Only the file name is rewritten,
        # never the directory part of the path.
        r2_path = None
        r2_output = None
        if is_forward:
            r2_name = _swap_mate_tag(input_path.name, "2")
            candidate = input_path.with_name(r2_name) if r2_name else None
            if candidate is not None and candidate.exists():
                r2_path = candidate
                r2_output = output_path / f"{_swap_mate_tag(base_name, '2')}_bacterial.fastq"
            else:
                logger.warning(f"R2 file not found for {input_file}, treating as single-end")

        kraken_cmd = [
            'kraken2', '--db', kraken_db, '--threads', str(threads),
            '--output', str(kraken_output), '--report', str(kraken_report),
        ]
        if r2_path is not None:
            kraken_cmd.extend(['--paired', str(input_path), str(r2_path)])
        else:
            kraken_cmd.append(str(input_path))

        try:
            subprocess.run(kraken_cmd, check=True)
            logger.info(f"Kraken2 classification completed for {input_file}")

            bacterial_reads = output_path / f"{base_name}_bacterial.fastq"

            # NOTE(review): no --fastq-output flag is passed; confirm against
            # the KrakenTools documentation whether the extracted files are
            # FASTA despite the .fastq name.
            extract_cmd = [
                'extract_kraken_reads.py',
                '-k', str(kraken_output),
                '-s', str(input_path),
                '-o', str(bacterial_reads),
                '-r', str(kraken_report),
                '--taxid', '2',
                '--include-children'
            ]
            if r2_path is not None:
                extract_cmd.extend(['-s2', str(r2_path)])
                extract_cmd.extend(['-o2', str(r2_output)])

            subprocess.run(extract_cmd, check=True)
            subprocess.run(['gzip', str(bacterial_reads)], check=True)

            if r2_path is not None:
                subprocess.run(['gzip', str(r2_output)], check=True)

            logger.info(f"Bacterial reads extracted for {input_file}")

        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to process {input_file}: {e}")
            continue
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_output",
                    threads: int = 4, diamond_db: Optional[str] = None):
    """
    Profile samples using DIAMOND blastx against Pfam database

    Bacterial read files are looked up in ``input_dir`` with three glob
    patterns, tried in order (``*_R1_bacterial.fastq.gz``,
    ``*_1_bacterial.fastq.gz``, ``*_bacterial.fastq.gz``); only the files of
    the first pattern that matches anything are used.  Each file is searched
    with ``diamond blastx`` and the tabular hits are written to
    ``<output_dir>/<sample>_diamond.tsv``.

    Args:
        input_dir: Directory containing the extracted bacterial reads.
        output_dir: Directory receiving the DIAMOND tables; created (with
            parents) if missing.
        threads: Thread count passed to DIAMOND via ``--threads``.
        diamond_db: Path of the DIAMOND database; ``None`` selects the
            ``oxymetag_pfams.dmnd`` file resolved through
            ``get_package_data_path``.

    Raises:
        OxyMetaGError: If the database does not exist or no bacterial read
            file is found in ``input_dir``.

    A failing DIAMOND run is logged and the file skipped.
    """
    from .utils import get_package_data_path

    logger.info(f"Starting sample profiling with {threads} threads")

    if diamond_db is None:
        diamond_db = get_package_data_path("oxymetag_pfams.dmnd")

    if not Path(diamond_db).exists():
        raise OxyMetaGError(f"DIAMOND database not found: {diamond_db}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    input_path = Path(input_dir)
    input_files = []

    # Most specific pattern first: when forward-read files exist, the generic
    # pattern (which would also match the reverse reads) is never reached.
    patterns = [
        '*_R1_bacterial.fastq.gz',
        '*_1_bacterial.fastq.gz',
        '*_bacterial.fastq.gz'
    ]

    for pattern in patterns:
        found_files = list(input_path.glob(pattern))
        if found_files:
            input_files.extend(found_files)
            logger.info(f"Found {len(found_files)} files using pattern: {pattern}")
            break

    if not input_files:
        # Show a few of the FASTQ files that are present to help diagnose naming issues.
        all_files = list(input_path.glob("*.fastq.gz"))
        logger.error(f"FASTQ files in {input_dir}: {[f.name for f in all_files[:5]]}")
        raise OxyMetaGError(f"No bacterial read files found in {input_dir}")

    for input_file in input_files:
        base_name = input_file.stem.replace('.fastq', '').replace('.gz', '')
        base_name = base_name.replace('_R1_bacterial', '').replace('_1_bacterial', '').replace('_bacterial', '')

        logger.info(f"Processing {input_file}")

        output_file = output_path / f"{base_name}_diamond.tsv"

        cmd = [
            'diamond', 'blastx',
            '-d', str(diamond_db),
            '-q', str(input_file),
            '-o', str(output_file),
            # Honour the requested thread count (previously only logged).
            '--threads', str(threads),
            '-f', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'
        ]

        try:
            subprocess.run(cmd, check=True)
            logger.info(f"DIAMOND profiling completed for {input_file}")

        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to process {input_file}: {e}")
            continue
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_aerobe_predictions.tsv",
                   mode: str = "modern", id_cut: float = None, bit_cut: float = None,
                   e_cut: float = None, threads: int = 4):
    """
    Predict aerobe levels from DIAMOND results

    Chooses the hit-filtering cutoffs for ``mode``, runs the packaged
    ``predict_oxygen.R`` script through ``Rscript`` and loads the table it
    writes.

    Args:
        input_dir: Directory holding the DIAMOND output tables.
        output_file: Tab-separated file the R script is asked to write.
        mode: ``"modern"`` (identity 60, bitscore 50, e-value 0.001),
            ``"ancient"`` (identity 45, bitscore 25, e-value 0.1) or
            ``"custom"`` (uses ``id_cut``, ``bit_cut`` and ``e_cut``).
        id_cut: Identity cutoff, required in custom mode.
        bit_cut: Bitscore cutoff, required in custom mode.
        e_cut: E-value cutoff, required in custom mode.
        threads: Accepted for interface symmetry; not used by this function.

    Returns:
        The contents of ``output_file`` as a pandas DataFrame.

    Raises:
        OxyMetaGError: On an unknown mode, missing custom cutoffs, a missing
            input directory, a failing R script, or a missing output file.
    """
    from .utils import get_package_data_path

    logger.info(f"Starting aerobe level prediction in {mode} mode")

    # Reject unknown modes first, then resolve the three cutoffs.
    if mode not in ("modern", "ancient", "custom"):
        raise OxyMetaGError("Mode must be 'modern', 'ancient', or 'custom'")

    if mode == "custom":
        custom_cutoffs = (id_cut, bit_cut, e_cut)
        if any(value is None for value in custom_cutoffs):
            raise OxyMetaGError("Custom mode requires id_cut, bit_cut, and e_cut parameters")
        identity_cutoff, bitscore_cutoff, evalue_cutoff = custom_cutoffs
    elif mode == "modern":
        identity_cutoff, bitscore_cutoff, evalue_cutoff = 60.0, 50.0, 0.001
    else:
        identity_cutoff, bitscore_cutoff, evalue_cutoff = 45.0, 25.0, 0.1

    data_root = Path(get_package_data_path(""))
    package_data_dir = str(data_root.parent / "data")
    r_script_path = get_package_data_path("../scripts/predict_oxygen.R")

    if not Path(input_dir).exists():
        raise OxyMetaGError(f"Input directory not found: {input_dir}")

    # Positional arguments for the R script; note the cutoff order is
    # identity, e-value, bitscore.
    cmd = ['Rscript', r_script_path, input_dir, output_file, package_data_dir, mode]
    cmd += [str(identity_cutoff), str(evalue_cutoff), str(bitscore_cutoff)]

    try:
        logger.info(f"Calling R script: {' '.join(cmd)}")
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        logger.info("R script completed successfully")
        if result.stdout:
            logger.info(f"R output: {result.stdout}")

    except subprocess.CalledProcessError as e:
        logger.error(f"R script failed: {e}")
        if e.stderr:
            logger.error(f"R stderr: {e.stderr}")
        raise OxyMetaGError(f"Aerobe prediction failed: {e}")

    # A zero exit status is not enough: the script must have written its table.
    if not Path(output_file).exists():
        raise OxyMetaGError(f"Output file was not created: {output_file}")

    results_df = pd.read_csv(output_file, sep='\t')
    logger.info(f"Results saved to {output_file}")
    return results_df
|
oxymetag/data/.DS_Store
ADDED
|
Binary file
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
Pfam,Oxygen,Other_Taxa
|
|
2
|
+
PF00042,aerobic,"Plant, Animal"
|
|
3
|
+
PF00115,aerobic,"Plant, Animal"
|
|
4
|
+
PF00116,aerobic,"Plant, Animal"
|
|
5
|
+
PF00296,aerobic,Archaea
|
|
6
|
+
PF00510,aerobic,"Plant, Animal, Fungus"
|
|
7
|
+
PF00916,aerobic,"Plant, Animal, Fungus"
|
|
8
|
+
PF01152,aerobic,"Plant, Protist"
|
|
9
|
+
PF01521,aerobic,"Plant, Animal, Fungus"
|
|
10
|
+
PF01871,anaerobic,"Plant, Animal, Fungus"
|
|
11
|
+
PF02579,anaerobic,Archaea
|
|
12
|
+
PF02906,anaerobic,"Plant, Animal, Fungus"
|
|
13
|
+
PF03063,anaerobic,Archaea
|
|
14
|
+
PF05425,aerobic,None
|
|
15
|
+
PF05721,aerobic,"Plant, Animal, Fungus"
|
|
16
|
+
PF08530,aerobic,None
|
|
17
|
+
PF10371,anaerobic,None
|
|
18
|
+
PF13597,anaerobic,Archaea
|
|
19
|
+
PF16870,aerobic,"Animal, Fungus"
|
|
20
|
+
PF17773,aerobic,"Plant, Animal"
|
|
21
|
+
PF17910,anaerobic,Archaea
|
|
Binary file
|
|
Binary file
|