oxymetag 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
oxymetag/__init__.py ADDED
@@ -0,0 +1,18 @@
"""
OxyMetaG: Oxygen metabolism profiling from metagenomic data.

Importing the package exposes the read-extraction, profiling and prediction
functions together with the dependency/setup helpers.
"""

# Package metadata.
__version__ = "1.0.0"
__author__ = "Clifton P. Bueno de Mesquita"
__email__ = "cliff.buenodemesquita@colorado.edu"

# Re-export the public API from the implementation modules.
from .core import extract_reads, profile_samples, predict_aerobes
from .utils import check_dependencies, run_kraken2_setup

__all__ = [
    "extract_reads", "profile_samples", "predict_aerobes",
    "check_dependencies", "run_kraken2_setup",
]
oxymetag/cli.py ADDED
@@ -0,0 +1,107 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command line interface for OxyMetaG
4
+ """
5
+
6
+ import sys
7
+ import argparse
8
+ import logging
9
+
10
+ from . import __version__
11
+ from .core import extract_reads, profile_samples, predict_aerobes
12
+ from .utils import check_dependencies, run_kraken2_setup, OxyMetaGError
13
+
14
# Configure logging at import time: INFO and above, each record prefixed with
# timestamp, logger name and severity. The CLI logs under the 'oxymetag' name.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('oxymetag')
20
+
21
+
22
def _build_parser():
    """Assemble the ``oxymetag`` argument parser and its four sub-commands."""
    parser = argparse.ArgumentParser(
        prog="oxymetag",
        description="OxyMetaG: Oxygen metabolism profiling from metagenomic data",
    )
    parser.add_argument('--version', action='version', version=f'OxyMetaG {__version__}')

    commands = parser.add_subparsers(dest='command', help='Available commands')

    # setup: no options of its own.
    commands.add_parser('setup', help='Setup Kraken2 database')

    # extract: Kraken2-based bacterial read extraction.
    extract = commands.add_parser('extract', help='Extract bacterial reads')
    extract.add_argument('-i', '--input', nargs='+', required=True,
                         help='Input fastq.gz files')
    extract.add_argument('-o', '--output', default='BactReads',
                         help='Output directory (default: BactReads)')
    extract.add_argument('-t', '--threads', type=int, default=48,
                         help='Number of threads (default: 48)')
    extract.add_argument('--kraken-db', default='kraken2_db',
                         help='Kraken2 database path (default: kraken2_db)')

    # profile: DIAMOND search of the extracted reads.
    profile = commands.add_parser('profile', help='Profile samples with DIAMOND')
    profile.add_argument('-i', '--input', default='BactReads',
                         help='Input directory (default: BactReads)')
    profile.add_argument('-o', '--output', default='diamond_output',
                         help='Output directory (default: diamond_output)')
    profile.add_argument('-t', '--threads', type=int, default=4,
                         help='Number of threads (default: 4)')
    profile.add_argument('--diamond-db',
                         help='DIAMOND database path (default: package database)')

    # predict: aerobe-level prediction from the DIAMOND tables.
    predict = commands.add_parser('predict', help='Predict aerobe levels')
    predict.add_argument('-i', '--input', default='diamond_output',
                         help='Input directory (default: diamond_output)')
    predict.add_argument('-o', '--output', default='per_aerobe_predictions.tsv',
                         help='Output file (default: per_aerobe_predictions.tsv)')
    predict.add_argument('-t', '--threads', type=int, default=4,
                         help='Number of threads (default: 4)')
    predict.add_argument('-m', '--mode', choices=['modern', 'ancient', 'custom'],
                         default='modern', help='Filtering mode (default: modern)')
    predict.add_argument('--idcut', type=float,
                         help='Custom identity cutoff (for custom mode)')
    predict.add_argument('--bitcut', type=float,
                         help='Custom bitscore cutoff (for custom mode)')
    predict.add_argument('--ecut', type=float,
                         help='Custom e-value cutoff (for custom mode)')

    return parser


def main():
    """
    Main CLI interface.

    Parses ``sys.argv``, verifies external dependencies, and runs the selected
    sub-command. With no sub-command the help text is printed and the process
    exits with status 1; any exception raised while running a command is
    logged and also ends the process with status 1.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        sys.exit(1)

    # One zero-argument runner per sub-command; argparse has already rejected
    # any command name that is not listed here.
    runners = {
        'setup': lambda: run_kraken2_setup(),
        'extract': lambda: extract_reads(args.input, args.output, args.threads,
                                         args.kraken_db),
        'profile': lambda: profile_samples(args.input, args.output, args.threads,
                                           args.diamond_db),
        'predict': lambda: predict_aerobes(args.input, args.output, args.mode,
                                           args.idcut, args.bitcut, args.ecut,
                                           args.threads),
    }

    try:
        check_dependencies()
        runners[args.command]()
        logger.info("Command completed successfully")
    except OxyMetaGError as e:
        logger.error(f"Error: {e}")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report anything unexpected and exit non-zero.
        logger.error(f"Unexpected error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
oxymetag/core.py ADDED
@@ -0,0 +1,226 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Core functions for OxyMetaG
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import subprocess
9
+ import glob
10
+ from pathlib import Path
11
+ import pandas as pd
12
+ import logging
13
+ from typing import List, Optional
14
+
15
+ logger = logging.getLogger('oxymetag')
16
+
17
class OxyMetaGError(Exception):
    """
    Custom exception for OxyMetaG errors.

    Raised by the functions in this module when a required database or input
    is missing, or when the external prediction step fails.
    """
    # NOTE(review): cli.py imports ``OxyMetaGError`` from ``.utils`` rather than
    # from this module. Confirm both names refer to the same class, otherwise
    # errors raised here are reported by the CLI as "Unexpected error".
20
+
21
def _swap_mate_token(text, mate):
    """
    Rewrite the forward-mate token (``_R1`` or ``_1``) found in *text*.

    A token only counts when it is not immediately followed by another digit,
    so names such as ``sample_10`` are not mistaken for forward reads. When
    several tokens are present the last ``_R1`` wins over any ``_1``, and only
    that single occurrence is rewritten.

    Returns *text* with the token's ``1`` replaced by *mate* (the whole token
    is removed when *mate* is empty), or ``None`` if *text* has no such token.
    """
    import re  # function-scope import keeps the module's import block untouched

    tokens = list(re.finditer(r'_(R?)1(?![0-9])', text))
    if not tokens:
        return None
    r1_tokens = [tok for tok in tokens if tok.group(1)]
    token = (r1_tokens or tokens)[-1]
    replacement = f"_{token.group(1)}{mate}" if mate else ""
    return text[:token.start()] + replacement + text[token.end():]


def extract_reads(input_files: List[str], output_dir: str = "BactReads",
                  threads: int = 48, kraken_db: str = "kraken2_db"):
    """
    Extract bacterial reads from metagenomic samples using Kraken2

    Every existing input file is classified with ``kraken2``; reads assigned
    to taxid 2 (children included) are then extracted with
    ``extract_kraken_reads.py`` and gzip-compressed. A file whose name carries
    a forward-mate token (``_R1`` / ``_1``) is run in paired mode when the
    matching ``_R2`` / ``_2`` file sits next to it, otherwise as single-end.

    Outputs written to *output_dir* per sample:
    ``<sample>_kraken.out``, ``<sample>_report.txt`` and
    ``<name>_bacterial.fastq.gz`` (one per mate).

    Args:
        input_files: Paths of the FASTQ files to process. Missing files are
            skipped with a warning.
        output_dir: Directory for all outputs; created (with parents) if absent.
        threads: Thread count handed to ``kraken2``.
        kraken_db: Path to the Kraken2 database.

    Raises:
        OxyMetaGError: If *kraken_db* does not exist.

    A failing external command is logged and processing moves on to the next
    input file; nothing is returned.
    """
    logger.info(f"Starting bacterial read extraction with {threads} threads")

    if not Path(kraken_db).exists():
        raise OxyMetaGError(f"Kraken2 database not found: {kraken_db}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for input_file in input_files:
        input_path = Path(input_file)
        if not input_path.exists():
            logger.warning(f"Input file not found: {input_file}")
            continue

        logger.info(f"Processing {input_file}")

        base_name = input_path.stem.replace('.fastq', '').replace('.gz', '')

        # Sample name without the mate token; None means "not a forward read".
        kraken_base = _swap_mate_token(base_name, "")
        is_forward = kraken_base is not None
        if not is_forward:
            kraken_base = base_name

        kraken_output = output_path / f"{kraken_base}_kraken.out"
        kraken_report = output_path / f"{kraken_base}_report.txt"

        # Look for the reverse mate next to the input. Only the file name is
        # rewritten, never the directory part of the path.
        r2_path = None
        if is_forward:
            mate_name = _swap_mate_token(input_path.name, "2")
            candidate = input_path.with_name(mate_name) if mate_name else None
            if candidate is not None and candidate.exists():
                r2_path = candidate
            else:
                logger.warning(f"R2 file not found for {input_file}, treating as single-end")

        kraken_cmd = [
            'kraken2', '--db', kraken_db, '--threads', str(threads),
            '--output', str(kraken_output), '--report', str(kraken_report),
        ]
        if r2_path is not None:
            kraken_cmd += ['--paired', str(input_path), str(r2_path)]
        else:
            kraken_cmd.append(str(input_path))

        bacterial_reads = output_path / f"{base_name}_bacterial.fastq"
        extracted = [bacterial_reads]

        extract_cmd = [
            'extract_kraken_reads.py',
            '-k', str(kraken_output),
            '-s', str(input_path),
            '-o', str(bacterial_reads),
            '-r', str(kraken_report),
            '--taxid', '2',
            '--include-children',
            # Without this flag the tool writes FASTA records, which would not
            # match the ".fastq" names used for the outputs.
            '--fastq-output',
        ]
        if r2_path is not None:
            r2_output = output_path / f"{_swap_mate_token(base_name, '2')}_bacterial.fastq"
            extract_cmd += ['-s2', str(r2_path), '-o2', str(r2_output)]
            extracted.append(r2_output)

        try:
            subprocess.run(kraken_cmd, check=True)
            logger.info(f"Kraken2 classification completed for {input_file}")

            subprocess.run(extract_cmd, check=True)
            for reads in extracted:
                # -f: overwrite a .gz left by a previous run instead of failing.
                subprocess.run(['gzip', '-f', str(reads)], check=True)

            logger.info(f"Bacterial reads extracted for {input_file}")

        except subprocess.CalledProcessError as e:
            # Best effort: report the failure and carry on with the next file.
            logger.error(f"Failed to process {input_file}: {e}")
109
+
110
+
111
def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_output",
                    threads: int = 4, diamond_db: Optional[str] = None):
    """
    Profile samples using DIAMOND blastx against Pfam database

    Bacterial read files in *input_dir* are located with the first of these
    patterns that matches anything: ``*_R1_bacterial.fastq.gz``,
    ``*_1_bacterial.fastq.gz``, ``*_bacterial.fastq.gz``. Each file is searched
    with ``diamond blastx`` and the tabular result is written to
    ``<output_dir>/<sample>_diamond.tsv``.

    Args:
        input_dir: Directory holding the ``*_bacterial.fastq.gz`` files.
        output_dir: Directory for the DIAMOND tables; created (with parents)
            if absent.
        threads: Number of CPU threads DIAMOND may use.
        diamond_db: Path to a DIAMOND database. When None, the packaged
            ``oxymetag_pfams.dmnd`` is used.

    Raises:
        OxyMetaGError: If the DIAMOND database is missing or no bacterial read
            files are found in *input_dir*.

    A failing DIAMOND run is logged and the remaining files are still
    processed; nothing is returned.
    """
    from .utils import get_package_data_path

    logger.info(f"Starting sample profiling with {threads} threads")

    if diamond_db is None:
        diamond_db = get_package_data_path("oxymetag_pfams.dmnd")

    if not Path(diamond_db).exists():
        raise OxyMetaGError(f"DIAMOND database not found: {diamond_db}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    input_path = Path(input_dir)
    input_files = []

    # Ordered from most to least specific; only the first pattern that matches
    # is used, so paired data contributes its forward reads only.
    patterns = [
        '*_R1_bacterial.fastq.gz',
        '*_1_bacterial.fastq.gz',
        '*_bacterial.fastq.gz',
    ]

    for pattern in patterns:
        found_files = sorted(input_path.glob(pattern))
        if found_files:
            input_files.extend(found_files)
            logger.info(f"Found {len(found_files)} files using pattern: {pattern}")
            break

    if not input_files:
        # Show a few of the FASTQ files that are present to help diagnose naming.
        all_files = list(input_path.glob("*.fastq.gz"))
        logger.error(f"FASTQ files in {input_dir}: {[f.name for f in all_files[:5]]}")
        raise OxyMetaGError(f"No bacterial read files found in {input_dir}")

    for input_file in input_files:
        base_name = input_file.stem.replace('.fastq', '').replace('.gz', '')
        base_name = base_name.replace('_R1_bacterial', '').replace('_1_bacterial', '').replace('_bacterial', '')

        logger.info(f"Processing {input_file}")

        output_file = output_path / f"{base_name}_diamond.tsv"

        cmd = [
            'diamond', 'blastx',
            '-d', diamond_db,
            '-q', str(input_file),
            '-o', str(output_file),
            # Honour the requested thread count; without this option DIAMOND
            # picks the number of threads itself.
            '--threads', str(threads),
            '-f', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qstart', 'qend',
            'sstart', 'send', 'evalue', 'bitscore',
        ]

        try:
            subprocess.run(cmd, check=True)
            logger.info(f"DIAMOND profiling completed for {input_file}")

        except subprocess.CalledProcessError as e:
            # Best effort: report the failure and carry on with the next file.
            logger.error(f"Failed to process {input_file}: {e}")
173
+
174
+
175
def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_aerobe_predictions.tsv",
                    mode: str = "modern", id_cut: float = None, bit_cut: float = None,
                    e_cut: float = None, threads: int = 4):
    """
    Predict aerobe levels from DIAMOND results

    Resolves the identity / bitscore / e-value cutoffs for *mode*, runs the
    packaged ``predict_oxygen.R`` script through ``Rscript`` and loads the
    table it writes.

    Args:
        input_dir: Directory with the DIAMOND result tables.
        output_file: Tab-separated file the R script is expected to create.
        mode: ``"modern"``, ``"ancient"`` or ``"custom"``.
        id_cut, bit_cut, e_cut: Cutoffs used in ``"custom"`` mode; all three
            are required there.
        threads: Accepted for interface symmetry; not used by this function.

    Returns:
        pandas.DataFrame read from *output_file*.

    Raises:
        OxyMetaGError: On an unknown mode, missing custom cutoffs, a missing
            input directory, a failing R script, or a missing output file.
    """
    from .utils import get_package_data_path

    logger.info(f"Starting aerobe level prediction in {mode} mode")

    if mode == "modern":
        cutoffs = (60.0, 50.0, 0.001)
    elif mode == "ancient":
        cutoffs = (45.0, 25.0, 0.1)
    elif mode == "custom":
        if id_cut is None or bit_cut is None or e_cut is None:
            raise OxyMetaGError("Custom mode requires id_cut, bit_cut, and e_cut parameters")
        cutoffs = (id_cut, bit_cut, e_cut)
    else:
        raise OxyMetaGError("Mode must be 'modern', 'ancient', or 'custom'")
    identity_cutoff, bitscore_cutoff, evalue_cutoff = cutoffs

    data_root = Path(get_package_data_path("")).parent
    package_data_dir = str(data_root / "data")
    r_script_path = get_package_data_path("../scripts/predict_oxygen.R")

    if not Path(input_dir).exists():
        raise OxyMetaGError(f"Input directory not found: {input_dir}")

    # Positional arguments for the R script; note the cutoffs are passed as
    # identity, e-value, bitscore.
    cmd = ['Rscript', r_script_path, input_dir, output_file, package_data_dir, mode]
    cmd += [str(identity_cutoff), str(evalue_cutoff), str(bitscore_cutoff)]

    try:
        logger.info(f"Calling R script: {' '.join(cmd)}")
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"R script failed: {e}")
        if e.stderr:
            logger.error(f"R stderr: {e.stderr}")
        raise OxyMetaGError(f"Aerobe prediction failed: {e}")
    else:
        logger.info("R script completed successfully")
        if result.stdout:
            logger.info(f"R output: {result.stdout}")

    if not Path(output_file).exists():
        raise OxyMetaGError(f"Output file was not created: {output_file}")

    results_df = pd.read_csv(output_file, sep='\t')
    logger.info(f"Results saved to {output_file}")
    return results_df
Binary file
@@ -0,0 +1,21 @@
1
+ Pfam,Oxygen,Other_Taxa
2
+ PF00042,aerobic,"Plant, Animal"
3
+ PF00115,aerobic,"Plant, Animal"
4
+ PF00116,aerobic,"Plant, Animal"
5
+ PF00296,aerobic,Archaea
6
+ PF00510,aerobic,"Plant, Animal, Fungus"
7
+ PF00916,aerobic,"Plant, Animal, Fungus"
8
+ PF01152,aerobic,"Plant, Protist"
9
+ PF01521,aerobic,"Plant, Animal, Fungus"
10
+ PF01871,anaerobic,"Plant, Animal, Fungus"
11
+ PF02579,anaerobic,Archaea
12
+ PF02906,anaerobic,"Plant, Animal, Fungus"
13
+ PF03063,anaerobic,Archaea
14
+ PF05425,aerobic,None
15
+ PF05721,aerobic,"Plant, Animal, Fungus"
16
+ PF08530,aerobic,None
17
+ PF10371,anaerobic,None
18
+ PF13597,anaerobic,Archaea
19
+ PF16870,aerobic,"Animal, Fungus"
20
+ PF17773,aerobic,"Plant, Animal"
21
+ PF17910,anaerobic,Archaea
Binary file
Binary file