oxymetag 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,7 +25,7 @@ bitcut <- as.numeric(args[7])
25
25
 
26
26
  predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut) {
27
27
 
28
- # Load package data files (these come from oxymetag/data/)
28
+ # Load package data files
29
29
  map_file <- file.path(package_data_dir, "pfam_headers_table.txt")
30
30
  lengths_file <- file.path(package_data_dir, "pfam_lengths.tsv")
31
31
  model_file <- file.path(package_data_dir, "oxygen_model.rds")
@@ -47,19 +47,45 @@ predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut
47
47
 
48
48
  # Load the trained model and oxygen classifications
49
49
  oxygen_model <- readRDS(model_file)
50
-
51
- # Get aerobic and anaerobic pfam lists (you'll need to define these)
52
50
  oxygen_pfams <- read.csv(pfams_file, stringsAsFactors = FALSE)
53
51
  aerobic_pfams <- oxygen_pfams %>% filter(Oxygen == "aerobic")
54
52
  anaerobic_pfams <- oxygen_pfams %>% filter(Oxygen == "anaerobic")
55
53
 
56
- # Find diamond output files (user's working directory)
57
- files <- list.files(input_dir, pattern = "*_diamond.tsv", full.names = TRUE)
54
+ # Determine method and file pattern based on mode
55
+ if (mode == "modern") {
56
+ method <- "diamond"
57
+ file_pattern <- "*_diamond.tsv"
58
+ } else if (mode == "ancient") {
59
+ method <- "mmseqs2"
60
+ file_pattern <- "*_mmseqs.tsv"
61
+ } else if (mode == "custom") {
62
+ # For custom mode, try to detect which files exist
63
+ diamond_files <- list.files(input_dir, pattern = "*_diamond.tsv", full.names = TRUE)
64
+ mmseqs_files <- list.files(input_dir, pattern = "*_mmseqs.tsv", full.names = TRUE)
65
+
66
+ if (length(diamond_files) > 0 && length(mmseqs_files) == 0) {
67
+ method <- "diamond"
68
+ file_pattern <- "*_diamond.tsv"
69
+ } else if (length(mmseqs_files) > 0 && length(diamond_files) == 0) {
70
+ method <- "mmseqs2"
71
+ file_pattern <- "*_mmseqs.tsv"
72
+ } else if (length(diamond_files) > 0 && length(mmseqs_files) > 0) {
73
+ stop("Both DIAMOND and MMseqs2 files found. Please specify input directory with only one type.")
74
+ } else {
75
+ stop("No profiling output files found in input directory.")
76
+ }
77
+ } else {
78
+ stop(paste("Unknown mode:", mode))
79
+ }
80
+
81
+ files <- list.files(input_dir, pattern = file_pattern, full.names = TRUE)
58
82
 
59
83
  if (length(files) == 0) {
60
- stop(paste("No *_diamond.tsv files found in", input_dir))
84
+ stop(paste("No", file_pattern, "files found in", input_dir))
61
85
  }
62
86
 
87
+ message("Processing ", length(files), " files using ", method, " output format")
88
+
63
89
  # Initialize results dataframe
64
90
  results <- data.frame(
65
91
  SampleID = character(length(files)),
@@ -75,29 +101,50 @@ predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut
75
101
 
76
102
  # Extract sample ID from filename
77
103
  sample_id <- basename(files[i])
78
- sample_id <- gsub("_diamond.tsv$", "", sample_id)
104
+ sample_id <- gsub("_diamond.tsv$|_mmseqs.tsv$", "", sample_id)
79
105
  results$SampleID[i] <- sample_id
80
106
 
81
- # Read and filter diamond output
107
+ # Read and filter profiling output
82
108
  if (file.size(files[i]) == 0) {
83
109
  message("Warning: Empty file ", files[i])
84
110
  next
85
111
  }
86
112
 
87
- d <- read.table(files[i], stringsAsFactors = FALSE) %>%
88
- set_names(c("qseqid", "sseqid", "pident", "length", "qstart", "qend",
89
- "sstart", "send", "evalue", "bitscore"))
90
-
91
- # Apply filtering based on mode
92
- if (mode == "modern") {
93
- d <- d %>%
94
- filter(pident >= 60, evalue < 0.001, bitscore >= 50)
95
- } else if (mode == "ancient") {
96
- d <- d %>%
97
- filter(pident >= 45, evalue < 0.1, bitscore >= 25)
98
- } else if (mode == "custom") {
99
- d <- d %>%
100
- filter(pident >= idcut, evalue < ecut, bitscore >= bitcut)
113
+ # Read data based on method
114
+ if (method == "diamond") {
115
+ d <- read.table(files[i], stringsAsFactors = FALSE) %>%
116
+ set_names(c("qseqid", "sseqid", "pident", "length", "qstart", "qend",
117
+ "sstart", "send", "evalue", "bitscore")) %>%
118
+ left_join(map, by = c("sseqid" = "Header")) %>%
119
+ group_by(qseqid) %>%
120
+ slice_max(bitscore, n = 1, with_ties = FALSE) %>%
121
+ ungroup()
122
+
123
+ # Apply filtering based on mode
124
+ if (mode == "modern") {
125
+ d <- d %>% filter(pident >= 60, evalue < 0.001, bitscore >= 50)
126
+ } else if (mode == "custom") {
127
+ d <- d %>% filter(pident >= idcut, evalue < ecut, bitscore >= bitcut)
128
+ }
129
+
130
+ } else if (method == "mmseqs2") {
131
+ d <- read.table(files[i], stringsAsFactors = FALSE) %>%
132
+ set_names(c("query", "target", "pident", "length", "mismatch", "gapopen",
133
+ "qstart", "qend", "tstart", "tend", "evalue", "bitscore",
134
+ "qlen", "tlen", "cigar", "qaln", "taln")) %>%
135
+ mutate(pident = pident * 100) %>%
136
+ left_join(map, by = c("target" = "Header")) %>%
137
+ group_by(query) %>%
138
+ slice_max(bitscore, n = 1, with_ties = FALSE) %>%
139
+ ungroup() %>%
140
+ filter(Pfam %in% oxygen_pfams$Pfam)
141
+
142
+ # Apply filtering based on mode
143
+ if (mode == "ancient") {
144
+ d <- d %>% filter(pident >= 86, evalue < 0.001, bitscore >= 50)
145
+ } else if (mode == "custom") {
146
+ d <- d %>% filter(pident >= idcut, evalue < ecut, bitscore >= bitcut)
147
+ }
101
148
  }
102
149
 
103
150
  if (nrow(d) == 0) {
@@ -108,45 +155,46 @@ predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut
108
155
  next
109
156
  }
110
157
 
111
- # Join with pfam mapping
112
- d <- d %>% left_join(map, by = c("sseqid" = "Header"))
113
-
114
158
  # Count pfams
115
159
  pf_count <- as.data.frame(table(d$Pfam))
116
- results$Pfams[i] <- nrow(pf_count)
117
160
  results$aerobe_pfams[i] <- sum(as.character(pf_count$Var1) %in% aerobic_pfams$Pfam)
118
161
  results$anaerobe_pfams[i] <- sum(as.character(pf_count$Var1) %in% anaerobic_pfams$Pfam)
119
162
 
120
163
  # Calculate gene hits and length correction
121
164
  gene.hits <- d %>%
122
165
  group_by(Pfam) %>%
123
- summarise(total_count = n())
166
+ summarise(total_count = n(), .groups = 'drop')
124
167
 
125
168
  gene.hit.length.correction <- gene.hits %>%
126
- left_join(., pfam_gene_length, by = "Pfam") %>%
127
- mutate(RPK = total_count / (1000*Gene.length)) %>%
128
- left_join(., oxygen_pfams, by = "Pfam")
169
+ left_join(pfam_gene_length, by = "Pfam") %>%
170
+ mutate(RPK = total_count / (1000 * Gene.length)) %>%
171
+ left_join(oxygen_pfams, by = "Pfam")
129
172
 
130
173
  # Sum by oxygen type
131
174
  oxygen_rpk <- gene.hit.length.correction %>%
132
175
  group_by(Oxygen) %>%
133
- summarize(RPKsum = sum(RPK))
176
+ summarize(RPKsum = sum(RPK, na.rm = TRUE), .groups = 'drop')
134
177
 
135
- # Calculate the ratio and add it to the dataframe
136
- results$ratio[i] <- oxygen_rpk$RPKsum[1] / oxygen_rpk$RPKsum[2]
178
+ # Calculate ratio (aerobe/anaerobe)
179
+ aerobe_rpk <- oxygen_rpk$RPKsum[oxygen_rpk$Oxygen == "aerobic"]
180
+ anaerobe_rpk <- oxygen_rpk$RPKsum[oxygen_rpk$Oxygen == "anaerobic"]
181
+
182
+ if (length(anaerobe_rpk) == 0 || anaerobe_rpk == 0) {
183
+ results$ratio[i] <- ifelse(length(aerobe_rpk) > 0 && aerobe_rpk > 0, Inf, 0)
184
+ } else {
185
+ results$ratio[i] <- aerobe_rpk / anaerobe_rpk
186
+ }
137
187
 
138
- # Processing message
139
188
  message("Processed sample ", i, "/", length(files), ": ", sample_id)
140
189
  }
141
190
 
142
191
  # Make predictions using the GAM model
143
192
  new_data <- data.frame(ratio = results$ratio)
144
- results$Per_aerobe <- predict(oxygen_model, newdata = new_data)
193
+ results$Per_aerobe <- predict(oxygen_model, newdata = new_data, type = "response")
145
194
 
146
195
  # Constrain predictions to 0-100% and set to 100% if ratio > 35
147
196
  results <- results %>%
148
- mutate(Per_aerobe = ifelse(Per_aerobe > 100, 100, Per_aerobe)) %>%
149
- mutate(Per_aerobe = ifelse(Per_aerobe < 0, 0, Per_aerobe)) %>%
197
+ mutate(Per_aerobe = pmax(0, pmin(100, Per_aerobe))) %>%
150
198
  mutate(Per_aerobe = ifelse(ratio > 35, 100, Per_aerobe))
151
199
 
152
200
  # Save results
@@ -157,4 +205,4 @@ predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut
157
205
  }
158
206
 
159
207
  # Run the function
160
- predict_oxygen(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut)
208
+ predict_oxygen(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut)
oxymetag/utils.py CHANGED
@@ -4,19 +4,49 @@ Utility functions for OxyMetaG
4
4
  """
5
5
 
6
6
  import subprocess
7
- import pkg_resources
8
7
  from pathlib import Path
9
8
  import logging
10
9
 
10
+ # Use importlib.resources instead of deprecated pkg_resources
11
+ try:
12
+ from importlib.resources import files
13
+ use_importlib = True
14
+ except ImportError:
15
+ use_importlib = False
16
+
11
17
  logger = logging.getLogger('oxymetag')
12
18
 
13
19
  class OxyMetaGError(Exception):
14
20
  """Custom exception for OxyMetaG errors"""
15
21
  pass
16
22
 
23
+ def get_package_data_path(filename: str) -> str:
24
+ """Get path to package data files"""
25
+ if use_importlib:
26
+ try:
27
+ package_files = files('oxymetag')
28
+ if filename.startswith('../'):
29
+ parts = filename.split('/')
30
+ for part in parts:
31
+ if part == '..':
32
+ package_files = package_files.parent
33
+ elif part and part != '.':
34
+ package_files = package_files / part
35
+ return str(package_files)
36
+ else:
37
+ return str(package_files / 'data' / filename)
38
+ except:
39
+ pass
40
+
41
+ package_dir = Path(__file__).parent
42
+ if filename.startswith('../'):
43
+ return str(package_dir / filename)
44
+ else:
45
+ return str(package_dir / 'data' / filename)
46
+
17
47
  def check_dependencies():
18
48
  """Check if required external tools are available"""
19
- required_tools = ['kraken2', 'diamond', 'Rscript']
49
+ required_tools = ['kraken2', 'diamond', 'mmseqs', 'Rscript']
20
50
  missing_tools = []
21
51
 
22
52
  for tool in required_tools:
@@ -26,14 +56,6 @@ def check_dependencies():
26
56
  if missing_tools:
27
57
  raise OxyMetaGError(f"Missing required tools: {', '.join(missing_tools)}")
28
58
 
29
- def get_package_data_path(filename: str) -> str:
30
- """Get path to package data files"""
31
- try:
32
- return pkg_resources.resource_filename('oxymetag', f'data/{filename}')
33
- except:
34
- package_dir = Path(__file__).parent
35
- return str(package_dir / 'data' / filename)
36
-
37
59
  def run_kraken2_setup():
38
60
  """Download and set up standard Kraken2 database without fungi"""
39
61
  logger.info("Setting up Kraken2 database (bacteria, archaea, viral)...")
@@ -42,13 +64,11 @@ def run_kraken2_setup():
42
64
  db_path.mkdir(exist_ok=True)
43
65
 
44
66
  try:
45
- # Download taxonomy
46
67
  cmd = ['kraken2-build', '--download-taxonomy', '--db', str(db_path)]
47
68
  logger.info("Downloading taxonomy...")
48
69
  subprocess.run(cmd, check=True)
49
70
  logger.info("Taxonomy downloaded successfully")
50
71
 
51
- # Download libraries (excluding fungi)
52
72
  libraries = ['bacteria', 'archaea', 'viral']
53
73
  for lib in libraries:
54
74
  cmd = ['kraken2-build', '--download-library', lib, '--db', str(db_path)]
@@ -56,12 +76,10 @@ def run_kraken2_setup():
56
76
  subprocess.run(cmd, check=True)
57
77
  logger.info(f"{lib} library downloaded successfully")
58
78
 
59
- # Build database
60
79
  cmd = ['kraken2-build', '--build', '--db', str(db_path), '--threads', '48']
61
80
  logger.info("Building Kraken2 database...")
62
81
  subprocess.run(cmd, check=True)
63
82
 
64
- # Clean up temporary files to save space
65
83
  cmd = ['kraken2-build', '--clean', '--db', str(db_path)]
66
84
  logger.info("Cleaning up temporary files...")
67
85
  subprocess.run(cmd, check=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: oxymetag
3
- Version: 1.0.0
3
+ Version: 1.1.0
4
4
  Summary: Oxygen metabolism profiling from metagenomic data
5
5
  Home-page: https://github.com/cliffbueno/oxymetag
6
6
  Author: Clifton P. Bueno de Mesquita
@@ -25,9 +25,9 @@ Requires-Dist: numpy >=1.20.0
25
25
 
26
26
  Oxygen metabolism profiling from metagenomic data using Pfam domains. OxyMetaG predicts the percent relative abundance of aerobic bacteria in metagenomic reads based on the ratio of abundances of a set of 20 Pfams. It is recommended to use an HPC cluster or server rather than a laptop to run OxyMetaG due to memory requirements, particularly for the step of extracting bacterial reads. If you already have bacterial reads, the "profile" and "predict" functions will run quickly on a laptop.
27
27
 
28
- If you are working with modern metagenomes, we recommend first quality filtering the raw reads with your method of choice and standard practices, and then extracting bacterial reads with Kraken2 and KrakenTools, which is performed with the OxyMetaG extract function.
28
+ If you are working with modern metagenomes, we recommend first quality filtering the raw reads with your method of choice and standard practices, and then extracting bacterial reads with Kraken2 and KrakenTools, which is performed with the OxyMetaG extract function. For profiling modern metagenomes, use DIAMOND blastx with the default `-m modern` mode for the predict step.
29
29
 
30
- If you are working with ancient metagenomes, we recommend first quality filtering the raw reads with your method of choice and standard practices, and then extracting bacterial reads with a workflow optimized for ancient DNA, such as the one employed by De Sanctis et al. (2025).
30
+ If you are working with ancient metagenomes, we recommend first quality filtering the raw reads with your method of choice and standard practices, and then extracting bacterial reads with a workflow optimized for ancient DNA, such as the read mapping approach employed by De Sanctis et al. (2025). For profiling ancient metagenomes, use MMseqs2 with `-m mmseqs2` for the profile step and `-m ancient` for the predict step. The ancient mode uses parameters optimized for ancient DNA along with 97 decoy Pfams to reduce instances of false positives.
31
31
 
32
32
  ## Installation
33
33
 
@@ -46,16 +46,22 @@ conda env create -f environment.yml
46
46
  conda activate oxymetag
47
47
 
48
48
  # Install OxyMetaG
49
- pip install oxymetag
49
+ pip install -e .
50
+
51
+ # Index the MMseqs2 database (one-time setup, ~5-10 minutes)
52
+ mmseqs createindex oxymetag/data/oxymetag_pfams_n117_db tmp
50
53
  ```
51
54
 
55
+ **Note:** The MMseqs2 database indexing is optional but highly recommended for faster searches.
56
+
52
57
  ### Using Pip
53
58
 
54
59
  First install external dependencies:
55
60
  - Kraken2
56
61
  - DIAMOND
62
+ - MMseqs2
57
63
  - KrakenTools
58
- - R with mgcv and dplyr packages
64
+ - R with mgcv, dplyr, tidyr, and rlang packages
59
65
 
60
66
  Then install OxyMetaG:
61
67
  ```bash
@@ -64,30 +70,38 @@ pip install oxymetag
64
70
 
65
71
  ## Quick Start
66
72
 
67
- ### 1. Setup the standard Kraken2 database
73
+ ### Modern DNA workflow
74
+
68
75
  ```bash
76
+ # 1. Setup Kraken2 database (one-time)
69
77
  oxymetag setup
70
- ```
71
78
 
72
- ### 2. Extract bacterial reads
73
- ```bash
74
- oxymetag extract -i sample1_R1.fastq.gz sample1_R2.fastq.gz -o BactReads -t 48
75
- ```
79
+ # 2. Extract bacterial reads
80
+ oxymetag extract -i sample1_R1.fastq.gz -o BactReads -t 48
76
81
 
77
- ### 3. Profile samples
78
- ```bash
79
- oxymetag profile -i BactReads -o diamond_output -t 8
82
+ # 3. Profile samples with DIAMOND
83
+ oxymetag profile -i BactReads -o diamond_output -m diamond -t 8
84
+
85
+ # 4. Predict aerobe levels
86
+ oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m modern
80
87
  ```
81
88
 
82
- ### 4. Predict aerobe levels
89
+ ### Ancient DNA workflow
90
+
83
91
  ```bash
84
- # For modern DNA
85
- oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m modern
92
+ # 1. Extract bacterial reads (use ancient DNA-optimized workflow if available)
93
+ # If using oxymetag extract, same as modern workflow
94
+
95
+ # 2. Profile samples with MMseqs2
96
+ oxymetag profile -i BactReads -o mmseqs_output -m mmseqs2 -t 8
86
97
 
87
- # For ancient DNA
88
- oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m ancient
98
+ # 3. Predict aerobe levels with ancient mode
99
+ oxymetag predict -i mmseqs_output -o per_aerobe_predictions.tsv -m ancient
100
+ ```
101
+
102
+ ### Custom parameters
89
103
 
90
- # Custom cutoffs
104
+ ```bash
91
105
  oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m custom --idcut 50 --bitcut 30 --ecut 0.01
92
106
  ```
93
107
 
@@ -98,11 +112,11 @@ oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m custom --idc
98
112
 
99
113
  **What it does:** Downloads and builds the standard Kraken2 database containing bacterial, archaeal, and viral genomes. This database is used by the `extract` command to identify bacterial sequences from metagenomic samples.
100
114
 
101
- **Time:** 2-4 hours depending on internet speed and system performance.
115
+ **Time:** Depends on internet speed and system performance, but will likely take several hours (4-20 hours typical).
102
116
 
103
- **Output:** Creates a `kraken2_db/` directory with the standard database.
117
+ **Output:** Creates a `kraken2_db/` directory with the standard database (~90 GB).
104
118
 
105
- Make sure you run oxymetag setup from the directory where you want the database to live, or plan to always specify the --kraken-db path when running extract. The database is quite large (~50-100 GB), so choose a location with sufficient storage.
119
+ Make sure you run oxymetag setup from the directory where you want the database to live, or plan to always specify the --kraken-db path when running extract. The database is quite large, so choose a location with sufficient storage.
106
120
 
107
121
  ---
108
122
 
@@ -114,7 +128,7 @@ Make sure you run oxymetag setup from the directory where you want the database
114
128
  2. Uses KrakenTools to extract only the reads classified as bacterial
115
129
  3. Outputs cleaned bacterial-only FASTQ files for downstream analysis
116
130
 
117
- **Input:** Quality filtered metagenomic read FASTQ files (paired-end or merged)\
131
+ **Input:** Quality filtered metagenomic read FASTQ files (paired-end or merged)
118
132
  **Output:** Bacterial-only FASTQ files in `BactReads/` directory
119
133
 
120
134
  **Arguments:**
@@ -126,22 +140,26 @@ Make sure you run oxymetag setup from the directory where you want the database
126
140
  ---
127
141
 
128
142
  ### oxymetag profile
129
- **Function:** Profiles bacterial reads against oxygen metabolism protein domains.
143
+ **Function:** Profiles bacterial reads against oxygen metabolism protein domains using DIAMOND or MMseqs2.
130
144
 
131
145
  **What it does:**
132
146
  1. Takes bacterial-only reads from the `extract` step
133
- 2. Uses DIAMOND blastx to search against a curated database of 20 Pfam domains related to oxygen metabolism
147
+ 2. Uses DIAMOND blastx (for modern DNA) or MMseqs2 (for ancient DNA) to search against a curated database of Pfam domains related to oxygen metabolism
148
+ - DIAMOND mode: 20 target Pfams
149
+ - MMseqs2 mode: 20 target Pfams + 97 decoy Pfams (117 total) to reduce false positives
134
150
  3. Identifies protein-coding sequences and their functional annotations
135
151
  4. Creates detailed hit tables for each sample
136
152
 
137
- **Input:** Bacterial FASTQ files (uses R1 or merged reads only)\
138
- **Output:** DIAMOND alignment files (TSV format) in `diamond_output/` directory
153
+ **Input:** Bacterial FASTQ files (uses R1 or merged reads only)
154
+ **Output:** Alignment files (TSV format) in `diamond_output/` or `mmseqs_output/` directory
139
155
 
140
156
  **Arguments:**
141
157
  - `-i, --input`: Input directory with bacterial reads (default: BactReads)
142
- - `-o, --output`: Output directory (default: diamond_output)
158
+ - `-o, --output`: Output directory (default: diamond_output or mmseqs_output depending on method)
143
159
  - `-t, --threads`: Number of threads (default: 4)
144
- - `--diamond-db`: Custom DIAMOND database path (optional)
160
+ - `-m, --method`: Profiling method - 'diamond' or 'mmseqs2' (default: diamond)
161
+ - `--diamond-db`: Custom DIAMOND database path (optional, for diamond method)
162
+ - `--mmseqs-db`: Custom MMseqs2 database path (optional, for mmseqs2 method)
145
163
 
146
164
  ---
147
165
 
@@ -149,17 +167,24 @@ Make sure you run oxymetag setup from the directory where you want the database
149
167
  **Function:** Predicts aerobe abundance from protein domain profiles using machine learning.
150
168
 
151
169
  **What it does:**
152
- 1. Processes DIAMOND output files with appropriate quality filters
153
- 2. Normalizes protein domain counts by gene length (reads per kilobase)
154
- 3. Calculates aerobic/anaerobic domain ratios for each sample
155
- 4. Applies a trained GAM (Generalized Additive Model) to predict percentage of aerobes
156
- 5. Outputs a table with the sampleID, # Pfams detected, and predicted % aerobic bacteria
157
-
158
- **Input:** DIAMOND output directory from `profile` step\
170
+ 1. Processes DIAMOND or MMseqs2 output files with appropriate quality filters
171
+ 2. Selects the top hit per read based on bitscore
172
+ 3. For MMseqs2 (ancient mode): filters out decoy Pfams after selecting top hits
173
+ 4. Normalizes protein domain counts by gene length (reads per kilobase)
174
+ 5. Calculates aerobic/anaerobic domain ratios for each sample
175
+ 6. Applies a trained GAM (Generalized Additive Model) to predict percentage of aerobes
176
+ 7. Outputs a table with the sample ID, the aerobic/anaerobic ratio, the numbers of aerobic and anaerobic Pfams detected, and the predicted percentage of aerobic bacteria
177
+
178
+ **Input:** DIAMOND or MMseqs2 output directory from `profile` step
159
179
  **Output:** Tab-separated file with aerobe predictions for each sample
160
180
 
181
+ **Mode determines input type:**
182
+ - `-m modern`: Uses DIAMOND output (default input: diamond_output/)
183
+ - `-m ancient`: Uses MMseqs2 output (default input: mmseqs_output/)
184
+ - `-m custom`: Auto-detects DIAMOND or MMseqs2 files in input directory
185
+
161
186
  **Arguments:**
162
- - `-i, --input`: Directory with DIAMOND output (default: diamond_output)
187
+ - `-i, --input`: Directory with profiling output (default: diamond_output for modern, mmseqs_output for ancient)
163
188
  - `-o, --output`: Output file (default: per_aerobe_predictions.tsv)
164
189
  - `-t, --threads`: Number of threads (default: 4)
165
190
  - `-m, --mode`: Filtering mode - 'modern', 'ancient', or 'custom' (default: modern)
@@ -169,32 +194,56 @@ Make sure you run oxymetag setup from the directory where you want the database
169
194
 
170
195
  ## Filtering Modes
171
196
 
172
- OxyMetaG includes three pre-configured filtering modes optimized for different types of DNA:
197
+ OxyMetaG includes three filtering modes: two pre-configured modes optimized for modern and ancient DNA, and a custom mode with user-specified cutoffs. Whichever mode you use, we recommend also trying several different cutoffs (using `-m custom`) to check how sensitive the results are to them.
173
198
 
174
199
  ### Modern DNA (default)
175
- **Best for:** Modern environmental metagenomes
200
+ **Best for:** Modern environmental metagenomes
201
+ **Method:** DIAMOND blastx
202
+ **Filters:**
176
203
  - Identity ≥ 60%
177
204
  - Bitscore ≥ 50
178
205
  - E-value < 0.001
179
206
 
180
- ### Ancient DNA
181
- **Best for:** Archaeological samples, paleogenomic data, degraded environmental DNA
182
- - Identity 45% (accounts for DNA damage)
183
- - Bitscore 25 (accommodates shorter fragments)
184
- - E-value ≤ 0.1 (more permissive for low-quality data)
207
+ **Usage:**
208
+ ```bash
209
+ oxymetag profile -m diamond
210
+ oxymetag predict -m modern
211
+ ```
212
+
213
+ ### Ancient DNA
214
+ **Best for:** Archaeological samples, paleogenomic data, degraded environmental DNA
215
+ **Method:** MMseqs2 with decoy Pfams
216
+ **Filters:**
217
+ - Identity ≥ 86%
218
+ - Bitscore ≥ 50
219
+ - E-value < 0.001
220
+
221
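+ **Note:** The ancient mode uses a stricter identity cutoff than the modern mode and additionally employs 97 decoy Pfams to reduce false positives from damaged DNA. Only the top-scoring hit per read is kept, and reads whose top hit is a decoy rather than one of the 20 target Pfams are filtered out.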
222
+
223
+ **Usage:**
224
+ ```bash
225
+ oxymetag profile -m mmseqs2
226
+ oxymetag predict -m ancient
227
+ ```
185
228
 
186
229
  ### Custom
187
230
  **Best for:** Specialized applications or when you want to optimize parameters
188
231
  - Specify your own `--idcut`, `--bitcut`, and `--ecut` values
232
+ - Auto-detects whether input is from DIAMOND or MMseqs2
189
233
  - Useful for method development or unusual sample types
190
234
 
235
+ **Usage:**
236
+ ```bash
237
+ oxymetag predict -m custom --idcut 50 --bitcut 30 --ecut 0.01
238
+ ```
239
+
191
240
  ## Output
192
241
 
193
242
  The final output (`per_aerobe_predictions.tsv`) contains:
194
243
  - `SampleID`: Sample identifier extracted from filenames
195
244
  - `ratio`: Aerobic/anaerobic domain ratio
196
- - `aerobe_pfams`: Number of aerobic Pfam domains detected
197
- - `anaerobe_pfams`: Number of anaerobic Pfam domains detected
245
+ - `aerobe_pfams`: Number of aerobic Pfam domains detected (from 20 target Pfams)
246
+ - `anaerobe_pfams`: Number of anaerobic Pfam domains detected (from 20 target Pfams)
198
247
  - `Per_aerobe`: **Predicted percentage of aerobic bacteria (0-100%)**
199
248
 
200
249
  ## Biological Interpretation
@@ -212,17 +261,33 @@ The `Per_aerobe` value represents the predicted percentage of aerobic bacteria i
212
261
  If you use OxyMetaG in your research, please cite:
213
262
 
214
263
  ```
215
- Bueno de Mesquita, C.P., Stallard-Olivera, E., Fierer, N. (2025). Bueno de Mesquita, C.P. et al. (2025). Predicting the proportion of aerobic and anaerobic bacteria from metagenomic reads with OxyMetaG.
264
+ Bueno de Mesquita, C.P., Stallard-Olivera, E., Fierer, N. (2025).
265
+ Quantifying the oxygen preferences of bacterial communities using a
266
+ metagenome-based approach.
267
+ ```
268
+
269
+ ### Additional citations
270
+
271
+ If you use the **extract** function, also cite Kraken2 and KrakenTools:
272
+ ```
273
+ Lu, J., Rincon, N., Wood, D.E. et al. Metagenome analysis using the Kraken
274
+ software suite. Nat Protoc 17, 2815–2839 (2022).
275
+ https://doi.org/10.1038/s41596-022-00738-y
216
276
  ```
217
- If you use the extract function, also cite Kraken2 and KrakenTools:
218
277
 
278
+ If you use the **profile** function with DIAMOND (`-m diamond`), also cite:
219
279
  ```
220
- Lu, J., Rincon, N., Wood, D.E. et al. Metagenome analysis using the Kraken software suite. Nat Protoc 17, 2815–2839 (2022). https://doi.org/10.1038/s41596-022-00738-y
280
+ Buchfink, B., Xie, C. & Huson, D. Fast and sensitive protein alignment using
281
+ DIAMOND. Nat Methods 12, 59–60 (2015).
282
+ https://doi.org/10.1038/nmeth.3176
221
283
  ```
222
- If you use the profile function, also cite DIAMOND
223
284
 
285
+ If you use the **profile** function with MMseqs2 (`-m mmseqs2`), also cite:
224
286
  ```
225
- Buchfink, B., Xie, C. & Huson, D. Fast and sensitive protein alignment using DIAMOND. Nat Methods 12, 59–60 (2015). https://doi.org/10.1038/nmeth.3176
287
+ Steinegger, M., Söding, J. MMseqs2 enables sensitive protein sequence
288
+ searching for the analysis of massive data sets. Nat Biotechnol 35,
289
+ 1026–1028 (2017).
290
+ https://doi.org/10.1038/nbt.3988
226
291
  ```
227
292
 
228
293
  ## License
@@ -0,0 +1,29 @@
1
+ oxymetag/__init__.py,sha256=KaqG7NoPm89jxncdp4Ym3GNAI9h0MUDI-_4ugy6E7jQ,441
2
+ oxymetag/cli.py,sha256=GIXCxpBUQIRWgcSyQNudAWyuVWAAnxK1DRGXG8jpVzQ,4877
3
+ oxymetag/core.py,sha256=zLLVT6XsNb7qFtDlobIh44xgnOzXauYBCL8nNLA4XSk,12496
4
+ oxymetag/utils.py,sha256=fxzKYNUpsuBnYZxlh8QMIEg5wktV6BatoLasfawTeW8,3178
5
+ oxymetag/data/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
6
+ oxymetag/data/Oxygen_pfams.csv,sha256=f3_CFPy235BxbX-2Ami8dJpTLQDImM8e_m87QcJvYKo,673
7
+ oxymetag/data/VTML20.out,sha256=V7F_3HDTtVccoSeuryUJObU5bY3dbfAYHx1TLFixqA8,1930
8
+ oxymetag/data/nucleotide.out,sha256=0IBDiBP7-Q-DJg_Ky3y7Rbv3Dw4s01Tu7S5MtxVJQo4,396
9
+ oxymetag/data/oxygen_model.rds,sha256=UP_3sIJJHEWCfiSTZPJrsVti91-mEUWOn7PgfNcWEDo,8240
10
+ oxymetag/data/oxymetag_pfams.dmnd,sha256=sSrkriGi-x4Ybf-pQtRRCFh1Wjm4rbQeX9FL9buDbAE,1472431
11
+ oxymetag/data/oxymetag_pfams_n117.dmnd,sha256=yIh1qK3aD23VDmG_jsGnx9I7ahh7vV5dSUnBb1y9Eek,15572193
12
+ oxymetag/data/oxymetag_pfams_n117_db,sha256=2C0jpmNzEd-_F6J2QaBW8SeNvivJ4L21nqYU9WLQNsU,12197517
13
+ oxymetag/data/oxymetag_pfams_n117_db.dbtype,sha256=3z9hmASpL9tAVxktxD3XSOp3itxSvEmM6AUkwBS4ERk,4
14
+ oxymetag/data/oxymetag_pfams_n117_db.index,sha256=bzgoZhWEsmFx6IU7aO5ArTMDBb7_1yepVSEP0oRt5Ms,423068
15
+ oxymetag/data/oxymetag_pfams_n117_db.lookup,sha256=XmvjvMlDr9CPklK-iZYv4LD9O9mVgMHR0s0zaVlowM4,919596
16
+ oxymetag/data/oxymetag_pfams_n117_db.source,sha256=qeG0mSUfLERK--Fq8hqNrStPEOgyEhXzIXZixYDADYs,27
17
+ oxymetag/data/oxymetag_pfams_n117_db_h,sha256=xDBPxG39tF1Ez70jd3XxqdyPxlmSr3B0sf9T6eFJ0Xo,1542665
18
+ oxymetag/data/oxymetag_pfams_n117_db_h.dbtype,sha256=QvSuuBwe-B93Hz3oq8qdz2aQHFdVMOdnLksRRkdK5lA,4
19
+ oxymetag/data/oxymetag_pfams_n117_db_h.index,sha256=JQc2q-oSWwx-sv32muf51tZVgmDl0DgM6m1eJr5Z-V4,379408
20
+ oxymetag/data/pfam_headers_table.txt,sha256=wMg4WvlST6Zi3EzVFudjFHyREqNk8kHDI9Q6th7FdFY,255832
21
+ oxymetag/data/pfam_lengths.tsv,sha256=--0bGxDN2v_WiBo0rKFJMPeOPsOrbaNhyjPAoIF9E5A,366
22
+ oxymetag/scripts/predict_oxygen.R,sha256=NUv23y0l7BQxFUoxHqttlKveh6LYmO2qgP8hciGdpMs,7657
23
+ tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
+ oxymetag-1.1.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
25
+ oxymetag-1.1.0.dist-info/METADATA,sha256=LYHfJ-otW3Tckz7ZbTVxTpIvb-fSJi3S6yeiLK6ifHA,11745
26
+ oxymetag-1.1.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
27
+ oxymetag-1.1.0.dist-info/entry_points.txt,sha256=-9xMAfrSPtFBEvQWRNVKROTM_3OjEik34mVEsYFwM2k,47
28
+ oxymetag-1.1.0.dist-info/top_level.txt,sha256=G7EHL5Fpxne8CH3w5IDIkrsRmMzaEfOhmTngNovoYi8,15
29
+ oxymetag-1.1.0.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- oxymetag/__init__.py,sha256=GM_D7cuLGpBCmwCp3PkT42Q3mnryV3BDvMWjMBqt4L8,441
2
- oxymetag/cli.py,sha256=ITt3UNUFrdeIskjSaK-705oOcAPUlzq5acqlWCXVDqM,4478
3
- oxymetag/core.py,sha256=AkuXmhhI2It-QFdfykJcZ-e_6i124yS9RMM-BAH6RlU,8565
4
- oxymetag/utils.py,sha256=x-WVnR-yNawY13alF_8J9Ihkjpaeg15IlEpZXyn2JSU,2604
5
- oxymetag/data/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
6
- oxymetag/data/Oxygen_pfams.csv,sha256=f3_CFPy235BxbX-2Ami8dJpTLQDImM8e_m87QcJvYKo,673
7
- oxymetag/data/oxygen_model.rds,sha256=8BMWnnIKCALapQnJKLpMnqTymFNAW46E-cQJGU2tJu0,8221
8
- oxymetag/data/oxymetag_pfams.dmnd,sha256=sSrkriGi-x4Ybf-pQtRRCFh1Wjm4rbQeX9FL9buDbAE,1472431
9
- oxymetag/data/pfam_headers_table.txt,sha256=wMg4WvlST6Zi3EzVFudjFHyREqNk8kHDI9Q6th7FdFY,255832
10
- oxymetag/data/pfam_lengths.tsv,sha256=--0bGxDN2v_WiBo0rKFJMPeOPsOrbaNhyjPAoIF9E5A,366
11
- oxymetag/scripts/predict_oxygen.R,sha256=72Eum7XFtJ-Be5vdIqY8FFIuPnEWTGYDMaZWv5OTPtQ,5549
12
- tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- oxymetag-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
14
- oxymetag-1.0.0.dist-info/METADATA,sha256=eo8h97bwY2mGTgkTNpNjwomUpq6c_k8q4EEsyMiZDIk,8968
15
- oxymetag-1.0.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
16
- oxymetag-1.0.0.dist-info/entry_points.txt,sha256=-9xMAfrSPtFBEvQWRNVKROTM_3OjEik34mVEsYFwM2k,47
17
- oxymetag-1.0.0.dist-info/top_level.txt,sha256=G7EHL5Fpxne8CH3w5IDIkrsRmMzaEfOhmTngNovoYi8,15
18
- oxymetag-1.0.0.dist-info/RECORD,,