PyPI - oxymetag - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

oxymetag 1.0.0py3-none-any.whl → 1.1.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

oxymetag/__init__.py +1 -1
oxymetag/cli.py +15 -13
oxymetag/core.py +114 -15
oxymetag/data/VTML20.out +33 -0
oxymetag/data/nucleotide.out +9 -0
oxymetag/data/oxygen_model.rds +0 -0
oxymetag/data/oxymetag_pfams_n117.dmnd +0 -0
oxymetag/data/oxymetag_pfams_n117_db +0 -0
oxymetag/data/oxymetag_pfams_n117_db.dbtype +0 -0
oxymetag/data/oxymetag_pfams_n117_db.index +23972 -0
oxymetag/data/oxymetag_pfams_n117_db.lookup +23972 -0
oxymetag/data/oxymetag_pfams_n117_db.source +1 -0
oxymetag/data/oxymetag_pfams_n117_db_h +0 -0
oxymetag/data/oxymetag_pfams_n117_db_h.dbtype +0 -0
oxymetag/data/oxymetag_pfams_n117_db_h.index +23972 -0
oxymetag/scripts/predict_oxygen.R +86 -38
oxymetag/utils.py +32 -14
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/METADATA +117 -52
oxymetag-1.1.0.dist-info/RECORD +29 -0
oxymetag-1.0.0.dist-info/RECORD +0 -18
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/LICENSE +0 -0
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/WHEEL +0 -0
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/entry_points.txt +0 -0
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/top_level.txt +0 -0

oxymetag/scripts/predict_oxygen.R CHANGED Viewed

@@ -25,7 +25,7 @@ bitcut <- as.numeric(args[7])
 predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut) {
-  # Load package data files (these come from oxymetag/data/)
+  # Load package data files
   map_file <- file.path(package_data_dir, "pfam_headers_table.txt")
   lengths_file <- file.path(package_data_dir, "pfam_lengths.tsv")
   model_file <- file.path(package_data_dir, "oxygen_model.rds")
@@ -47,19 +47,45 @@ predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut
   # Load the trained model and oxygen classifications
   oxygen_model <- readRDS(model_file)
-  # Get aerobic and anaerobic pfam lists (you'll need to define these)
   oxygen_pfams <- read.csv(pfams_file, stringsAsFactors = FALSE)
   aerobic_pfams <- oxygen_pfams %>% filter(Oxygen == "aerobic")
   anaerobic_pfams <- oxygen_pfams %>% filter(Oxygen == "anaerobic")
-  # Find diamond output files (user's working directory)
-  files <- list.files(input_dir, pattern = "*_diamond.tsv", full.names = TRUE)
+  # Determine method and file pattern based on mode
+  if (mode == "modern") {
+    method <- "diamond"
+    file_pattern <- "*_diamond.tsv"
+  } else if (mode == "ancient") {
+    method <- "mmseqs2"
+    file_pattern <- "*_mmseqs.tsv"
+  } else if (mode == "custom") {
+    # For custom mode, try to detect which files exist
+    diamond_files <- list.files(input_dir, pattern = "*_diamond.tsv", full.names = TRUE)
+    mmseqs_files <- list.files(input_dir, pattern = "*_mmseqs.tsv", full.names = TRUE)
+    if (length(diamond_files) > 0 && length(mmseqs_files) == 0) {
+      method <- "diamond"
+      file_pattern <- "*_diamond.tsv"
+    } else if (length(mmseqs_files) > 0 && length(diamond_files) == 0) {
+      method <- "mmseqs2"
+      file_pattern <- "*_mmseqs.tsv"
+    } else if (length(diamond_files) > 0 && length(mmseqs_files) > 0) {
+      stop("Both DIAMOND and MMseqs2 files found. Please specify input directory with only one type.")
+    } else {
+      stop("No profiling output files found in input directory.")
+    }
+  } else {
+    stop(paste("Unknown mode:", mode))
+  }
+  files <- list.files(input_dir, pattern = file_pattern, full.names = TRUE)
   if (length(files) == 0) {
-    stop(paste("No *_diamond.tsv files found in", input_dir))
+    stop(paste("No", file_pattern, "files found in", input_dir))
   }
+  message("Processing ", length(files), " files using ", method, " output format")
   # Initialize results dataframe
   results <- data.frame(
     SampleID = character(length(files)),
@@ -75,29 +101,50 @@ predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut
     # Extract sample ID from filename
     sample_id <- basename(files[i])
-    sample_id <- gsub("_diamond.tsv$", "", sample_id)
+    sample_id <- gsub("_diamond.tsv$|_mmseqs.tsv$", "", sample_id)
     results$SampleID[i] <- sample_id
-    # Read and filter diamond output
+    # Read and filter profiling output
     if (file.size(files[i]) == 0) {
       message("Warning: Empty file ", files[i])
       next
     }
-    d <- read.table(files[i], stringsAsFactors = FALSE) %>%
-      set_names(c("qseqid",	"sseqid",	"pident",	"length",	"qstart",	"qend",
-                  "sstart",	"send",	"evalue",	"bitscore"))
-    # Apply filtering based on mode
-    if (mode == "modern") {
-      d <- d %>%
-        filter(pident >= 60, evalue < 0.001, bitscore >= 50)
-    } else if (mode == "ancient") {
-      d <- d %>%
-        filter(pident >= 45, evalue < 0.1, bitscore >= 25)
-    } else if (mode == "custom") {
-      d <- d %>%
-        filter(pident >= idcut, evalue < ecut, bitscore >= bitcut)
+    # Read data based on method
+    if (method == "diamond") {
+      d <- read.table(files[i], stringsAsFactors = FALSE) %>%
+        set_names(c("qseqid", "sseqid", "pident", "length", "qstart", "qend",
+                    "sstart", "send", "evalue", "bitscore")) %>%
+        left_join(map, by = c("sseqid" = "Header")) %>%
+        group_by(qseqid) %>%
+        slice_max(bitscore, n = 1, with_ties = FALSE) %>%
+        ungroup()
+      # Apply filtering based on mode
+      if (mode == "modern") {
+        d <- d %>% filter(pident >= 60, evalue < 0.001, bitscore >= 50)
+      } else if (mode == "custom") {
+        d <- d %>% filter(pident >= idcut, evalue < ecut, bitscore >= bitcut)
+      }
+    } else if (method == "mmseqs2") {
+      d <- read.table(files[i], stringsAsFactors = FALSE) %>%
+        set_names(c("query", "target", "pident", "length", "mismatch", "gapopen",
+                    "qstart", "qend", "tstart", "tend", "evalue", "bitscore",
+                    "qlen", "tlen", "cigar", "qaln", "taln")) %>%
+        mutate(pident = pident * 100) %>%
+        left_join(map, by = c("target" = "Header")) %>%
+        group_by(query) %>%
+        slice_max(bitscore, n = 1, with_ties = FALSE) %>%
+        ungroup() %>%
+        filter(Pfam %in% oxygen_pfams$Pfam)
+      # Apply filtering based on mode
+      if (mode == "ancient") {
+        d <- d %>% filter(pident >= 86, evalue < 0.001, bitscore >= 50)
+      } else if (mode == "custom") {
+        d <- d %>% filter(pident >= idcut, evalue < ecut, bitscore >= bitcut)
+      }
     }
     if (nrow(d) == 0) {
@@ -108,45 +155,46 @@ predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut
       next
     }
-    # Join with pfam mapping
-    d <- d %>% left_join(map, by = c("sseqid" = "Header"))
     # Count pfams
     pf_count <- as.data.frame(table(d$Pfam))
-    results$Pfams[i] <- nrow(pf_count)
     results$aerobe_pfams[i] <- sum(as.character(pf_count$Var1) %in% aerobic_pfams$Pfam)
     results$anaerobe_pfams[i] <- sum(as.character(pf_count$Var1) %in% anaerobic_pfams$Pfam)
     # Calculate gene hits and length correction
     gene.hits <- d %>%
       group_by(Pfam) %>%
-      summarise(total_count = n())
+      summarise(total_count = n(), .groups = 'drop')
     gene.hit.length.correction <- gene.hits %>%
-      left_join(., pfam_gene_length, by = "Pfam") %>%
-      mutate(RPK = total_count / (1000*Gene.length)) %>%
-      left_join(., oxygen_pfams, by = "Pfam")
+      left_join(pfam_gene_length, by = "Pfam") %>%
+      mutate(RPK = total_count / (1000 * Gene.length)) %>%
+      left_join(oxygen_pfams, by = "Pfam")
     # Sum by oxygen type
     oxygen_rpk <- gene.hit.length.correction %>%
       group_by(Oxygen) %>%
-      summarize(RPKsum = sum(RPK))
+      summarize(RPKsum = sum(RPK, na.rm = TRUE), .groups = 'drop')
-    # Calculate the ratio and add it to the dataframe
-    results$ratio[i] <- oxygen_rpk$RPKsum[1] / oxygen_rpk$RPKsum[2]
+    # Calculate ratio (aerobe/anaerobe)
+    aerobe_rpk <- oxygen_rpk$RPKsum[oxygen_rpk$Oxygen == "aerobic"]
+    anaerobe_rpk <- oxygen_rpk$RPKsum[oxygen_rpk$Oxygen == "anaerobic"]
+    if (length(anaerobe_rpk) == 0 || anaerobe_rpk == 0) {
+      results$ratio[i] <- ifelse(length(aerobe_rpk) > 0 && aerobe_rpk > 0, Inf, 0)
+    } else {
+      results$ratio[i] <- aerobe_rpk / anaerobe_rpk
+    }
-    # Processing message
     message("Processed sample ", i, "/", length(files), ": ", sample_id)
   }
   # Make predictions using the GAM model
   new_data <- data.frame(ratio = results$ratio)
-  results$Per_aerobe <- predict(oxygen_model, newdata = new_data)
+  results$Per_aerobe <- predict(oxygen_model, newdata = new_data, type = "response")
   # Constrain predictions to 0-100% and set to 100% if ratio > 35
   results <- results %>%
-    mutate(Per_aerobe = ifelse(Per_aerobe > 100, 100, Per_aerobe)) %>%
-    mutate(Per_aerobe = ifelse(Per_aerobe < 0, 0, Per_aerobe)) %>%
+    mutate(Per_aerobe = pmax(0, pmin(100, Per_aerobe))) %>%
     mutate(Per_aerobe = ifelse(ratio > 35, 100, Per_aerobe))
   # Save results
@@ -157,4 +205,4 @@ predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut
 }
 # Run the function
-predict_oxygen(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut)
+predict_oxygen(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut)

oxymetag/utils.py CHANGED Viewed

@@ -4,19 +4,49 @@ Utility functions for OxyMetaG
 """
 import subprocess
-import pkg_resources
 from pathlib import Path
 import logging
+# Use importlib.resources instead of deprecated pkg_resources
+try:
+    from importlib.resources import files
+    use_importlib = True
+except ImportError:
+    use_importlib = False
 logger = logging.getLogger('oxymetag')
 class OxyMetaGError(Exception):
     """Custom exception for OxyMetaG errors"""
     pass
+def get_package_data_path(filename: str) -> str:
+    """Get path to package data files"""
+    if use_importlib:
+        try:
+            package_files = files('oxymetag')
+            if filename.startswith('../'):
+                parts = filename.split('/')
+                for part in parts:
+                    if part == '..':
+                        package_files = package_files.parent
+                    elif part and part != '.':
+                        package_files = package_files / part
+                return str(package_files)
+            else:
+                return str(package_files / 'data' / filename)
+        except:
+            pass
+    package_dir = Path(__file__).parent
+    if filename.startswith('../'):
+        return str(package_dir / filename)
+    else:
+        return str(package_dir / 'data' / filename)
 def check_dependencies():
     """Check if required external tools are available"""
-    required_tools = ['kraken2', 'diamond', 'Rscript']
+    required_tools = ['kraken2', 'diamond', 'mmseqs', 'Rscript']
     missing_tools = []
     for tool in required_tools:
@@ -26,14 +56,6 @@ def check_dependencies():
     if missing_tools:
         raise OxyMetaGError(f"Missing required tools: {', '.join(missing_tools)}")
-def get_package_data_path(filename: str) -> str:
-    """Get path to package data files"""
-    try:
-        return pkg_resources.resource_filename('oxymetag', f'data/{filename}')
-    except:
-        package_dir = Path(__file__).parent
-        return str(package_dir / 'data' / filename)
 def run_kraken2_setup():
     """Download and set up standard Kraken2 database without fungi"""
     logger.info("Setting up Kraken2 database (bacteria, archaea, viral)...")
@@ -42,13 +64,11 @@ def run_kraken2_setup():
     db_path.mkdir(exist_ok=True)
     try:
-        # Download taxonomy
         cmd = ['kraken2-build', '--download-taxonomy', '--db', str(db_path)]
         logger.info("Downloading taxonomy...")
         subprocess.run(cmd, check=True)
         logger.info("Taxonomy downloaded successfully")
-        # Download libraries (excluding fungi)
         libraries = ['bacteria', 'archaea', 'viral']
         for lib in libraries:
             cmd = ['kraken2-build', '--download-library', lib, '--db', str(db_path)]
@@ -56,12 +76,10 @@ def run_kraken2_setup():
             subprocess.run(cmd, check=True)
             logger.info(f"{lib} library downloaded successfully")
-        # Build database
         cmd = ['kraken2-build', '--build', '--db', str(db_path), '--threads', '48']
         logger.info("Building Kraken2 database...")
         subprocess.run(cmd, check=True)
-        # Clean up temporary files to save space
         cmd = ['kraken2-build', '--clean', '--db', str(db_path)]
         logger.info("Cleaning up temporary files...")
         subprocess.run(cmd, check=True)

{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: oxymetag
-Version: 1.0.0
+Version: 1.1.0
 Summary: Oxygen metabolism profiling from metagenomic data
 Home-page: https://github.com/cliffbueno/oxymetag
 Author: Clifton P. Bueno de Mesquita
@@ -25,9 +25,9 @@ Requires-Dist: numpy >=1.20.0
 Oxygen metabolism profiling from metagenomic data using Pfam domains. OxyMetaG predicts the percent relative abundance of aerobic bacteria in metagenomic reads based on the ratio of abundances of a set of 20 Pfams. It is recommended to use an HPC cluster or server rather than a laptop to run OxyMetaG due to memory requirements, particularly for the step of extracting bacterial reads. If you already have bacterial reads, the "profile" and "predict" functions will run quickly on a laptop.
-If you are working with modern metagenomes, we recommend first quality filtering the raw reads with your method of choice and standard practices, and then extracting bacterial reads with Kraken2 and KrakenTools, which is performed with the OxyMetaG extract function.
+If you are working with modern metagenomes, we recommend first quality filtering the raw reads with your method of choice and standard practices, and then extracting bacterial reads with Kraken2 and KrakenTools, which is performed with the OxyMetaG extract function. For profiling modern metagenomes, use DIAMOND blastx with the default `-m modern` mode for the predict step.
-If you are working with ancient metagenomes, we recommend first quality filtering the raw reads with your method of choice and standard practices, and then extracting bacterial reads with a workflow optimized for ancient DNA, such as the one employed by De Sanctis et al. (2025).
+If you are working with ancient metagenomes, we recommend first quality filtering the raw reads with your method of choice and standard practices, and then extracting bacterial reads with a workflow optimized for ancient DNA, such as the read mapping approach employed by De Sanctis et al. (2025). For profiling ancient metagenomes, use MMseqs2 with `-m mmseqs2` for the profile step and `-m ancient` for the predict step. The ancient mode uses parameters optimized for ancient DNA along with 97 decoy Pfams to reduce instances of false positives.
 ## Installation
@@ -46,16 +46,22 @@ conda env create -f environment.yml
 conda activate oxymetag
 # Install OxyMetaG
-pip install oxymetag
+pip install -e .
+# Index the MMseqs2 database (one-time setup, ~5-10 minutes)
+mmseqs createindex oxymetag/data/oxymetag_pfams_n117_db tmp
 ```
+**Note:** The MMseqs2 database indexing is optional but highly recommended for faster searches.
 ### Using Pip
 First install external dependencies:
 - Kraken2
 - DIAMOND
+- MMseqs2
 - KrakenTools
-- R with mgcv and dplyr packages
+- R with mgcv, dplyr, tidyr, and rlang packages
 Then install OxyMetaG:
 ```bash
@@ -64,30 +70,38 @@ pip install oxymetag
 ## Quick Start
-### 1. Setup the standard Kraken2 database
+### Modern DNA workflow
 ```bash
+# 1. Setup Kraken2 database (one-time)
 oxymetag setup
-```
-### 2. Extract bacterial reads
-```bash
-oxymetag extract -i sample1_R1.fastq.gz sample1_R2.fastq.gz -o BactReads -t 48
-```
+# 2. Extract bacterial reads
+oxymetag extract -i sample1_R1.fastq.gz -o BactReads -t 48
-### 3. Profile samples
-```bash
-oxymetag profile -i BactReads -o diamond_output -t 8
+# 3. Profile samples with DIAMOND
+oxymetag profile -i BactReads -o diamond_output -m diamond -t 8
+# 4. Predict aerobe levels
+oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m modern
 ```
-### 4. Predict aerobe levels
+### Ancient DNA workflow
 ```bash
-# For modern DNA
-oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m modern
+# 1. Extract bacterial reads (use ancient DNA-optimized workflow if available)
+# If using oxymetag extract, same as modern workflow
+# 2. Profile samples with MMseqs2
+oxymetag profile -i BactReads -o mmseqs_output -m mmseqs2 -t 8
-# For ancient DNA
-oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m ancient
+# 3. Predict aerobe levels with ancient mode
+oxymetag predict -i mmseqs_output -o per_aerobe_predictions.tsv -m ancient
+```
+### Custom parameters
-# Custom cutoffs
+```bash
 oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m custom --idcut 50 --bitcut 30 --ecut 0.01
 ```
@@ -98,11 +112,11 @@ oxymetag predict -i diamond_output -o per_aerobe_predictions.tsv -m custom --idc
 **What it does:** Downloads and builds the standard Kraken2 database containing bacterial, archaeal, and viral genomes. This database is used by the `extract` command to identify bacterial sequences from metagenomic samples.
-**Time:** 2-4 hours depending on internet speed and system performance.
+**Time:** Depends on internet speed and system performance, but will likely take several hours (4-20 hours typical).
-**Output:** Creates a `kraken2_db/` directory with the standard database.
+**Output:** Creates a `kraken2_db/` directory with the standard database (~90 GB).
-Make sure you run oxymetag setup from the directory where you want the database to live, or plan to always specify the --kraken-db path when running extract. The database is quite large (~50-100 GB), so choose a location with sufficient storage.
+Make sure you run oxymetag setup from the directory where you want the database to live, or plan to always specify the --kraken-db path when running extract. The database is quite large, so choose a location with sufficient storage.
 ---
@@ -114,7 +128,7 @@ Make sure you run oxymetag setup from the directory where you want the database
 2. Uses KrakenTools to extract only the reads classified as bacterial
 3. Outputs cleaned bacterial-only FASTQ files for downstream analysis
-**Input:** Quality filtered metagenomic read FASTQ files (paired-end or merged)\
+**Input:** Quality filtered metagenomic read FASTQ files (paired-end or merged)
 **Output:** Bacterial-only FASTQ files in `BactReads/` directory
 **Arguments:**
@@ -126,22 +140,26 @@ Make sure you run oxymetag setup from the directory where you want the database
 ---
 ### oxymetag profile
-**Function:** Profiles bacterial reads against oxygen metabolism protein domains.
+**Function:** Profiles bacterial reads against oxygen metabolism protein domains using DIAMOND or MMseqs2.
 **What it does:**
 1. Takes bacterial-only reads from the `extract` step
-2. Uses DIAMOND blastx to search against a curated database of 20 Pfam domains related to oxygen metabolism
+2. Uses DIAMOND blastx (for modern DNA) or MMseqs2 (for ancient DNA) to search against a curated database of Pfam domains related to oxygen metabolism
+   - DIAMOND mode: 20 target Pfams
+   - MMseqs2 mode: 20 target Pfams + 97 decoy Pfams (117 total) to reduce false positives
 3. Identifies protein-coding sequences and their functional annotations
 4. Creates detailed hit tables for each sample
-**Input:** Bacterial FASTQ files (uses R1 or merged reads only)\
-**Output:** DIAMOND alignment files (TSV format) in `diamond_output/` directory
+**Input:** Bacterial FASTQ files (uses R1 or merged reads only)
+**Output:** Alignment files (TSV format) in `diamond_output/` or `mmseqs_output/` directory
 **Arguments:**
 - `-i, --input`: Input directory with bacterial reads (default: BactReads)
-- `-o, --output`: Output directory (default: diamond_output)
+- `-o, --output`: Output directory (default: diamond_output or mmseqs_output depending on method)
 - `-t, --threads`: Number of threads (default: 4)
-- `--diamond-db`: Custom DIAMOND database path (optional)
+- `-m, --method`: Profiling method - 'diamond' or 'mmseqs2' (default: diamond)
+- `--diamond-db`: Custom DIAMOND database path (optional, for diamond method)
+- `--mmseqs-db`: Custom MMseqs2 database path (optional, for mmseqs2 method)
 ---
@@ -149,17 +167,24 @@ Make sure you run oxymetag setup from the directory where you want the database
 **Function:** Predicts aerobe abundance from protein domain profiles using machine learning.
 **What it does:**
-1. Processes DIAMOND output files with appropriate quality filters
-2. Normalizes protein domain counts by gene length (reads per kilobase)
-3. Calculates aerobic/anaerobic domain ratios for each sample
-4. Applies a trained GAM (Generalized Additive Model) to predict percentage of aerobes
-5. Outputs a table with the sampleID, # Pfams detected, and predicted % aerobic bacteria
-**Input:** DIAMOND output directory from `profile` step\
+1. Processes DIAMOND or MMseqs2 output files with appropriate quality filters
+2. Selects the top hit per read based on bitscore
+3. For MMseqs2 (ancient mode): filters out decoy Pfams after selecting top hits
+4. Normalizes protein domain counts by gene length (reads per kilobase)
+5. Calculates aerobic/anaerobic domain ratios for each sample
+6. Applies a trained GAM (Generalized Additive Model) to predict percentage of aerobes
+7. Outputs a table with the sampleID, # Pfams detected, and predicted % aerobic bacteria
+**Input:** DIAMOND or MMseqs2 output directory from `profile` step
 **Output:** Tab-separated file with aerobe predictions for each sample
+**Mode determines input type:**
+- `-m modern`: Uses DIAMOND output (default input: diamond_output/)
+- `-m ancient`: Uses MMseqs2 output (default input: mmseqs_output/)
+- `-m custom`: Auto-detects DIAMOND or MMseqs2 files in input directory
 **Arguments:**
-- `-i, --input`: Directory with DIAMOND output (default: diamond_output)
+- `-i, --input`: Directory with profiling output (default: diamond_output for modern, mmseqs_output for ancient)
 - `-o, --output`: Output file (default: per_aerobe_predictions.tsv)
 - `-t, --threads`: Number of threads (default: 4)
 - `-m, --mode`: Filtering mode - 'modern', 'ancient', or 'custom' (default: modern)
@@ -169,32 +194,56 @@ Make sure you run oxymetag setup from the directory where you want the database
 ## Filtering Modes
-OxyMetaG includes three pre-configured filtering modes optimized for different types of DNA:
+OxyMetaG includes three filtering modes: two pre-configured modes optimized for modern and ancient DNA, and a custom mode with user-specified cutoffs. Whichever mode you use, it is recommended to also try several different cutoffs (using `-m custom`) to check how sensitive the results are to them.
 ### Modern DNA (default)
-**Best for:** Modern environmental metagenomes
+**Best for:** Modern environmental metagenomes
+**Method:** DIAMOND blastx
+**Filters:**
 - Identity ≥ 60%
 - Bitscore ≥ 50
 - E-value < 0.001
-### Ancient DNA
-**Best for:** Archaeological samples, paleogenomic data, degraded environmental DNA
-- Identity ≥ 45% (accounts for DNA damage)
-- Bitscore ≥ 25 (accommodates shorter fragments)
-- E-value ≤ 0.1 (more permissive for low-quality data)
+**Usage:**
+```bash
+oxymetag profile -m diamond
+oxymetag predict -m modern
+```
+### Ancient DNA
+**Best for:** Archaeological samples, paleogenomic data, degraded environmental DNA
+**Method:** MMseqs2 with decoy Pfams
+**Filters:**
+- Identity ≥ 86%
+- Bitscore ≥ 50
+- E-value < 0.001
+**Note:** The ancient mode uses stricter identity cutoffs but employs 97 decoy Pfams to reduce false positives from damaged DNA. Reads matching decoys better than target Pfams are filtered out.
+**Usage:**
+```bash
+oxymetag profile -m mmseqs2
+oxymetag predict -m ancient
+```
 ### Custom
 **Best for:** Specialized applications or when you want to optimize parameters
 - Specify your own `--idcut`, `--bitcut`, and `--ecut` values
+- Auto-detects whether input is from DIAMOND or MMseqs2
 - Useful for method development or unusual sample types
+**Usage:**
+```bash
+oxymetag predict -m custom --idcut 50 --bitcut 30 --ecut 0.01
+```
 ## Output
 The final output (`per_aerobe_predictions.tsv`) contains:
 - `SampleID`: Sample identifier extracted from filenames
 - `ratio`: Aerobic/anaerobic domain ratio
-- `aerobe_pfams`: Number of aerobic Pfam domains detected
-- `anaerobe_pfams`: Number of anaerobic Pfam domains detected
+- `aerobe_pfams`: Number of aerobic Pfam domains detected (from 20 target Pfams)
+- `anaerobe_pfams`: Number of anaerobic Pfam domains detected (from 20 target Pfams)
 - `Per_aerobe`: **Predicted percentage of aerobic bacteria (0-100%)**
 ## Biological Interpretation
@@ -212,17 +261,33 @@ The `Per_aerobe` value represents the predicted percentage of aerobic bacteria i
 If you use OxyMetaG in your research, please cite:
 ```
-Bueno de Mesquita, C.P., Stallard-Olivera, E., Fierer, N. (2025). Bueno de Mesquita, C.P. et al. (2025). Predicting the proportion of aerobic and anaerobic bacteria from metagenomic reads with OxyMetaG.
+Bueno de Mesquita, C.P., Stallard-Olivera, E., Fierer, N. (2025).
+Quantifying the oxygen preferences of bacterial communities using a
+metagenome-based approach.
+```
+### Additional citations
+If you use the **extract** function, also cite Kraken2 and KrakenTools:
+```
+Lu, J., Rincon, N., Wood, D.E. et al. Metagenome analysis using the Kraken
+software suite. Nat Protoc 17, 2815–2839 (2022).
+https://doi.org/10.1038/s41596-022-00738-y
 ```
-If you use the extract function, also cite Kraken2 and KrakenTools:
+If you use the **profile** function with DIAMOND (`-m diamond`), also cite:
 ```
-Lu, J., Rincon, N., Wood, D.E. et al. Metagenome analysis using the Kraken software suite. Nat Protoc 17, 2815–2839 (2022). https://doi.org/10.1038/s41596-022-00738-y
+Buchfink, B., Xie, C. & Huson, D. Fast and sensitive protein alignment using
+DIAMOND. Nat Methods 12, 59–60 (2015).
+https://doi.org/10.1038/nmeth.3176
 ```
-If you use the profile function, also cite DIAMOND
+If you use the **profile** function with MMseqs2 (`-m mmseqs2`), also cite:
 ```
-Buchfink, B., Xie, C. & Huson, D. Fast and sensitive protein alignment using DIAMOND. Nat Methods 12, 59–60 (2015). https://doi.org/10.1038/nmeth.3176
+Steinegger, M., Söding, J. MMseqs2 enables sensitive protein sequence
+searching for the analysis of massive data sets. Nat Biotechnol 35,
+1026–1028 (2017).
+https://doi.org/10.1038/nbt.3988
 ```
 ## License

oxymetag-1.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,29 @@
+oxymetag/__init__.py,sha256=KaqG7NoPm89jxncdp4Ym3GNAI9h0MUDI-_4ugy6E7jQ,441
+oxymetag/cli.py,sha256=GIXCxpBUQIRWgcSyQNudAWyuVWAAnxK1DRGXG8jpVzQ,4877
+oxymetag/core.py,sha256=zLLVT6XsNb7qFtDlobIh44xgnOzXauYBCL8nNLA4XSk,12496
+oxymetag/utils.py,sha256=fxzKYNUpsuBnYZxlh8QMIEg5wktV6BatoLasfawTeW8,3178
+oxymetag/data/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
+oxymetag/data/Oxygen_pfams.csv,sha256=f3_CFPy235BxbX-2Ami8dJpTLQDImM8e_m87QcJvYKo,673
+oxymetag/data/VTML20.out,sha256=V7F_3HDTtVccoSeuryUJObU5bY3dbfAYHx1TLFixqA8,1930
+oxymetag/data/nucleotide.out,sha256=0IBDiBP7-Q-DJg_Ky3y7Rbv3Dw4s01Tu7S5MtxVJQo4,396
+oxymetag/data/oxygen_model.rds,sha256=UP_3sIJJHEWCfiSTZPJrsVti91-mEUWOn7PgfNcWEDo,8240
+oxymetag/data/oxymetag_pfams.dmnd,sha256=sSrkriGi-x4Ybf-pQtRRCFh1Wjm4rbQeX9FL9buDbAE,1472431
+oxymetag/data/oxymetag_pfams_n117.dmnd,sha256=yIh1qK3aD23VDmG_jsGnx9I7ahh7vV5dSUnBb1y9Eek,15572193
+oxymetag/data/oxymetag_pfams_n117_db,sha256=2C0jpmNzEd-_F6J2QaBW8SeNvivJ4L21nqYU9WLQNsU,12197517
+oxymetag/data/oxymetag_pfams_n117_db.dbtype,sha256=3z9hmASpL9tAVxktxD3XSOp3itxSvEmM6AUkwBS4ERk,4
+oxymetag/data/oxymetag_pfams_n117_db.index,sha256=bzgoZhWEsmFx6IU7aO5ArTMDBb7_1yepVSEP0oRt5Ms,423068
+oxymetag/data/oxymetag_pfams_n117_db.lookup,sha256=XmvjvMlDr9CPklK-iZYv4LD9O9mVgMHR0s0zaVlowM4,919596
+oxymetag/data/oxymetag_pfams_n117_db.source,sha256=qeG0mSUfLERK--Fq8hqNrStPEOgyEhXzIXZixYDADYs,27
+oxymetag/data/oxymetag_pfams_n117_db_h,sha256=xDBPxG39tF1Ez70jd3XxqdyPxlmSr3B0sf9T6eFJ0Xo,1542665
+oxymetag/data/oxymetag_pfams_n117_db_h.dbtype,sha256=QvSuuBwe-B93Hz3oq8qdz2aQHFdVMOdnLksRRkdK5lA,4
+oxymetag/data/oxymetag_pfams_n117_db_h.index,sha256=JQc2q-oSWwx-sv32muf51tZVgmDl0DgM6m1eJr5Z-V4,379408
+oxymetag/data/pfam_headers_table.txt,sha256=wMg4WvlST6Zi3EzVFudjFHyREqNk8kHDI9Q6th7FdFY,255832
+oxymetag/data/pfam_lengths.tsv,sha256=--0bGxDN2v_WiBo0rKFJMPeOPsOrbaNhyjPAoIF9E5A,366
+oxymetag/scripts/predict_oxygen.R,sha256=NUv23y0l7BQxFUoxHqttlKveh6LYmO2qgP8hciGdpMs,7657
+tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+oxymetag-1.1.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+oxymetag-1.1.0.dist-info/METADATA,sha256=LYHfJ-otW3Tckz7ZbTVxTpIvb-fSJi3S6yeiLK6ifHA,11745
+oxymetag-1.1.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+oxymetag-1.1.0.dist-info/entry_points.txt,sha256=-9xMAfrSPtFBEvQWRNVKROTM_3OjEik34mVEsYFwM2k,47
+oxymetag-1.1.0.dist-info/top_level.txt,sha256=G7EHL5Fpxne8CH3w5IDIkrsRmMzaEfOhmTngNovoYi8,15
+oxymetag-1.1.0.dist-info/RECORD,,

oxymetag-1.0.0.dist-info/RECORD DELETED Viewed

@@ -1,18 +0,0 @@
-oxymetag/__init__.py,sha256=GM_D7cuLGpBCmwCp3PkT42Q3mnryV3BDvMWjMBqt4L8,441
-oxymetag/cli.py,sha256=ITt3UNUFrdeIskjSaK-705oOcAPUlzq5acqlWCXVDqM,4478
-oxymetag/core.py,sha256=AkuXmhhI2It-QFdfykJcZ-e_6i124yS9RMM-BAH6RlU,8565
-oxymetag/utils.py,sha256=x-WVnR-yNawY13alF_8J9Ihkjpaeg15IlEpZXyn2JSU,2604
-oxymetag/data/.DS_Store,sha256=1lFlJ5EFymdzGAUAaI30vcaaLHt3F1LwpG7xILf9jsM,6148
-oxymetag/data/Oxygen_pfams.csv,sha256=f3_CFPy235BxbX-2Ami8dJpTLQDImM8e_m87QcJvYKo,673
-oxymetag/data/oxygen_model.rds,sha256=8BMWnnIKCALapQnJKLpMnqTymFNAW46E-cQJGU2tJu0,8221
-oxymetag/data/oxymetag_pfams.dmnd,sha256=sSrkriGi-x4Ybf-pQtRRCFh1Wjm4rbQeX9FL9buDbAE,1472431
-oxymetag/data/pfam_headers_table.txt,sha256=wMg4WvlST6Zi3EzVFudjFHyREqNk8kHDI9Q6th7FdFY,255832
-oxymetag/data/pfam_lengths.tsv,sha256=--0bGxDN2v_WiBo0rKFJMPeOPsOrbaNhyjPAoIF9E5A,366
-oxymetag/scripts/predict_oxygen.R,sha256=72Eum7XFtJ-Be5vdIqY8FFIuPnEWTGYDMaZWv5OTPtQ,5549
-tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-oxymetag-1.0.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-oxymetag-1.0.0.dist-info/METADATA,sha256=eo8h97bwY2mGTgkTNpNjwomUpq6c_k8q4EEsyMiZDIk,8968
-oxymetag-1.0.0.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-oxymetag-1.0.0.dist-info/entry_points.txt,sha256=-9xMAfrSPtFBEvQWRNVKROTM_3OjEik34mVEsYFwM2k,47
-oxymetag-1.0.0.dist-info/top_level.txt,sha256=G7EHL5Fpxne8CH3w5IDIkrsRmMzaEfOhmTngNovoYi8,15
-oxymetag-1.0.0.dist-info/RECORD,,

{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

oxymetag 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

oxymetag 1.0.0py3-none-any.whl → 1.1.0py3-none-any.whl