PyPI - oxymetag - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

oxymetag 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

oxymetag/__init__.py +1 -1
oxymetag/cli.py +15 -13
oxymetag/core.py +114 -15
oxymetag/data/VTML20.out +33 -0
oxymetag/data/nucleotide.out +9 -0
oxymetag/data/oxygen_model.rds +0 -0
oxymetag/data/oxymetag_pfams_n117.dmnd +0 -0
oxymetag/data/oxymetag_pfams_n117_db +0 -0
oxymetag/data/oxymetag_pfams_n117_db.dbtype +0 -0
oxymetag/data/oxymetag_pfams_n117_db.index +23972 -0
oxymetag/data/oxymetag_pfams_n117_db.lookup +23972 -0
oxymetag/data/oxymetag_pfams_n117_db.source +1 -0
oxymetag/data/oxymetag_pfams_n117_db_h +0 -0
oxymetag/data/oxymetag_pfams_n117_db_h.dbtype +0 -0
oxymetag/data/oxymetag_pfams_n117_db_h.index +23972 -0
oxymetag/scripts/predict_oxygen.R +86 -38
oxymetag/utils.py +32 -14
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/METADATA +117 -52
oxymetag-1.1.0.dist-info/RECORD +29 -0
oxymetag-1.0.0.dist-info/RECORD +0 -18
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/LICENSE +0 -0
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/WHEEL +0 -0
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/entry_points.txt +0 -0
{oxymetag-1.0.0.dist-info → oxymetag-1.1.0.dist-info}/top_level.txt +0 -0

oxymetag/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@
 OxyMetaG: Oxygen metabolism profiling from metagenomic data
 """
-__version__ = "1.0.0"
+__version__ = "1.1.0"
 __author__ = "Clifton P. Bueno de Mesquita"
 __email__ = "cliff.buenodemesquita@colorado.edu"

oxymetag/cli.py CHANGED Viewed

@@ -11,7 +11,6 @@ from . import __version__
 from .core import extract_reads, profile_samples, predict_aerobes
 from .utils import check_dependencies, run_kraken2_setup, OxyMetaGError
-# Set up logging
 logging.basicConfig(
     level=logging.INFO,
     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
@@ -29,10 +28,8 @@ def main():
     subparsers = parser.add_subparsers(dest='command', help='Available commands')
-    # Setup command
     setup_parser = subparsers.add_parser('setup', help='Setup Kraken2 database')
-    # Extract command
     extract_parser = subparsers.add_parser('extract', help='Extract bacterial reads')
     extract_parser.add_argument('-i', '--input', nargs='+', required=True,
                                help='Input fastq.gz files')
@@ -43,27 +40,31 @@ def main():
     extract_parser.add_argument('--kraken-db', default='kraken2_db',
                                help='Kraken2 database path (default: kraken2_db)')
-    # Profile command
-    profile_parser = subparsers.add_parser('profile', help='Profile samples with DIAMOND')
+    profile_parser = subparsers.add_parser('profile', help='Profile samples with DIAMOND or MMseqs2')
     profile_parser.add_argument('-i', '--input', default='BactReads',
                                help='Input directory (default: BactReads)')
-    profile_parser.add_argument('-o', '--output', default='diamond_output',
-                               help='Output directory (default: diamond_output)')
+    profile_parser.add_argument('-o', '--output', default=None,
+                               help='Output directory (default: diamond_output or mmseqs_output)')
     profile_parser.add_argument('-t', '--threads', type=int, default=4,
                                help='Number of threads (default: 4)')
+    profile_parser.add_argument('-m', '--method', choices=['diamond', 'mmseqs2'],
+                               default='diamond',
+                               help='Profiling method (default: diamond)')
     profile_parser.add_argument('--diamond-db',
                                help='DIAMOND database path (default: package database)')
+    profile_parser.add_argument('--mmseqs-db',
+                               help='MMseqs2 database path (default: package database)')
-    # Predict command
     predict_parser = subparsers.add_parser('predict', help='Predict aerobe levels')
-    predict_parser.add_argument('-i', '--input', default='diamond_output',
-                               help='Input directory (default: diamond_output)')
+    predict_parser.add_argument('-i', '--input', default=None,
+                               help='Input directory (default: diamond_output for modern, mmseqs_output for ancient)')
     predict_parser.add_argument('-o', '--output', default='per_aerobe_predictions.tsv',
                                help='Output file (default: per_aerobe_predictions.tsv)')
     predict_parser.add_argument('-t', '--threads', type=int, default=4,
                                help='Number of threads (default: 4)')
     predict_parser.add_argument('-m', '--mode', choices=['modern', 'ancient', 'custom'],
-                               default='modern', help='Filtering mode (default: modern)')
+                               default='modern',
+                               help='Filtering mode: modern=DIAMOND, ancient=MMseqs2 (default: modern)')
     predict_parser.add_argument('--idcut', type=float,
                                help='Custom identity cutoff (for custom mode)')
     predict_parser.add_argument('--bitcut', type=float,
@@ -87,11 +88,12 @@ def main():
             extract_reads(args.input, args.output, args.threads, args.kraken_db)
         elif args.command == 'profile':
-            profile_samples(args.input, args.output, args.threads, args.diamond_db)
+            profile_samples(args.input, args.output, args.threads, args.method,
+                          args.diamond_db, args.mmseqs_db)
         elif args.command == 'predict':
             predict_aerobes(args.input, args.output, args.mode,
-                           args.idcut, args.bitcut, args.ecut, args.threads)
+                          args.idcut, args.bitcut, args.ecut, args.threads)
         logger.info("Command completed successfully")

oxymetag/core.py CHANGED Viewed

@@ -108,20 +108,18 @@ def extract_reads(input_files: List[str], output_dir: str = "BactReads",
             continue
-def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_output",
-                   threads: int = 4, diamond_db: str = None):
+def profile_samples(input_dir: str = "BactReads", output_dir: str = None,
+                   threads: int = 4, method: str = "diamond",
+                   diamond_db: str = None, mmseqs_db: str = None):
     """
-    Profile samples using DIAMOND blastx against Pfam database
+    Profile samples using DIAMOND or MMseqs2 against Pfam database
     """
     from .utils import get_package_data_path
-    logger.info(f"Starting sample profiling with {threads} threads")
+    logger.info(f"Starting sample profiling with {method} using {threads} threads")
-    if diamond_db is None:
-        diamond_db = get_package_data_path("oxymetag_pfams.dmnd")
-    if not Path(diamond_db).exists():
-        raise OxyMetaGError(f"DIAMOND database not found: {diamond_db}")
+    if output_dir is None:
+        output_dir = 'diamond_output' if method == 'diamond' else 'mmseqs_output'
     output_path = Path(output_dir)
     output_path.mkdir(exist_ok=True)
@@ -132,6 +130,8 @@ def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_out
     patterns = [
         '*_R1_bacterial.fastq.gz',
         '*_1_bacterial.fastq.gz',
+        '*_bacterial_R1.fastq.gz',
+        '*_bacterial_1.fastq.gz',
         '*_bacterial.fastq.gz'
     ]
@@ -147,9 +147,32 @@ def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_out
         logger.error(f"FASTQ files in {input_dir}: {[f.name for f in all_files[:5]]}")
         raise OxyMetaGError(f"No bacterial read files found in {input_dir}")
+    if method == 'diamond':
+        _profile_with_diamond(input_files, output_path, threads, diamond_db)
+    elif method == 'mmseqs2':
+        _profile_with_mmseqs(input_files, output_path, threads, mmseqs_db)
+    else:
+        raise OxyMetaGError(f"Unknown method: {method}")
+def _profile_with_diamond(input_files: List[Path], output_path: Path,
+                         threads: int, diamond_db: str = None):
+    """Profile samples using DIAMOND blastx"""
+    from .utils import get_package_data_path
+    if diamond_db is None:
+        diamond_db = get_package_data_path("oxymetag_pfams.dmnd")
+    if not Path(diamond_db).exists():
+        raise OxyMetaGError(f"DIAMOND database not found: {diamond_db}")
     for input_file in input_files:
         base_name = input_file.stem.replace('.fastq', '').replace('.gz', '')
-        base_name = base_name.replace('_R1_bacterial', '').replace('_1_bacterial', '').replace('_bacterial', '')
+        base_name = (base_name.replace('_R1_bacterial', '')
+                             .replace('_1_bacterial', '')
+                             .replace('_bacterial_R1', '')
+                             .replace('_bacterial_1', '')
+                             .replace('_bacterial', ''))
         logger.info(f"Processing {input_file}")
@@ -172,20 +195,95 @@ def profile_samples(input_dir: str = "BactReads", output_dir: str = "diamond_out
             continue
-def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_aerobe_predictions.tsv",
+def _profile_with_mmseqs(input_files: List[Path], output_path: Path,
+                        threads: int, mmseqs_db: str = None):
+    """Profile samples using MMseqs2 easy-search"""
+    from .utils import get_package_data_path
+    if mmseqs_db is None:
+        mmseqs_db = get_package_data_path("oxymetag_pfams_n117_db")
+    if not Path(mmseqs_db).exists():
+        raise OxyMetaGError(f"MMseqs2 database not found: {mmseqs_db}")
+    data_dir = Path(get_package_data_path(""))
+    vtml_matrix = data_dir / "VTML20.out"
+    nucl_matrix = data_dir / "nucleotide.out"
+    if not vtml_matrix.exists():
+        raise OxyMetaGError(f"VTML20.out matrix not found: {vtml_matrix}")
+    if not nucl_matrix.exists():
+        raise OxyMetaGError(f"nucleotide.out matrix not found: {nucl_matrix}")
+    for input_file in input_files:
+        base_name = input_file.stem.replace('.fastq', '').replace('.gz', '')
+        base_name = (base_name.replace('_R1_bacterial', '')
+                             .replace('_1_bacterial', '')
+                             .replace('_bacterial_R1', '')
+                             .replace('_bacterial_1', '')
+                             .replace('_bacterial', ''))
+        logger.info(f"Processing {input_file} with MMseqs2")
+        output_file = output_path / f"{base_name}_mmseqs.tsv"
+        tmp_dir = output_path / f"{base_name}_tmp"
+        tmp_dir.mkdir(exist_ok=True)
+        cmd = [
+            'mmseqs', 'easy-search',
+            str(input_file),
+            str(mmseqs_db),
+            str(output_file),
+            str(tmp_dir),
+            '--min-length', '12',
+            '-e', '10.0',
+            '--min-seq-id', '0.86',
+            '-c', '0.65',
+            '--cov-mode', '2',
+            '--format-mode', '0',
+            '--format-output', 'query,target,fident,alnlen,mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,tlen,cigar,qaln,taln',
+            '--comp-bias-corr', '0',
+            '--mask', '0',
+            '--exact-kmer-matching', '1',
+            '--sub-mat', f'aa:{vtml_matrix},nucl:{nucl_matrix}',
+            '--seed-sub-mat', f'aa:{vtml_matrix},nucl:{nucl_matrix}',
+            '-s', '2',
+            '-k', '6',
+            '--spaced-kmer-pattern', '11011101',
+            '--max-seqs', '10000',
+            '--max-rejected', '10',
+            '--threads', str(threads),
+            '--remove-tmp-files', '0',
+            '--use-all-table-starts', '1'
+        ]
+        try:
+            subprocess.run(cmd, check=True)
+            logger.info(f"MMseqs2 profiling completed for {input_file}")
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Failed to process {input_file}: {e}")
+            continue
+def predict_aerobes(input_dir: str = None, output_file: str = "per_aerobe_predictions.tsv",
                    mode: str = "modern", id_cut: float = None, bit_cut: float = None,
                    e_cut: float = None, threads: int = 4):
     """
-    Predict aerobe levels from DIAMOND results
+    Predict aerobe levels from DIAMOND or MMseqs2 results
+    Mode determines method: modern=DIAMOND, ancient=MMseqs2
     """
     from .utils import get_package_data_path
     logger.info(f"Starting aerobe level prediction in {mode} mode")
+    if input_dir is None:
+        input_dir = 'diamond_output' if mode == 'modern' else 'mmseqs_output'
     if mode == "modern":
         identity_cutoff, bitscore_cutoff, evalue_cutoff = 60.0, 50.0, 0.001
     elif mode == "ancient":
-        identity_cutoff, bitscore_cutoff, evalue_cutoff = 45.0, 25.0, 0.1
+        identity_cutoff, bitscore_cutoff, evalue_cutoff = 86.0, 50.0, 0.001
     elif mode == "custom":
         if any(x is None for x in [id_cut, bit_cut, e_cut]):
             raise OxyMetaGError("Custom mode requires id_cut, bit_cut, and e_cut parameters")
@@ -194,7 +292,8 @@ def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_a
         raise OxyMetaGError("Mode must be 'modern', 'ancient', or 'custom'")
     package_data_dir = str(Path(get_package_data_path("")).parent / "data")
-    r_script_path = get_package_data_path("../scripts/predict_oxygen.R")
+    package_base = Path(__file__).parent
+    r_script_path = str(package_base / "scripts" / "predict_oxygen.R")
     if not Path(input_dir).exists():
         raise OxyMetaGError(f"Input directory not found: {input_dir}")
@@ -223,4 +322,4 @@ def predict_aerobes(input_dir: str = "diamond_output", output_file: str = "per_a
         logger.info(f"Results saved to {output_file}")
         return results_df
     else:
-        raise OxyMetaGError(f"Output file was not created: {output_file}")
+        raise OxyMetaGError(f"Output file was not created: {output_file}")

oxymetag/data/VTML20.out ADDED Viewed

@@ -0,0 +1,33 @@
+# VTML_20
+#
+# This matrix was produced from: vtml_20qij.mat using vtml_P.mat background frequencies
+#
+# VTML_20 substitution matrix, Units = bits/2.0
+# Expected score = -2.916179 bits; Entropy = 2.912514 bits
+# Target fraction identity = 0.8307
+# Lowest Score = -16, Highest Score= 12
+#
+# Background (precomputed optional): 0.0721 0.0135 0.0522 0.0728 0.038 0.0804 0.0256 0.0699 0.0703 0.107 0.0232 0.0503 0.0393 0.0339 0.0523 0.0698 0.05 0.0683 0.0143 0.0384 0.00001
+# Lambda     (precomputed optional): 0.34657
+	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y	X
+A	7	-3	-6	-5	-8	-4	-7	-7	-6	-7	-5	-6	-4	-5	-7	-2	-3	-3	-9	-8	0
+C	-3	12	-14	-14	-13	-7	-6	-5	-13	-12	-4	-8	-9	-13	-7	-3	-5	-3	-15	-4	0
+D	-6	-14	8	-1	-16	-6	-4	-12	-5	-15	-9	-1	-6	-4	-12	-5	-6	-9	-10	-14	0
+E	-5	-14	-1	7	-14	-6	-6	-10	-2	-8	-8	-5	-6	-1	-10	-5	-6	-7	-16	-7	0
+F	-8	-13	-16	-14	9	-11	-5	-5	-14	-3	-3	-10	-9	-8	-10	-7	-8	-6	-3	0	0
+G	-4	-7	-6	-6	-11	7	-7	-15	-7	-11	-10	-5	-8	-8	-7	-4	-8	-10	-9	-10	0
+H	-7	-6	-4	-6	-5	-7	10	-9	-5	-7	-12	-3	-6	-2	-3	-5	-5	-8	-6	-1	0
+I	-7	-5	-12	-10	-5	-15	-9	7	-9	-2	-2	-9	-10	-9	-8	-9	-5	1	-6	-8	0
+K	-6	-13	-5	-2	-14	-7	-5	-9	7	-8	-5	-3	-6	-2	0	-5	-4	-8	-9	-8	0
+L	-7	-12	-15	-8	-3	-11	-7	-2	-8	6	0	-9	-7	-6	-8	-8	-7	-3	-6	-6	0
+M	-5	-4	-9	-8	-3	-10	-12	-2	-5	0	10	-7	-10	-4	-6	-8	-4	-3	-13	-11	0
+N	-6	-8	-1	-5	-10	-5	-3	-9	-3	-9	-7	8	-8	-4	-5	-2	-4	-9	-10	-6	0
+P	-4	-9	-6	-6	-9	-8	-6	-10	-6	-7	-10	-8	9	-5	-7	-4	-6	-7	-9	-15	0
+Q	-5	-13	-4	-1	-8	-8	-2	-9	-2	-6	-4	-4	-5	9	-2	-4	-5	-7	-15	-12	0
+R	-7	-7	-12	-10	-10	-7	-3	-8	0	-8	-6	-5	-7	-2	8	-6	-6	-9	-8	-7	0
+S	-2	-3	-5	-5	-7	-4	-5	-9	-5	-8	-8	-2	-4	-4	-6	7	-1	-8	-8	-6	0
+T	-3	-5	-6	-6	-8	-8	-5	-5	-4	-7	-4	-4	-6	-5	-6	-1	8	-4	-15	-8	0
+V	-3	-3	-9	-7	-6	-10	-8	1	-8	-3	-3	-9	-7	-7	-9	-8	-4	7	-13	-8	0
+W	-9	-15	-10	-16	-3	-9	-6	-6	-9	-6	-13	-10	-9	-15	-8	-8	-15	-13	12	-2	0
+Y	-8	-4	-14	-7	0	-10	-1	-8	-8	-6	-11	-6	-15	-12	-7	-6	-8	-8	-2	9	0
+X	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1

oxymetag/data/nucleotide.out ADDED Viewed

@@ -0,0 +1,9 @@
+# NUCL in 1/2 Bit
+# Background (precomputed optional): 0.2499975 0.2499975 0.2499975 0.2499975 0.00001
+# Lambda     (precomputed optional): 0.6337314
+   A       C       T       G       X
+A  2.0000 -3.0000 -3.0000 -3.0000 -3.0000
+C -3.0000  2.0000 -3.0000 -3.0000 -3.0000
+T -3.0000 -3.0000  2.0000 -3.0000 -3.0000
+G -3.0000 -3.0000 -3.0000  2.0000 -3.0000
+X -3.0000 -3.0000 -3.0000 -3.0000 -3.0000

oxymetag/data/oxygen_model.rds CHANGED Viewed

Binary file

oxymetag/data/oxymetag_pfams_n117.dmnd ADDED Viewed

Binary file

oxymetag/data/oxymetag_pfams_n117_db ADDED Viewed

Binary file

oxymetag/data/oxymetag_pfams_n117_db.dbtype ADDED Viewed

Binary file

oxymetag 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

oxymetag 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl