oxymetag 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ "Pfam" "Gene.length"
2
+ "PF00042" 473.91
3
+ "PF00115" 1528.11
4
+ "PF00116" 750.9
5
+ "PF00296" 1098.24
6
+ "PF00510" 773.67
7
+ "PF00916" 2117.37
8
+ "PF01152" 419.37
9
+ "PF01521" 423.06
10
+ "PF01871" 700.41
11
+ "PF02579" 1028.22
12
+ "PF02906" 1566.93
13
+ "PF03063" 1722.42
14
+ "PF05425" 1056.6
15
+ "PF05721" 894
16
+ "PF08530" 2247.99
17
+ "PF10371" 3937.35
18
+ "PF13597" 2691.6
19
+ "PF16870" 2981.01
20
+ "PF17773" 950.7
21
+ "PF17910" 2188.41
@@ -0,0 +1,160 @@
1
+ #!/usr/bin/env Rscript
2
+
3
+ # Load required libraries
4
+ suppressPackageStartupMessages({
5
+ library(dplyr)
6
+ library(tidyr)
7
+ library(rlang)
8
+ library(mgcv)
9
+ })
10
+
11
# Read the positional command-line arguments supplied after the script name.
# All seven are required, even when the three cutoffs are only used by
# mode "custom".
args <- commandArgs(trailingOnly = TRUE)

if (!(length(args) >= 7)) {
  stop("Usage: Rscript predict_oxygen.R <input_dir> <output_file> <package_data_dir> <mode> <idcut> <ecut> <bitcut>")
}

# Paths and filtering mode are kept as strings.
input_dir <- args[[1]]
output_file <- args[[2]]
package_data_dir <- args[[3]]
mode <- args[[4]]

# Cutoffs are converted to numbers; a non-numeric value becomes NA (with a
# coercion warning from as.numeric).
idcut <- as.numeric(args[[5]])
ecut <- as.numeric(args[[6]])
bitcut <- as.numeric(args[[7]])
25
+
26
# Estimate the percentage of aerobes per sample from DIAMOND hit tables.
#
# For every "<sample>_diamond.tsv" file in `input_dir` the hits are filtered,
# mapped to Pfams, length-corrected, and summed by oxygen class; the
# aerobic/anaerobic ratio is then passed to the saved model to predict
# `Per_aerobe`.
#
# Arguments:
#   input_dir        Directory containing the *_diamond.tsv files.
#   output_file      Path of the tab-separated results table to write.
#   package_data_dir Directory holding pfam_headers_table.txt,
#                    pfam_lengths.tsv, oxygen_model.rds and Oxygen_pfams.csv.
#   mode             "modern", "ancient" or "custom" (selects hit cutoffs).
#   idcut, ecut, bitcut
#                    Minimum percent identity, maximum e-value (exclusive) and
#                    minimum bitscore; only read when mode is "custom".
#
# Returns the results data frame (one row per input file) with columns
# SampleID, ratio, aerobe_pfams, anaerobe_pfams, Per_aerobe, Pfams.
# Stops if a package data file is missing, no input files are found, the mode
# is unknown, or custom cutoffs are not numeric.
predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut) {

  # Resolve the hit-filtering cutoffs up front so that an unknown mode fails
  # loudly instead of silently skipping the filter. switch() only evaluates
  # the selected branch, so idcut/ecut/bitcut are untouched unless "custom".
  known_modes <- c("modern", "ancient", "custom")
  if (!is.character(mode) || length(mode) != 1 || !mode %in% known_modes) {
    stop("Unknown mode '", mode, "': expected one of ",
         paste(known_modes, collapse = ", "))
  }
  cutoffs <- switch(
    mode,
    modern = c(60, 0.001, 50),
    ancient = c(45, 0.1, 25),
    custom = c(idcut, ecut, bitcut)
  )
  if (length(cutoffs) != 3 || anyNA(cutoffs)) {
    stop("Mode 'custom' requires numeric idcut, ecut and bitcut values")
  }
  # Named distinctly from the DIAMOND columns so filter() cannot confuse them.
  min_pident <- cutoffs[1]
  max_evalue <- cutoffs[2]
  min_bitscore <- cutoffs[3]

  # Package data files (these come from oxymetag/data/)
  map_file <- file.path(package_data_dir, "pfam_headers_table.txt")
  lengths_file <- file.path(package_data_dir, "pfam_lengths.tsv")
  model_file <- file.path(package_data_dir, "oxygen_model.rds")
  pfams_file <- file.path(package_data_dir, "Oxygen_pfams.csv")

  for (data_file in c(map_file, lengths_file, model_file, pfams_file)) {
    if (!file.exists(data_file)) {
      stop(paste("Package data file not found:", data_file))
    }
  }

  # Sequence header -> Pfam lookup. Only the text before the first space of
  # each header is kept, and only the first row per header.
  map <- read.table(map_file, sep = "\t", header = TRUE,
                    stringsAsFactors = FALSE, quote = "") %>%
    separate(Header, into = c("Header", "Junk"), sep = " ",
             extra = "drop", fill = "right") %>%
    select(-Junk) %>%
    filter(!duplicated(Header))

  pfam_gene_length <- read.delim(lengths_file, stringsAsFactors = FALSE)

  # Trained model; predict() below relies on the model's package (mgcv is
  # loaded at the top of this script) being attached.
  oxygen_model <- readRDS(model_file)

  # Pfam -> oxygen class table, split into the two classes used for counting
  oxygen_pfams <- read.csv(pfams_file, stringsAsFactors = FALSE)
  aerobic_pfams <- oxygen_pfams %>% filter(Oxygen == "aerobic")
  anaerobic_pfams <- oxygen_pfams %>% filter(Oxygen == "anaerobic")

  # Find diamond output files (user's working directory). `pattern` is a
  # regular expression, not a glob, so anchor and escape it explicitly.
  files <- list.files(input_dir, pattern = "_diamond\\.tsv$", full.names = TRUE)

  if (length(files) == 0) {
    stop(paste("No *_diamond.tsv files found in", input_dir))
  }

  # Pre-allocate every output column, including Pfams, so that per-sample
  # assignments below never create or recycle a column on the fly.
  n_files <- length(files)
  results <- data.frame(
    SampleID = character(n_files),
    ratio = numeric(n_files),
    aerobe_pfams = integer(n_files),
    anaerobe_pfams = integer(n_files),
    Per_aerobe = numeric(n_files),
    Pfams = integer(n_files),
    stringsAsFactors = FALSE
  )

  diamond_cols <- c("qseqid", "sseqid", "pident", "length", "qstart", "qend",
                    "sstart", "send", "evalue", "bitscore")

  for (i in seq_along(files)) {

    # Sample ID is the file name without the "_diamond.tsv" suffix
    sample_id <- sub("_diamond\\.tsv$", "", basename(files[i]))
    results$SampleID[i] <- sample_id

    # Empty files keep their zero-initialised row
    if (file.size(files[i]) == 0) {
      message("Warning: Empty file ", files[i])
      next
    }

    # Read strictly as tab-separated text: read IDs may contain '#' or quote
    # characters, which read.table's defaults would treat as comments/quotes.
    d <- read.table(files[i], sep = "\t", quote = "", comment.char = "",
                    stringsAsFactors = FALSE) %>%
      set_names(diamond_cols) %>%
      filter(pident >= min_pident, evalue < max_evalue,
             bitscore >= min_bitscore)

    if (nrow(d) == 0) {
      message("Warning: No significant hits for ", sample_id)
      results$ratio[i] <- 0
      results$aerobe_pfams[i] <- 0
      results$anaerobe_pfams[i] <- 0
      next
    }

    # Join with pfam mapping (hits whose subject is not in the map get NA)
    d <- d %>% left_join(map, by = c("sseqid" = "Header"))

    # Count distinct Pfams hit, overall and per oxygen class
    hit_pfams <- as.character(unique(d$Pfam[!is.na(d$Pfam)]))
    results$Pfams[i] <- length(hit_pfams)
    results$aerobe_pfams[i] <- sum(hit_pfams %in% aerobic_pfams$Pfam)
    results$anaerobe_pfams[i] <- sum(hit_pfams %in% anaerobic_pfams$Pfam)

    # Hits per Pfam, corrected for gene length and labelled by oxygen class.
    # NOTE(review): the constant in the RPK denominator only rescales both
    # classes equally, so it cancels in the ratio below; kept as in the
    # original so the ratio fed to the model is unchanged.
    pfam_rpk <- d %>%
      group_by(Pfam) %>%
      summarise(total_count = n()) %>%
      left_join(pfam_gene_length, by = "Pfam") %>%
      mutate(RPK = total_count / (1000 * Gene.length)) %>%
      left_join(oxygen_pfams, by = "Pfam")

    # Sum by oxygen class, selecting each class by label rather than by row
    # position: a sample may lack one class entirely or contain unmapped
    # Pfams. %in% is used because it never yields NA.
    aerobic_rpk <- sum(pfam_rpk$RPK[pfam_rpk$Oxygen %in% "aerobic"])
    anaerobic_rpk <- sum(pfam_rpk$RPK[pfam_rpk$Oxygen %in% "anaerobic"])

    # Inf when only aerobic Pfams were hit, NaN when neither class was hit
    results$ratio[i] <- aerobic_rpk / anaerobic_rpk

    message("Processed sample ", i, "/", length(files), ": ", sample_id)
  }

  # Predict only where the ratio is finite; non-finite ratios are resolved by
  # the rules below instead of being handed to the model.
  results$Per_aerobe <- NA_real_
  has_ratio <- is.finite(results$ratio)
  if (any(has_ratio)) {
    new_data <- data.frame(ratio = results$ratio[has_ratio])
    results$Per_aerobe[has_ratio] <-
      as.numeric(predict(oxygen_model, newdata = new_data))
  }

  # Constrain predictions to 0-100% and set to 100% if ratio > 35 (this also
  # covers an infinite ratio); an undefined ratio leaves Per_aerobe as NA.
  results <- results %>%
    mutate(
      Per_aerobe = pmin(pmax(Per_aerobe, 0), 100),
      Per_aerobe = ifelse(!is.na(ratio) & ratio > 35, 100, Per_aerobe)
    )

  # Save results
  write.table(results, output_file, sep = "\t", row.names = FALSE, quote = FALSE)
  message("Results saved to: ", output_file)

  results
}
158
+
159
+ # Run the function
160
+ predict_oxygen(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut)
oxymetag/utils.py ADDED
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Utility functions for OxyMetaG
4
+ """
5
+
6
+ import subprocess
7
+ import pkg_resources
8
+ from pathlib import Path
9
+ import logging
10
+
11
+ logger = logging.getLogger('oxymetag')
12
+
13
class OxyMetaGError(Exception):
    """Error type raised by OxyMetaG helpers.

    Used in this module when required external tools are missing or when the
    Kraken2 database setup fails.
    """
16
+
17
def check_dependencies():
    """Check that the required external tools are available on PATH.

    Looks up ``kraken2``, ``diamond`` and ``Rscript``.

    Raises:
        OxyMetaGError: if one or more tools cannot be found; the message
            lists every missing tool.
    """
    # Imported locally so this function does not depend on the module's
    # import block. shutil.which resolves executables in-process, so the
    # check works even where no external `which` command exists.
    import shutil

    required_tools = ['kraken2', 'diamond', 'Rscript']
    missing_tools = [tool for tool in required_tools if shutil.which(tool) is None]

    if missing_tools:
        raise OxyMetaGError(f"Missing required tools: {', '.join(missing_tools)}")
28
+
29
def get_package_data_path(filename: str) -> str:
    """Return the filesystem path of a file in the package's ``data`` directory.

    Args:
        filename: Name of the data file, relative to ``oxymetag/data``.

    Returns:
        The path as a string. It comes from ``pkg_resources`` when that lookup
        succeeds; otherwise it is built relative to this module's own
        directory.
    """
    try:
        return pkg_resources.resource_filename('oxymetag', f'data/{filename}')
    except Exception:
        # Any ordinary lookup failure falls back to the source-tree layout.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being swallowed here.
        package_dir = Path(__file__).parent
        return str(package_dir / 'data' / filename)
36
+
37
def run_kraken2_setup(threads: int = 48):
    """Download and build a Kraken2 database (bacteria, archaea, viral; no fungi).

    The database is created in ``kraken2_db`` under the current working
    directory by running ``kraken2-build`` for each step in order: taxonomy
    download, one library download per group, build, then clean.

    Args:
        threads: Value passed to ``kraken2-build --build --threads``.
            Defaults to 48, the value previously hard-coded here.

    Raises:
        OxyMetaGError: if any ``kraken2-build`` step exits with a non-zero
            status, or if the command cannot be started at all (for example
            when ``kraken2-build`` is not installed).
    """
    logger.info("Setting up Kraken2 database (bacteria, archaea, viral)...")

    db_path = Path.cwd() / "kraken2_db"
    db_path.mkdir(exist_ok=True)
    db = str(db_path)

    # Each step: (message before, command, message after or None).
    steps = [
        ("Downloading taxonomy...",
         ['kraken2-build', '--download-taxonomy', '--db', db],
         "Taxonomy downloaded successfully"),
    ]
    # Download libraries (excluding fungi)
    for lib in ['bacteria', 'archaea', 'viral']:
        steps.append((
            f"Downloading {lib} library...",
            ['kraken2-build', '--download-library', lib, '--db', db],
            f"{lib} library downloaded successfully",
        ))
    steps.append((
        "Building Kraken2 database...",
        ['kraken2-build', '--build', '--db', db, '--threads', str(threads)],
        None,
    ))
    # Clean up temporary files to save space
    steps.append((
        "Cleaning up temporary files...",
        ['kraken2-build', '--clean', '--db', db],
        None,
    ))

    try:
        for start_msg, cmd, done_msg in steps:
            logger.info(start_msg)
            subprocess.run(cmd, check=True)
            if done_msg is not None:
                logger.info(done_msg)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable kraken2-build binary,
        # which would otherwise surface as a raw FileNotFoundError.
        raise OxyMetaGError(f"Failed to setup Kraken2 database: {e}") from e

    logger.info(f"Kraken2 database setup complete: {db_path}")
    logger.info("Database includes: bacteria, archaea, viral (fungi excluded)")