oxymetag 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oxymetag/__init__.py +18 -0
- oxymetag/cli.py +107 -0
- oxymetag/core.py +226 -0
- oxymetag/data/.DS_Store +0 -0
- oxymetag/data/Oxygen_pfams.csv +21 -0
- oxymetag/data/oxygen_model.rds +0 -0
- oxymetag/data/oxymetag_pfams.dmnd +0 -0
- oxymetag/data/pfam_headers_table.txt +3870 -0
- oxymetag/data/pfam_lengths.tsv +21 -0
- oxymetag/scripts/predict_oxygen.R +160 -0
- oxymetag/utils.py +73 -0
- oxymetag-1.0.0.dist-info/LICENSE +674 -0
- oxymetag-1.0.0.dist-info/METADATA +235 -0
- oxymetag-1.0.0.dist-info/RECORD +18 -0
- oxymetag-1.0.0.dist-info/WHEEL +5 -0
- oxymetag-1.0.0.dist-info/entry_points.txt +2 -0
- oxymetag-1.0.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"Pfam" "Gene.length"
|
|
2
|
+
"PF00042" 473.91
|
|
3
|
+
"PF00115" 1528.11
|
|
4
|
+
"PF00116" 750.9
|
|
5
|
+
"PF00296" 1098.24
|
|
6
|
+
"PF00510" 773.67
|
|
7
|
+
"PF00916" 2117.37
|
|
8
|
+
"PF01152" 419.37
|
|
9
|
+
"PF01521" 423.06
|
|
10
|
+
"PF01871" 700.41
|
|
11
|
+
"PF02579" 1028.22
|
|
12
|
+
"PF02906" 1566.93
|
|
13
|
+
"PF03063" 1722.42
|
|
14
|
+
"PF05425" 1056.6
|
|
15
|
+
"PF05721" 894
|
|
16
|
+
"PF08530" 2247.99
|
|
17
|
+
"PF10371" 3937.35
|
|
18
|
+
"PF13597" 2691.6
|
|
19
|
+
"PF16870" 2981.01
|
|
20
|
+
"PF17773" 950.7
|
|
21
|
+
"PF17910" 2188.41
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
#!/usr/bin/env Rscript

# Attach the packages used below, in this order, without their startup banners.
suppressPackageStartupMessages({
  invisible(lapply(c("dplyr", "tidyr", "rlang", "mgcv"), library, character.only = TRUE))
})

# Positional arguments given after the script name on the command line.
args <- commandArgs(trailingOnly = TRUE)

# All seven positional arguments are required; abort with a usage message otherwise.
if (length(args) < 7) {
  stop("Usage: Rscript predict_oxygen.R <input_dir> <output_file> <package_data_dir> <mode> <idcut> <ecut> <bitcut>")
}

# Paths and filtering mode are kept as strings.
input_dir <- args[1]
output_file <- args[2]
package_data_dir <- args[3]
mode <- args[4]

# Filtering cutoffs are converted to numbers (only used when mode is "custom").
idcut <- as.numeric(args[5])
ecut <- as.numeric(args[6])
bitcut <- as.numeric(args[7])
|
|
25
|
+
|
|
26
|
+
# Predict the percentage of aerobes for every DIAMOND result file in a directory.
#
# Arguments:
#   input_dir        Directory searched for files whose names end in "_diamond.tsv".
#   output_file      Path of the tab-separated results table that is written.
#   package_data_dir Directory holding pfam_headers_table.txt, pfam_lengths.tsv,
#                    oxygen_model.rds and Oxygen_pfams.csv.
#   mode             "modern" or "ancient" (built-in cutoffs) or "custom"
#                    (cutoffs taken from idcut / ecut / bitcut).
#   idcut, ecut, bitcut
#                    Minimum percent identity, maximum e-value (exclusive) and
#                    minimum bitscore; only used when mode is "custom".
#
# Returns the results data frame (one row per input file) with columns
# SampleID, ratio, aerobe_pfams, anaerobe_pfams, Per_aerobe and Pfams.
# Stops with an error if a package data file or the input files are missing.
predict_oxygen <- function(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut) {

  # Load package data files (these come from oxymetag/data/)
  map_file <- file.path(package_data_dir, "pfam_headers_table.txt")
  lengths_file <- file.path(package_data_dir, "pfam_lengths.tsv")
  model_file <- file.path(package_data_dir, "oxygen_model.rds")
  pfams_file <- file.path(package_data_dir, "Oxygen_pfams.csv")

  # Check that every package data file exists before doing any work
  for (data_file in c(map_file, lengths_file, model_file, pfams_file)) {
    if (!file.exists(data_file)) stop(paste("Package data file not found:", data_file))
  }

  # Sequence-header -> Pfam mapping; only the part of the header before the
  # first space is kept, and duplicated headers are dropped
  map <- read.table(map_file, sep = "\t", header = TRUE, stringsAsFactors = FALSE, quote = "") %>%
    separate(Header, into = c("Header", "Junk"), sep = " ") %>%
    select(-Junk) %>%
    filter(!duplicated(Header))

  pfam_gene_length <- read.delim(lengths_file)

  # Load the trained model
  oxygen_model <- readRDS(model_file)

  # Pfam -> oxygen class table, split into the aerobic and anaerobic subsets
  oxygen_pfams <- read.csv(pfams_file, stringsAsFactors = FALSE)
  aerobic_pfams <- oxygen_pfams %>% filter(Oxygen == "aerobic")
  anaerobic_pfams <- oxygen_pfams %>% filter(Oxygen == "anaerobic")

  # Find diamond output files (user's working directory).
  # list.files() takes a regular expression, not a glob, so anchor the suffix
  # and escape the dot.
  files <- list.files(input_dir, pattern = "_diamond\\.tsv$", full.names = TRUE)

  if (length(files) == 0) {
    stop(paste("No *_diamond.tsv files found in", input_dir))
  }

  # An unrecognised mode leaves the hits unfiltered; say so instead of staying silent
  if (!mode %in% c("modern", "ancient", "custom")) {
    warning("Unrecognised mode '", mode, "': DIAMOND hits will not be filtered")
  }

  # Initialize results dataframe. Pfams is created up front so that per-row
  # assignment inside the loop never recycles a value into other samples.
  results <- data.frame(
    SampleID = character(length(files)),
    ratio = numeric(length(files)),
    aerobe_pfams = integer(length(files)),
    anaerobe_pfams = integer(length(files)),
    Per_aerobe = numeric(length(files)),
    Pfams = integer(length(files)),
    stringsAsFactors = FALSE
  )

  # Process each file
  for (i in seq_along(files)) {

    # Extract sample ID from filename
    sample_id <- sub("_diamond\\.tsv$", "", basename(files[i]))
    results$SampleID[i] <- sample_id

    # Empty files keep the zero-initialised row
    if (file.size(files[i]) == 0) {
      message("Warning: Empty file ", files[i])
      next
    }

    # Read the tab-separated DIAMOND output; quoting and comment handling are
    # disabled so that characters such as '#' or quotes in read IDs are kept
    d <- read.table(files[i], sep = "\t", quote = "", comment.char = "",
                    stringsAsFactors = FALSE) %>%
      set_names(c("qseqid", "sseqid", "pident", "length", "qstart", "qend",
                  "sstart", "send", "evalue", "bitscore"))

    # Apply filtering based on mode
    if (mode == "modern") {
      d <- d %>%
        filter(pident >= 60, evalue < 0.001, bitscore >= 50)
    } else if (mode == "ancient") {
      d <- d %>%
        filter(pident >= 45, evalue < 0.1, bitscore >= 25)
    } else if (mode == "custom") {
      d <- d %>%
        filter(pident >= idcut, evalue < ecut, bitscore >= bitcut)
    }

    if (nrow(d) == 0) {
      message("Warning: No significant hits for ", sample_id)
      results$ratio[i] <- 0
      results$aerobe_pfams[i] <- 0
      results$anaerobe_pfams[i] <- 0
      next
    }

    # Join with pfam mapping
    d <- d %>% left_join(map, by = c("sseqid" = "Header"))

    # Count the distinct Pfams detected (hits without a mapped Pfam are ignored)
    observed_pfams <- unique(d$Pfam[!is.na(d$Pfam)])
    results$Pfams[i] <- length(observed_pfams)
    results$aerobe_pfams[i] <- sum(observed_pfams %in% aerobic_pfams$Pfam)
    results$anaerobe_pfams[i] <- sum(observed_pfams %in% anaerobic_pfams$Pfam)

    # Hits per Pfam, corrected for gene length and labelled with oxygen class.
    # NOTE(review): the constant 1000 divides rather than multiplies here; it
    # cancels in the aerobic/anaerobic ratio below, so it is left unchanged.
    gene.hit.length.correction <- d %>%
      group_by(Pfam) %>%
      summarise(total_count = n()) %>%
      left_join(pfam_gene_length, by = "Pfam") %>%
      mutate(RPK = total_count / (1000 * Gene.length)) %>%
      left_join(oxygen_pfams, by = "Pfam")

    # Sum by oxygen type
    oxygen_rpk <- gene.hit.length.correction %>%
      group_by(Oxygen) %>%
      summarize(RPKsum = sum(RPK))

    # Look the two classes up by name rather than by row position, so a missing
    # class (or an extra unclassified group) cannot shift the wrong sum into
    # the ratio; a missing class yields NA.
    aerobic_rpk <- oxygen_rpk$RPKsum[match("aerobic", oxygen_rpk$Oxygen)]
    anaerobic_rpk <- oxygen_rpk$RPKsum[match("anaerobic", oxygen_rpk$Oxygen)]
    results$ratio[i] <- aerobic_rpk / anaerobic_rpk

    # Processing message
    message("Processed sample ", i, "/", length(files), ": ", sample_id)
  }

  # Make predictions from the aerobic/anaerobic ratio using the packaged model
  new_data <- data.frame(ratio = results$ratio)
  results$Per_aerobe <- predict(oxygen_model, newdata = new_data)

  # Constrain predictions to 0-100% and set to 100% if ratio > 35
  results <- results %>%
    mutate(
      Per_aerobe = ifelse(Per_aerobe > 100, 100, Per_aerobe),
      Per_aerobe = ifelse(Per_aerobe < 0, 0, Per_aerobe),
      Per_aerobe = ifelse(ratio > 35, 100, Per_aerobe)
    )

  # Save results
  write.table(results, output_file, sep = "\t", row.names = FALSE, quote = FALSE)
  message("Results saved to: ", output_file)

  return(results)
}

# Run the function with the values parsed from the command line
predict_oxygen(input_dir, output_file, package_data_dir, mode, idcut, ecut, bitcut)
|
oxymetag/utils.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Utility functions for OxyMetaG
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import subprocess
|
|
7
|
+
import pkg_resources
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger('oxymetag')
|
|
12
|
+
|
|
13
|
+
class OxyMetaGError(Exception):
    """Package-specific exception type used to report OxyMetaG failures."""
|
|
16
|
+
|
|
17
|
+
def check_dependencies():
    """Verify that the required external command-line tools are installed.

    Each of ``kraken2``, ``diamond`` and ``Rscript`` is looked up on the
    current ``PATH``.

    Raises:
        OxyMetaGError: If one or more tools cannot be found. The message
            lists every missing tool.
    """
    # Imported locally so this function stays self-contained. shutil.which
    # resolves executables in-process, so the check does not itself depend
    # on an external `which` binary being installed.
    import shutil

    required_tools = ['kraken2', 'diamond', 'Rscript']
    missing_tools = [tool for tool in required_tools if shutil.which(tool) is None]

    if missing_tools:
        raise OxyMetaGError(f"Missing required tools: {', '.join(missing_tools)}")
|
|
28
|
+
|
|
29
|
+
def get_package_data_path(filename: str) -> str:
    """Return the filesystem path of a data file shipped with the package.

    The path is resolved through ``pkg_resources`` first. If that lookup
    raises, the path is built relative to this module's own directory.

    Args:
        filename: Name of the file inside the package's ``data`` directory.

    Returns:
        The path as a string. The fallback does not check that the file
        exists.
    """
    try:
        return pkg_resources.resource_filename('oxymetag', f'data/{filename}')
    except Exception:
        # Deliberately broad best-effort fallback, but no longer a bare
        # `except:` so KeyboardInterrupt and SystemExit still propagate.
        return str(Path(__file__).parent / 'data' / filename)
|
|
36
|
+
|
|
37
|
+
def run_kraken2_setup(threads: int = 48):
    """Download and build a Kraken2 database of bacteria, archaea and viruses.

    The database is created in a ``kraken2_db`` directory under the current
    working directory. The steps run in order: download the taxonomy,
    download each library, build the database, then clean up intermediate
    files.

    Args:
        threads: Thread count passed to ``kraken2-build --build``. Defaults
            to 48, the value that was previously hard-coded.

    Raises:
        OxyMetaGError: If ``kraken2-build`` cannot be executed or any step
            exits with a non-zero status.
    """
    logger.info("Setting up Kraken2 database (bacteria, archaea, viral)...")

    db_path = Path.cwd() / "kraken2_db"
    db_path.mkdir(exist_ok=True)
    db_arg = str(db_path)

    # Libraries to download (fungi are intentionally left out)
    libraries = ['bacteria', 'archaea', 'viral']

    # Each step is (command, message logged before, message logged after or None)
    steps = [
        (
            ['kraken2-build', '--download-taxonomy', '--db', db_arg],
            "Downloading taxonomy...",
            "Taxonomy downloaded successfully",
        ),
    ]
    for lib in libraries:
        steps.append((
            ['kraken2-build', '--download-library', lib, '--db', db_arg],
            f"Downloading {lib} library...",
            f"{lib} library downloaded successfully",
        ))
    steps.append((
        ['kraken2-build', '--build', '--db', db_arg, '--threads', str(threads)],
        "Building Kraken2 database...",
        None,
    ))
    # Clean up temporary files to save space
    steps.append((
        ['kraken2-build', '--clean', '--db', db_arg],
        "Cleaning up temporary files...",
        None,
    ))

    try:
        for cmd, start_msg, done_msg in steps:
            logger.info(start_msg)
            subprocess.run(cmd, check=True)
            if done_msg is not None:
                logger.info(done_msg)
    except FileNotFoundError as e:
        # kraken2-build is not covered by check_dependencies(), so report a
        # missing binary through the package's own exception type
        raise OxyMetaGError(
            "Failed to setup Kraken2 database: kraken2-build not found on PATH"
        ) from e
    except subprocess.CalledProcessError as e:
        raise OxyMetaGError(f"Failed to setup Kraken2 database: {e}") from e

    logger.info(f"Kraken2 database setup complete: {db_path}")
    logger.info("Database includes: bacteria, archaea, viral (fungi excluded)")
|