SCiMS 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scims/__init__.py +35 -0
- scims/__main__.py +196 -0
- scims/classification.py +135 -0
- scims/data/training_data_hmp_1000x_normalizedXY.txt +23637 -0
- scims/helpers.py +98 -0
- scims/process_idxstats.py +67 -0
- scims/utils.py +87 -0
- scims-1.0.0.dist-info/METADATA +222 -0
- scims-1.0.0.dist-info/RECORD +26 -0
- scims-1.0.0.dist-info/WHEEL +5 -0
- scims-1.0.0.dist-info/entry_points.txt +2 -0
- scims-1.0.0.dist-info/licenses/LICENSE +21 -0
- scims-1.0.0.dist-info/top_level.txt +2 -0
- tests/.DS_Store +0 -0
- tests/__init__.py +2 -0
- tests/__pycache__/__init__.cpython-39.pyc +0 -0
- tests/__pycache__/test_classification.cpython-39-pytest-8.4.2.pyc +0 -0
- tests/__pycache__/test_helpers.cpython-39-pytest-8.4.2.pyc +0 -0
- tests/__pycache__/test_process_idxstats.cpython-39-pytest-8.4.2.pyc +0 -0
- tests/__pycache__/test_run.cpython-39-pytest-8.4.2.pyc +0 -0
- tests/__pycache__/test_utils.cpython-39-pytest-8.4.2.pyc +0 -0
- tests/test_classification.py +145 -0
- tests/test_helpers.py +33 -0
- tests/test_process_idxstats.py +100 -0
- tests/test_run.py +57 -0
- tests/test_utils.py +100 -0
scims/__init__.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# __init__.py: initialize SCiMS package
|
|
2
|
+
|
|
3
|
+
from .utils import (
|
|
4
|
+
read_metadata,
|
|
5
|
+
normalize_colname,
|
|
6
|
+
find_sample_id_column,
|
|
7
|
+
read_master_file,
|
|
8
|
+
extract_sample_id
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
from .helpers import (
|
|
12
|
+
load_training_data
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
from .classification import (
|
|
16
|
+
process_sample_xy,
|
|
17
|
+
process_sample_zw
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from .__main__ import main
|
|
21
|
+
|
|
22
|
+
# Names re-exported by ``from scims import *``.
__all__ = [
    # command-line entry point
    "main",
    # metadata / sample-ID utilities
    "read_metadata",
    "normalize_colname",
    "find_sample_id_column",
    "read_master_file",
    "extract_sample_id",
    # training data loader
    "load_training_data",
    # per-sample classifiers
    "process_sample_xy",
    "process_sample_zw",
]

# Version string reported by the package at runtime.
__version__ = "1.1.0"
|
scims/__main__.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SCiMS: Sex Calling in Metagenomic Sequencing
|
|
3
|
+
|
|
4
|
+
This script classifies host sex using metagenomic sequencing data alone.
|
|
5
|
+
Metagenomic samples are classified into male, female, or uncertain based on
|
|
6
|
+
coverage ratios of putative sex chromosomes (X/Y or Z/W). It uses a kernel
|
|
7
|
+
density estimation (KDE) approach, comparing coverage ratios against training data.
|
|
8
|
+
In the XY system, 'male' is heterogametic (XY); in the ZW system, 'female' is
|
|
9
|
+
heterogametic (ZW).
|
|
10
|
+
|
|
11
|
+
Author: Hanh Tran
|
|
12
|
+
Version: 1.1.0
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
import pandas as pd
|
|
19
|
+
import sys
|
|
20
|
+
|
|
21
|
+
from scipy.stats import gaussian_kde
|
|
22
|
+
|
|
23
|
+
from .utils import (
|
|
24
|
+
read_metadata,
|
|
25
|
+
find_sample_id_column,
|
|
26
|
+
extract_sample_id
|
|
27
|
+
)
|
|
28
|
+
from .helpers import load_training_data
|
|
29
|
+
from .process_idxstats import process_idxstats_file
|
|
30
|
+
|
|
31
|
+
def _build_arg_parser():
    """Create the argument parser for the SCiMS command-line interface."""
    parser = argparse.ArgumentParser(description="SCiMS: Sex Calling in Metagenomic Sequencing")

    # Mode selection: default is single-sample mode.
    parser.add_argument('--idxstats_file', dest="idxstats_file", help='Path to a single idxstats file (default mode)')
    parser.add_argument('--idxstats_folder', dest="idxstats_folder", help='Path to the folder containing idxstats files for multiple-sample mode')

    parser.add_argument('--scaffolds', dest="scaffold_ids_file", required=True, help='Path to the text file containing scaffold IDs')
    parser.add_argument('--homogametic_scaffold', dest="homogametic_scaffold", required=True, help='ID of the homogametic scaffold (e.g. X or Z)')
    parser.add_argument('--heterogametic_scaffold', dest="heterogametic_scaffold", required=True, help='ID of the heterogametic scaffold (e.g. Y or W)')
    parser.add_argument('--ZW', dest="ZW", action="store_true", help='Switch to ZW system (default is XY)')
    parser.add_argument('--threshold', dest="threshold", type=float, default=0.5, help='Probability threshold for determining sex')
    parser.add_argument('--output_dir', dest="output_dir", required=True, help='Path to the output directory')
    parser.add_argument('--training_data', dest="training_data", help='Path to the training data file', default="training_data_hmp_1000x_normalizedXY.txt")

    # Optional metadata update (only used in multiple-sample mode)
    parser.add_argument('--metadata', dest="metadata", help='Path to the metadata file (optional, used in multiple-sample mode)')
    parser.add_argument('--id_column', dest="id_column", help='User-specified sample ID column name in metadata')

    # Boolean flag for log output (default is False)
    parser.add_argument('--log', dest="log", action="store_true", help='If set, a log file is written to the output directory (scims.log)')
    return parser


def _configure_logger(output_dir, write_log_file):
    """Set up this module's logger and make sure ``output_dir`` exists.

    A console handler is always attached; when ``write_log_file`` is true a
    file handler writing ``scims.log`` inside ``output_dir`` is added as well.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # Clear handlers left over from an earlier call so messages are not duplicated.
    if logger.hasHandlers():
        logger.handlers.clear()

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # The output directory must exist before the log file (and results) are written.
    os.makedirs(output_dir, exist_ok=True)

    if write_log_file:
        log_file_path = os.path.join(output_dir, "scims.log")
        file_handler = logging.FileHandler(log_file_path, mode='w')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        logger.info(f"Log file created at: {log_file_path}")

    return logger


def _build_kde_models(training_data, use_zw):
    """Fit the male and female joint KDEs from the training table.

    Uses columns ``actual_sex_zw``/``Rz``/``Rw`` when ``use_zw`` is true and
    ``actual_sex``/``Rx``/``Ry`` otherwise. Returns ``(kde_male, kde_female)``.
    """
    if use_zw:
        sex_column, ratio_columns = 'actual_sex_zw', ['Rz', 'Rw']
    else:
        sex_column, ratio_columns = 'actual_sex', ['Rx', 'Ry']

    male_rows = training_data[training_data[sex_column] == 'male']
    female_rows = training_data[training_data[sex_column] == 'female']
    # gaussian_kde expects one row per dimension, hence the transpose.
    male_data = male_rows[ratio_columns].dropna().values.T
    female_data = female_rows[ratio_columns].dropna().values.T
    return gaussian_kde(male_data), gaussian_kde(female_data)


def _write_sample_result(result, idxstats_path, output_dir, logger):
    """Write the reported fields of one sample's result to ``<basename>_results.txt``."""
    out_dict = {
        "SCiMS_ID": result.get("SCiMS_ID"),
        "SCiMS_predicted_sex": result.get("SCiMS_sex"),
        "SCiMS_male_post_prob": result.get("SCiMS_male_post_prob"),
        "SCiMS_female_post_prob": result.get("SCiMS_female_post_prob")
    }
    # NOTE(review): everything after the first '.' in the file name is dropped,
    # so "a.1.idxstats" and "a.2.idxstats" would share one output file.
    base_name = os.path.basename(idxstats_path).split('.')[0]
    output_file = os.path.join(output_dir, f"{base_name}_results.txt")
    pd.DataFrame([out_dict]).to_csv(output_file, sep='\t', index=False)
    logger.info(f"Results written to {output_file}")


def main():
    """Command-line entry point for SCiMS.

    Parses the arguments, loads the scaffold IDs and training data, fits the
    male/female KDE models, then classifies either a single idxstats file
    (``--idxstats_file``) or every ``*.idxstats`` file in a folder
    (``--idxstats_folder``). One tab-separated result file is written per
    sample; in multiple-sample mode the results can also be merged into a
    metadata table (``--metadata`` with ``--id_column``).

    Exits with status 1 on invalid argument combinations or when an input
    cannot be read.
    """
    args = _build_arg_parser().parse_args()

    # Check metadata parameters early (the logger is not configured yet, so
    # report the problem on stderr directly).
    if args.metadata and not args.id_column:
        print("Error: When providing a metadata file, you must also specify the id column using --id_column.", file=sys.stderr)
        sys.exit(1)

    logger = _configure_logger(args.output_dir, args.log)

    logger.info(" \n=================================================")
    logger.info("""
_|_|_| _|_|_| _|_|_| _| _| _|_|_|
_| _| _| _|_| _|_| _|
_|_|_| _| _| _| _| _| _|_|_|
_| _| _| _| _| _|
_|_|_| _|_|_| _|_|_| _| _| _|_|_|
=================================================""")
    logger.info("SCiMS: Sex Calling in Metagenomic Sequencing")
    logger.info("Version: 1.1.0")
    logger.info("=================================================")

    # Validate mode: either a single file or folder must be provided.
    # The folder takes precedence when both are given.
    if args.idxstats_folder:
        mode = "multiple"
        if args.idxstats_file:
            logger.warning("Both --idxstats_file and --idxstats_folder were provided; using --idxstats_folder.")
    elif args.idxstats_file:
        mode = "single"
        if args.metadata:
            logger.warning("--metadata is only used in multiple-sample mode and will be ignored.")
    else:
        logger.error("You must specify either --idxstats_file for single-sample mode or --idxstats_folder for multiple-sample mode.")
        sys.exit(1)

    # Load scaffold IDs (one per line, blank lines skipped)
    try:
        with open(args.scaffold_ids_file, 'r') as sf:
            scaffold_ids = [line.strip() for line in sf if line.strip()]
    except Exception as e:
        logger.error(f"Failed to read scaffold IDs: {e}")
        sys.exit(1)

    # Load training data and build KDE models
    try:
        training_data = load_training_data(args.training_data)
    except Exception as e:
        logger.error(f"Failed to load training data: {e}")
        sys.exit(1)

    # A missing column or too few usable rows would otherwise surface as a raw
    # traceback; report it like every other input failure.
    try:
        kde_male_joint, kde_female_joint = _build_kde_models(training_data, args.ZW)
    except Exception as e:
        logger.error(f"Failed to build KDE models from training data: {e}")
        sys.exit(1)

    if mode == "multiple":
        try:
            # Sorted so the processing order does not depend on the filesystem.
            folder_files = [os.path.join(args.idxstats_folder, f)
                            for f in sorted(os.listdir(args.idxstats_folder)) if f.endswith(".idxstats")]
        except Exception as e:
            logger.error(f"Error reading idxstats folder: {e}")
            sys.exit(1)

        if not folder_files:
            logger.error("No idxstats files found in the provided folder.")
            sys.exit(1)

        all_results = []  # To collect results for optional metadata merging
        for idxstats_file in folder_files:
            result = process_idxstats_file(idxstats_file, scaffold_ids, args, kde_male_joint, kde_female_joint)
            all_results.append(result)
            _write_sample_result(result, idxstats_file, args.output_dir, logger)

        # Merge metadata once after processing all files
        if args.metadata:
            try:
                results_df = pd.DataFrame(all_results)
                metadata = read_metadata(args.metadata)
                sample_id_col = find_sample_id_column(metadata, args.id_column)
                # Left join keeps every metadata row, including samples without a result.
                merged_df = pd.merge(metadata, results_df, left_on=sample_id_col, right_on='SCiMS_ID', how='left')
                merged_df.drop(columns=['SCiMS_ID'], inplace=True)
                metadata_basename = os.path.basename(args.metadata).split('.')[0]
                metadata_file = os.path.join(args.output_dir, f"{metadata_basename}_scims_updated.txt")
                merged_df.to_csv(metadata_file, sep='\t', index=False)
                logger.info(f"Updated metadata with classification results written to {metadata_file}")
            except Exception as e:
                logger.error(f"Error updating metadata: {e}")
                sys.exit(1)
    else:
        # Single-sample mode: process one file and write the filtered result
        result = process_idxstats_file(args.idxstats_file, scaffold_ids, args, kde_male_joint, kde_female_joint)
        _write_sample_result(result, args.idxstats_file, args.output_dir, logger)


if __name__ == "__main__":
    main()
|
scims/classification.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# classification.py : core logic for classification process
|
|
2
|
+
|
|
3
|
+
from scipy.stats import gaussian_kde
|
|
4
|
+
import logging
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from .helpers import (
|
|
8
|
+
compute_coverage_ratio_rx,
|
|
9
|
+
compute_joint_posterior,
|
|
10
|
+
determine_sex_with_joint_posteriors,
|
|
11
|
+
load_training_data
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
def process_sample_xy(
    idxstats: pd.DataFrame,
    x_id: str,
    y_id: str,
    male_kde: gaussian_kde,
    female_kde: gaussian_kde,
    threshold: float
) -> dict:
    """
    Classify one sample in the XY system from its idxstats table.

    Given an idxstats DataFrame indexed by scaffold ID (column 0 = scaffold
    length, column 1 = mapped reads), the X and Y scaffold IDs, and KDE models
    for male/female, computes the coverage ratios (Rx, Ry) and determines sex
    using joint posteriors with equal priors of 0.5.

    Returns a dict with the ratios, the read counts, the posteriors rounded to
    three decimals, and the predicted sex.

    Raises:
        KeyError: if ``x_id`` or ``y_id`` is not present in ``idxstats.index``.
    """
    # Both scaffolds are required (their lengths are needed below), so fail
    # up front with a message that names the missing ID(s).
    missing_ids = [sid for sid in (x_id, y_id) if sid not in idxstats.index]
    if missing_ids:
        raise KeyError(
            "Scaffold ID(s) not found in idxstats index: "
            + ", ".join(str(sid) for sid in missing_ids)
        )

    # Positional indices of X/Y
    x_index = idxstats.index.get_loc(x_id)
    y_index = idxstats.index.get_loc(y_id)

    # Identify which rows are autosomes (excluding X and Y)
    autosome_indices = [
        i for i in range(len(idxstats))
        if i != x_index and i != y_index]

    # Coverage ratio for X vs autosomes (Rx)
    Rx = compute_coverage_ratio_rx(idxstats, autosome_indices, x_index)

    # Coverage ratio for Y vs X+Y (Ry)
    x_row = idxstats.loc[x_id]
    y_row = idxstats.loc[y_id]
    x_length, x_count = x_row.iloc[0], x_row.iloc[1]
    y_length, y_count = y_row.iloc[0], y_row.iloc[1]
    total_count = idxstats.iloc[:, 1].sum()
    total_xy = x_count + y_count

    # A named logger is used because the module-level logging.warning() would
    # implicitly call basicConfig() and add a handler to the root logger.
    log = logging.getLogger(__name__)

    if total_xy == 0:
        Ry = np.nan
        log.warning(f"No reads mapped to X or Y in {x_id} or {y_id}. Skipping Ry ratio calculation.")
    elif y_length == 0:
        # The length normalisation below would divide by zero.
        Ry = np.nan
        log.warning(f"Scaffold {y_id} has length 0. Skipping Ry ratio calculation.")
    else:
        # X and Y differ in length, so scale the read fraction by the length
        # ratio to make it a coverage ratio rather than a length artefact.
        factor = x_length / y_length
        Ry = (y_count / total_xy) * factor

    # Compute posteriors with equal priors for both sexes
    P_male = 0.5
    P_female = 0.5

    P_male_posterior, P_female_posterior = compute_joint_posterior(Rx, Ry, male_kde, female_kde, P_male, P_female)

    inferred_sex = determine_sex_with_joint_posteriors(P_male_posterior, P_female_posterior, threshold)

    return {
        'Rx': Rx,
        'Ry': Ry,
        'Total reads mapped': total_count,
        'Reads mapped to X': x_count,
        'Reads mapped to Y': y_count,
        'Posterior probability of being male': np.round(P_male_posterior, 3),
        'Posterior probability of being female': np.round(P_female_posterior, 3),
        'SCiMS predicted sex': inferred_sex
    }
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def process_sample_zw(
    idxstats: pd.DataFrame,
    z_id: str,
    w_id: str,
    male_kde: gaussian_kde,
    female_kde: gaussian_kde,
    threshold: float
) -> dict:
    """
    Classify one sample in the ZW system from its idxstats table.

    Given an idxstats DataFrame indexed by scaffold ID (column 0 = scaffold
    length, column 1 = mapped reads), the Z and W scaffold IDs, and KDE models
    for male/female, computes the coverage ratios (Rz, Rw) and determines sex
    using joint posteriors with equal priors of 0.5.

    Returns a dict with the ratios, the read counts, the posteriors rounded to
    three decimals, and the predicted sex.

    Raises:
        KeyError: if ``z_id`` or ``w_id`` is not present in ``idxstats.index``.
    """
    # Both scaffolds are required (their lengths are needed below), so fail
    # up front with a message that names the missing ID(s).
    missing_ids = [sid for sid in (z_id, w_id) if sid not in idxstats.index]
    if missing_ids:
        raise KeyError(
            "Scaffold ID(s) not found in idxstats index: "
            + ", ".join(str(sid) for sid in missing_ids)
        )

    # Positional indices of Z/W
    z_index = idxstats.index.get_loc(z_id)
    w_index = idxstats.index.get_loc(w_id)

    # Identify which rows are autosomes (excluding Z and W)
    autosome_indices = [
        i for i in range(len(idxstats))
        if i != z_index and i != w_index
    ]

    # Coverage ratio for Z vs autosomes (Rz)
    Rz = compute_coverage_ratio_rx(idxstats, autosome_indices, z_index)

    # Coverage ratio for W vs Z+W (Rw)
    z_row = idxstats.loc[z_id]
    w_row = idxstats.loc[w_id]
    z_length, z_count = z_row.iloc[0], z_row.iloc[1]
    w_length, w_count = w_row.iloc[0], w_row.iloc[1]
    total_count = idxstats.iloc[:, 1].sum()
    total_zw = z_count + w_count

    # A named logger is used because the module-level logging.warning() would
    # implicitly call basicConfig() and add a handler to the root logger.
    log = logging.getLogger(__name__)

    if total_zw == 0:
        Rw = np.nan
        log.warning(f"No reads mapped to Z or W in {z_id} or {w_id}. Skipping Rw ratio calculation.")
    elif w_length == 0:
        # The length normalisation below would divide by zero.
        Rw = np.nan
        log.warning(f"Scaffold {w_id} has length 0. Skipping Rw ratio calculation.")
    else:
        # Z and W differ in length, so scale the read fraction by the length
        # ratio to make it a coverage ratio rather than a length artefact.
        factor = z_length / w_length
        Rw = (w_count / total_zw) * factor

    # Compute posteriors with equal priors for both sexes
    P_male = 0.5
    P_female = 0.5

    P_male_posterior, P_female_posterior = compute_joint_posterior(Rz, Rw, male_kde, female_kde, P_male, P_female)

    inferred_sex = determine_sex_with_joint_posteriors(P_male_posterior, P_female_posterior, threshold)

    return {
        'Rz': Rz,
        'Rw': Rw,
        'Total reads mapped': total_count,
        'Reads mapped to Z': z_count,
        'Reads mapped to W': w_count,
        'Posterior probability of being male': np.round(P_male_posterior, 3),
        'Posterior probability of being female': np.round(P_female_posterior, 3),
        'SCiMS predicted sex': inferred_sex
    }
|