SCiMS 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scims/__init__.py ADDED
@@ -0,0 +1,35 @@
1
# __init__.py: initialize the SCiMS package and declare its public API.

from .utils import (
    read_metadata,
    normalize_colname,
    find_sample_id_column,
    read_master_file,
    extract_sample_id,
)
from .helpers import load_training_data
from .classification import (
    process_sample_xy,
    process_sample_zw,
)

# Names exported by ``from scims import *``.  ``main`` is listed here but is
# resolved lazily by ``__getattr__`` below instead of being imported eagerly.
__all__ = [
    'main',
    'read_metadata',
    'normalize_colname',
    'find_sample_id_column',
    'read_master_file',
    'extract_sample_id',
    'load_training_data',
    'process_sample_xy',
    'process_sample_zw',
]

# Package version
# NOTE(review): the wheel this file ships in is tagged 1.0.0 while this
# constant says 1.1.0 -- confirm which version is intended.
__version__ = '1.1.0'


def __getattr__(name):
    """Resolve ``scims.main`` on first access (module ``__getattr__``, PEP 562).

    Importing ``.__main__`` at package-import time means ``python -m scims``
    loads ``scims.__main__`` once as a regular submodule and then again as the
    script being executed, which makes runpy emit a RuntimeWarning.  Deferring
    the import keeps ``scims.main`` and ``from scims import main`` working
    without that early import.

    Raises:
        AttributeError: for any attribute other than ``main`` that is not
            defined on the package.
    """
    if name == 'main':
        from .__main__ import main
        # Cache the function so later lookups bypass __getattr__.
        globals()['main'] = main
        return main
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
scims/__main__.py ADDED
@@ -0,0 +1,196 @@
1
+ """
2
+ SCiMS: Sex Calling in Metagenomic Sequencing
3
+
4
+ This script classifies host sex using metagenomic sequencing data alone.
5
+ Metagenomic samples are classified into male, female, or uncertain based on
6
+ coverage ratios of putative sex chromosomes (X/Y or Z/W). It uses a kernel
7
+ density estimation (KDE) approach, comparing coverage ratios against training data.
8
+ In the XY system, 'male' is heterogametic (XY); in the ZW system, 'female' is
9
+ heterogametic (ZW).
10
+
11
+ Author: Hanh Tran
12
+ Version: 1.1.0
13
+ """
14
+
15
+ import argparse
16
+ import logging
17
+ import os
18
+ import pandas as pd
19
+ import sys
20
+
21
+ from scipy.stats import gaussian_kde
22
+
23
+ from .utils import (
24
+ read_metadata,
25
+ find_sample_id_column,
26
+ extract_sample_id
27
+ )
28
+ from .helpers import load_training_data
29
+ from .process_idxstats import process_idxstats_file
30
+
31
def _build_kde_models(training_data, use_zw):
    """Fit the male and female joint KDE models from the training table.

    Uses the ZW columns (``actual_sex_zw``, ``Rz``, ``Rw``) when *use_zw* is
    true and the XY columns (``actual_sex``, ``Rx``, ``Ry``) otherwise.

    Returns:
        A ``(kde_male, kde_female)`` tuple of ``gaussian_kde`` objects.
    """
    if use_zw:
        label_col, ratio_cols = 'actual_sex_zw', ['Rz', 'Rw']
    else:
        label_col, ratio_cols = 'actual_sex', ['Rx', 'Ry']

    def fit(sex):
        rows = training_data[training_data[label_col] == sex]
        # gaussian_kde takes the dataset as (n_dims, n_points), hence the transpose.
        return gaussian_kde(rows[ratio_cols].dropna().values.T)

    return fit('male'), fit('female')


def _write_result(result, stem, output_dir, logger):
    """Write the four summary fields of one sample to ``<stem>_results.txt``."""
    out_dict = {
        "SCiMS_ID": result.get("SCiMS_ID"),
        "SCiMS_predicted_sex": result.get("SCiMS_sex"),
        "SCiMS_male_post_prob": result.get("SCiMS_male_post_prob"),
        "SCiMS_female_post_prob": result.get("SCiMS_female_post_prob")
    }
    output_file = os.path.join(output_dir, f"{stem}_results.txt")
    pd.DataFrame([out_dict]).to_csv(output_file, sep='\t', index=False)
    logger.info(f"Results written to {output_file}")


def main():
    """Command-line entry point for SCiMS.

    Parses the arguments, configures logging (console always, plus
    ``scims.log`` in the output directory when ``--log`` is set), reads the
    scaffold ID list and the training data, fits one joint KDE per sex, and
    then processes either a single idxstats file or every ``*.idxstats`` file
    in a folder.  One tab-separated ``<name>_results.txt`` is written per
    sample; in multiple-sample mode the results can additionally be merged
    into a metadata table (``--metadata`` together with ``--id_column``).

    Every fatal problem is reported and terminates the process with exit
    status 1 via ``sys.exit``.
    """
    parser = argparse.ArgumentParser(description="SCiMS: Sex Calling in Metagenomic Sequencing")

    # Mode selection: default is single-sample mode.
    parser.add_argument('--idxstats_file', dest="idxstats_file", help='Path to a single idxstats file (default mode)')
    parser.add_argument('--idxstats_folder', dest="idxstats_folder", help='Path to the folder containing idxstats files for multiple-sample mode')

    parser.add_argument('--scaffolds', dest="scaffold_ids_file", required=True, help='Path to the text file containing scaffold IDs')
    parser.add_argument('--homogametic_scaffold', dest="homogametic_scaffold", required=True, help='ID of the homogametic scaffold (e.g. X or Z)')
    parser.add_argument('--heterogametic_scaffold', dest="heterogametic_scaffold", required=True, help='ID of the heterogametic scaffold (e.g. Y or W)')
    parser.add_argument('--ZW', dest="ZW", action="store_true", help='Switch to ZW system (default is XY)')
    parser.add_argument('--threshold', dest="threshold", type=float, default=0.5, help='Probability threshold for determining sex')
    parser.add_argument('--output_dir', dest="output_dir", required=True, help='Path to the output directory')
    parser.add_argument('--training_data', dest="training_data", help='Path to the training data file', default="training_data_hmp_1000x_normalizedXY.txt")

    # Optional metadata update (only used in multiple-sample mode)
    parser.add_argument('--metadata', dest="metadata", help='Path to the metadata file (optional, used in multiple-sample mode)')
    parser.add_argument('--id_column', dest="id_column", help='User-specified sample ID column name in metadata')

    # Boolean flag for log output (default is False)
    parser.add_argument('--log', dest="log", action="store_true", help='If set, a log file is written to the output directory (scims.log)')

    args = parser.parse_args()

    # Check metadata parameters early (logging is not configured yet, so print).
    if args.metadata and not args.id_column:
        print("Error: When providing a metadata file, you must also specify the id column using --id_column.")
        sys.exit(1)

    # Setup logging after parsing arguments
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # Clear any existing handlers so repeated calls do not duplicate output
    if logger.hasHandlers():
        logger.handlers.clear()

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # The output directory must exist before the log file can be opened in it
    os.makedirs(args.output_dir, exist_ok=True)

    if args.log:
        log_file_path = os.path.join(args.output_dir, "scims.log")
        file_handler = logging.FileHandler(log_file_path, mode='w')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        logger.info(f"Log file created at: {log_file_path}")

    # Banner text is kept exactly as it appears in the source.
    logger.info(" \n=================================================")
    logger.info("""
_|_|_| _|_|_| _|_|_| _| _| _|_|_|
_| _| _| _|_| _|_| _|
_|_|_| _| _| _| _| _| _|_|_|
_| _| _| _| _| _|
_|_|_| _|_|_| _|_|_| _| _| _|_|_|
=================================================""")
    logger.info("SCiMS: Sex Calling in Metagenomic Sequencing")
    logger.info("Version: 1.1.0")
    logger.info("=================================================")

    # Validate mode: either a single file or folder must be provided.
    # The folder takes precedence when both are given.
    if args.idxstats_folder:
        mode = "multiple"
        if args.idxstats_file:
            logger.warning("Both --idxstats_file and --idxstats_folder were given; running in multiple-sample mode and ignoring --idxstats_file.")
    elif args.idxstats_file:
        mode = "single"
        if args.metadata:
            logger.warning("--metadata is only used in multiple-sample mode and is ignored for a single idxstats file.")
    else:
        logger.error("You must specify either --idxstats_file for single-sample mode or --idxstats_folder for multiple-sample mode.")
        sys.exit(1)

    # Load scaffold IDs (one per line, blank lines skipped)
    try:
        with open(args.scaffold_ids_file, 'r') as sf:
            scaffold_ids = [line.strip() for line in sf if line.strip()]
    except Exception as e:
        logger.error(f"Failed to read scaffold IDs: {e}")
        sys.exit(1)

    # Load training data
    try:
        training_data = load_training_data(args.training_data)
    except Exception as e:
        logger.error(f"Failed to load training data: {e}")
        sys.exit(1)

    # Build the KDE models.  A training table without the columns of the
    # selected system (e.g. --ZW with an XY-only table) or with too few usable
    # rows fails here, so report it like every other load error instead of
    # letting a raw traceback escape.
    try:
        kde_male_joint, kde_female_joint = _build_kde_models(training_data, args.ZW)
    except Exception as e:
        logger.error(f"Failed to build KDE models from training data: {e}")
        sys.exit(1)

    if mode == "multiple":
        try:
            # Sorted so the processing order does not depend on os.listdir().
            folder_files = sorted(
                os.path.join(args.idxstats_folder, f)
                for f in os.listdir(args.idxstats_folder) if f.endswith(".idxstats")
            )
        except Exception as e:
            logger.error(f"Error reading idxstats folder: {e}")
            sys.exit(1)

        if not folder_files:
            logger.error("No idxstats files found in the provided folder.")
            sys.exit(1)

        all_results = []  # To collect results for optional metadata merging
        used_stems = set()
        for idxstats_file in folder_files:
            result = process_idxstats_file(idxstats_file, scaffold_ids, args, kde_male_joint, kde_female_joint)
            all_results.append(result)

            file_name = os.path.basename(idxstats_file)
            stem = file_name.split('.')[0]
            if stem in used_stems:
                # Names such as "a.1.idxstats" and "a.2.idxstats" share the stem
                # "a"; without this check the later result file would silently
                # overwrite the earlier one.  The full file name is unique
                # within the folder and always contains a dot, so it cannot
                # clash with any dot-free stem.
                logger.warning(f"Output name '{stem}_results.txt' is already used by another sample; writing '{file_name}_results.txt' for {idxstats_file} instead.")
                stem = file_name
            used_stems.add(stem)
            _write_result(result, stem, args.output_dir, logger)

        # Merge metadata once after processing all files
        if args.metadata:
            try:
                results_df = pd.DataFrame(all_results)
                metadata = read_metadata(args.metadata)
                sample_id_col = find_sample_id_column(metadata, args.id_column)
                merged_df = pd.merge(metadata, results_df, left_on=sample_id_col, right_on='SCiMS_ID', how='left')
                merged_df.drop(columns=['SCiMS_ID'], inplace=True)
                metadata_basename = os.path.basename(args.metadata).split('.')[0]
                metadata_file = os.path.join(args.output_dir, f"{metadata_basename}_scims_updated.txt")
                merged_df.to_csv(metadata_file, sep='\t', index=False)
                logger.info(f"Updated metadata with classification results written to {metadata_file}")
            except Exception as e:
                logger.error(f"Error updating metadata: {e}")
                sys.exit(1)
    else:
        # Single-sample mode: process one file and write the filtered result
        result = process_idxstats_file(args.idxstats_file, scaffold_ids, args, kde_male_joint, kde_female_joint)
        base_name = os.path.basename(args.idxstats_file).split('.')[0]
        _write_result(result, base_name, args.output_dir, logger)


if __name__ == "__main__":
    main()
@@ -0,0 +1,135 @@
1
+ # classification.py : core logic for classification process
2
+
3
+ from scipy.stats import gaussian_kde
4
+ import logging
5
+ import numpy as np
6
+ import pandas as pd
7
+ from .helpers import (
8
+ compute_coverage_ratio_rx,
9
+ compute_joint_posterior,
10
+ determine_sex_with_joint_posteriors,
11
+ load_training_data
12
+ )
13
+
14
def process_sample_xy(
    idxstats: pd.DataFrame,
    x_id: str,
    y_id: str,
    male_kde: gaussian_kde,
    female_kde: gaussian_kde,
    threshold: float
) -> dict:
    """
    Classify one sample in an XY system from its idxstats table.

    Computes the X-vs-autosome coverage ratio (Rx) and the length-normalized
    share of Y reads among X+Y reads (Ry), then turns them into male/female
    posteriors with the supplied KDE models (equal 0.5 priors) and a sex call.

    Args:
        idxstats: Table indexed by scaffold ID; column 0 is read as the
            scaffold length and column 1 as the mapped-read count.
        x_id: Index label of the X scaffold.
        y_id: Index label of the Y scaffold.
        male_kde: Joint KDE fitted on male training samples.
        female_kde: Joint KDE fitted on female training samples.
        threshold: Probability threshold forwarded to the sex-calling helper.

    Returns:
        dict with Rx, Ry, the read counts, both posteriors rounded to three
        decimals, and the predicted sex.

    Raises:
        KeyError: if ``x_id`` or ``y_id`` is not in ``idxstats.index``.
    """
    # Use a module logger rather than logging.warning(): the module-level
    # function configures the root logger as a side effect when it has no
    # handlers, which then duplicates records from loggers set up elsewhere.
    logger = logging.getLogger(__name__)

    # Both scaffolds are required: their lengths are read unconditionally
    # below, so fail early with a message naming the missing ID(s) instead of
    # an opaque lookup error part-way through the computation.
    missing = [str(sid) for sid in (x_id, y_id) if sid not in idxstats.index]
    if missing:
        raise KeyError(f"Scaffold ID(s) not found in idxstats index: {', '.join(missing)}")

    x_index = idxstats.index.get_loc(x_id)
    y_index = idxstats.index.get_loc(y_id)

    # Identify which rows are autosomes (every row except X and Y)
    autosome_indices = [
        i for i in range(len(idxstats))
        if i != x_index and i != y_index
    ]

    # Coverage ratio for X vs autosomes (Rx)
    Rx = compute_coverage_ratio_rx(idxstats, autosome_indices, x_index)

    # Coverage ratio for Y vs X+Y (Ry)
    x_row = idxstats.loc[x_id]
    y_row = idxstats.loc[y_id]
    x_length, x_count = x_row.iloc[0], x_row.iloc[1]  # column 0 = length, column 1 = mapped reads
    y_length, y_count = y_row.iloc[0], y_row.iloc[1]
    total_count = idxstats.iloc[:, 1].sum()
    total_xy = x_count + y_count

    if total_xy == 0:
        Ry = np.nan
        logger.warning(f"No reads mapped to X or Y in {x_id} or {y_id}. Skipping Ry ratio calculation.")
    elif y_length == 0:
        # A zero-length Y scaffold would make the normalization factor a
        # division by zero.
        Ry = np.nan
        logger.warning(f"Scaffold {y_id} has length 0. Skipping Ry ratio calculation.")
    else:
        # X and Y differ in length; scaling by x_length / y_length keeps the
        # ratio from merely reflecting that size difference.
        factor = x_length / y_length
        Ry = (y_count / total_xy) * factor

    # Compute posteriors with equal priors for both sexes
    P_male = 0.5
    P_female = 0.5

    P_male_posterior, P_female_posterior = compute_joint_posterior(Rx, Ry, male_kde, female_kde, P_male, P_female)

    inferred_sex = determine_sex_with_joint_posteriors(P_male_posterior, P_female_posterior, threshold)

    return {
        'Rx': Rx,
        'Ry': Ry,
        'Total reads mapped': total_count,
        'Reads mapped to X': x_count,
        'Reads mapped to Y': y_count,
        'Posterior probability of being male': np.round(P_male_posterior, 3),
        'Posterior probability of being female': np.round(P_female_posterior, 3),
        'SCiMS predicted sex': inferred_sex
    }
71
+
72
+
73
def process_sample_zw(
    idxstats: pd.DataFrame,
    z_id: str,
    w_id: str,
    male_kde: gaussian_kde,
    female_kde: gaussian_kde,
    threshold: float
) -> dict:
    """
    Classify one sample in a ZW system from its idxstats table.

    Computes the Z-vs-autosome coverage ratio (Rz) and the length-normalized
    share of W reads among Z+W reads (Rw), then turns them into male/female
    posteriors with the supplied KDE models (equal 0.5 priors) and a sex call.

    Args:
        idxstats: Table indexed by scaffold ID; column 0 is read as the
            scaffold length and column 1 as the mapped-read count.
        z_id: Index label of the Z scaffold.
        w_id: Index label of the W scaffold.
        male_kde: Joint KDE fitted on male training samples.
        female_kde: Joint KDE fitted on female training samples.
        threshold: Probability threshold forwarded to the sex-calling helper.

    Returns:
        dict with Rz, Rw, the read counts, both posteriors rounded to three
        decimals, and the predicted sex.

    Raises:
        KeyError: if ``z_id`` or ``w_id`` is not in ``idxstats.index``.
    """
    # Use a module logger rather than logging.warning(): the module-level
    # function configures the root logger as a side effect when it has no
    # handlers, which then duplicates records from loggers set up elsewhere.
    logger = logging.getLogger(__name__)

    # Both scaffolds are required: their lengths are read unconditionally
    # below, so fail early with a message naming the missing ID(s) instead of
    # an opaque lookup error part-way through the computation.
    missing = [str(sid) for sid in (z_id, w_id) if sid not in idxstats.index]
    if missing:
        raise KeyError(f"Scaffold ID(s) not found in idxstats index: {', '.join(missing)}")

    z_index = idxstats.index.get_loc(z_id)
    w_index = idxstats.index.get_loc(w_id)

    # Identify which rows are autosomes (every row except Z and W)
    autosome_indices = [
        i for i in range(len(idxstats))
        if i != z_index and i != w_index
    ]

    # Coverage ratio for Z vs autosomes (Rz); the Rx helper is reused because
    # it is called with the same arguments as in the XY case.
    Rz = compute_coverage_ratio_rx(idxstats, autosome_indices, z_index)

    # Coverage ratio for W vs Z+W (Rw)
    z_row = idxstats.loc[z_id]
    w_row = idxstats.loc[w_id]
    z_length, z_count = z_row.iloc[0], z_row.iloc[1]  # column 0 = length, column 1 = mapped reads
    w_length, w_count = w_row.iloc[0], w_row.iloc[1]
    total_count = idxstats.iloc[:, 1].sum()
    total_zw = z_count + w_count

    if total_zw == 0:
        Rw = np.nan
        logger.warning(f"No reads mapped to Z or W in {z_id} or {w_id}. Skipping Rw ratio calculation.")
    elif w_length == 0:
        # A zero-length W scaffold would make the normalization factor a
        # division by zero.
        Rw = np.nan
        logger.warning(f"Scaffold {w_id} has length 0. Skipping Rw ratio calculation.")
    else:
        # Z and W differ in length; scaling by z_length / w_length keeps the
        # ratio from merely reflecting that size difference.
        factor = z_length / w_length
        Rw = (w_count / total_zw) * factor

    # Compute posteriors with equal priors for both sexes
    P_male = 0.5
    P_female = 0.5

    P_male_posterior, P_female_posterior = compute_joint_posterior(Rz, Rw, male_kde, female_kde, P_male, P_female)

    inferred_sex = determine_sex_with_joint_posteriors(P_male_posterior, P_female_posterior, threshold)

    return {
        'Rz': Rz,
        'Rw': Rw,
        'Total reads mapped': total_count,
        'Reads mapped to Z': z_count,
        'Reads mapped to W': w_count,
        'Posterior probability of being male': np.round(P_male_posterior, 3),
        'Posterior probability of being female': np.round(P_female_posterior, 3),
        'SCiMS predicted sex': inferred_sex
    }