sai-pg 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,241 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ import argparse
22
+ import re
23
+ from sai.parsers.argument_validation import positive_int
24
+ from sai.parsers.argument_validation import existed_file
25
+ from sai.parsers.argument_validation import between_zero_and_one
26
+ from sai.parsers.argument_validation import validate_stat_type
27
+ from sai.sai import score
28
+ from sai.utils.utils import parse_ind_file
29
+
30
+
31
def _run_score(args: argparse.Namespace) -> None:
    """
    Executes the score command using arguments parsed from the command line.

    Parameters
    ----------
    args : argparse.Namespace
        Namespace produced by the `score` subcommand parser. Expected
        attributes: `vcf`, `chr_name`, `ref`, `tgt`, `src`, `win_len`,
        `win_step`, `anc_alleles`, `w`, `y`, `output`, and `stat`.
        `args.y` is a list of (operator, threshold) tuples — one per
        source population — as produced by `_parse_y_thresholds`.

    Raises
    ------
    ValueError
        If the number of conditions in `args.y` does not match the number
        of source populations found in the file given by `args.src`.
    """
    source_groups = parse_ind_file(args.src)
    n_sources = len(source_groups)

    # One frequency condition is required per source population.
    if len(args.y) != n_sources:
        raise ValueError(
            f"The length of y ({len(args.y)}) does not match the number of source populations ({n_sources}) found in {args.src}."
        )

    score(
        vcf_file=args.vcf,
        chr_name=args.chr_name,
        ref_ind_file=args.ref,
        tgt_ind_file=args.tgt,
        src_ind_file=args.src,
        win_len=args.win_len,
        win_step=args.win_step,
        num_src=n_sources,
        anc_allele_file=args.anc_alleles,
        w=args.w,
        y=args.y,
        output_file=args.output,
        stat_type=args.stat,
        num_workers=1,  # this CLI entry point always runs single-process
    )
97
+
98
+
99
+ def _parse_y_thresholds(value: str) -> tuple[str, float]:
100
+ """
101
+ Parses the --y parameter value to extract an operator and a numerical threshold.
102
+
103
+ This function ensures that the input is correctly formatted as one of the following:
104
+ - `=X` (equality condition)
105
+ - `>X` (greater than condition)
106
+ - `<X` (less than condition)
107
+ - `>=X` (greater than or equal to condition)
108
+ - `<=X` (less than or equal to condition)
109
+
110
+ The numerical value `X` must be within the range [0, 1].
111
+
112
+ Parameters
113
+ ----------
114
+ value : str
115
+ A string representing the allele frequency threshold condition, e.g., "=0.7", ">0.8", "<=0.2".
116
+
117
+ Returns
118
+ -------
119
+ tuple[str, float]
120
+ A tuple containing:
121
+ - A string representing the comparison operator (`=`, `<`, `>`, `<=`, `>=`).
122
+ - A float representing the threshold value.
123
+
124
+ Raises
125
+ ------
126
+ argparse.ArgumentTypeError
127
+ If the input format is invalid or the numerical threshold is outside the range [0, 1].
128
+ """
129
+ match = re.match(r"^(=|<|>|<=|>=)(\d*\.?\d+)$", value)
130
+ if not match:
131
+ raise argparse.ArgumentTypeError(
132
+ f"Invalid format for --y: {value}. Must be in the form =X, >X, <X, >=X, or <=X "
133
+ f"(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2)."
134
+ )
135
+
136
+ operator, num_str = match.groups()
137
+ num = float(num_str)
138
+
139
+ if not (0 <= num <= 1):
140
+ raise argparse.ArgumentTypeError(
141
+ f"Value for --y must be between 0 and 1, got {num}."
142
+ )
143
+
144
+ return operator, num
145
+
146
+
147
def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
    """
    Initializes and configures the command-line interface parser
    for the score subcommand.

    Registers the `score` subcommand, declares all of its arguments, and
    wires the subcommand to `_run_score` via `set_defaults(runner=...)`.

    Parameters
    ----------
    subparsers : argparse.ArgumentParser
        A command-line interface parser to be configured.
        NOTE(review): this is actually the object returned by
        `ArgumentParser.add_subparsers()` (an `argparse._SubParsersAction`),
        not an `ArgumentParser` itself — confirm and consider retyping.
    """
    parser = subparsers.add_parser(
        "score", help="Run the score command based on specified parameters."
    )
    # Required input files; `existed_file` validates the path at parse time.
    parser.add_argument(
        "--vcf",
        type=existed_file,
        required=True,
        help="Path to the VCF file containing variant data.",
    )
    parser.add_argument(
        "--chr-name",
        dest="chr_name",
        type=str,
        required=True,
        help="Chromosome name to analyze from the VCF file.",
    )
    parser.add_argument(
        "--ref",
        type=existed_file,
        required=True,
        help="Path to the file with reference population identifiers.",
    )
    parser.add_argument(
        "--tgt",
        type=existed_file,
        required=True,
        help="Path to the file with target population identifiers.",
    )
    parser.add_argument(
        "--src",
        type=existed_file,
        required=True,
        help="Path to the file with source population identifiers.",
    )
    # Sliding-window geometry; `positive_int` rejects zero/negative values.
    parser.add_argument(
        "--win-len",
        dest="win_len",
        type=positive_int,
        default=50000,
        help="Length of each genomic window in base pairs. Default: 50,000.",
    )
    parser.add_argument(
        "--win-step",
        dest="win_step",
        type=positive_int,
        default=10000,
        help="Step size in base pairs between consecutive windows. Default: 10,000.",
    )
    parser.add_argument(
        "--anc-alleles",
        dest="anc_alleles",
        type=existed_file,
        default=None,
        help="Path to the BED file with ancestral allele information. If ancestral allele information is not provided, filtering will be performed for each variant based on whether the allele frequency of any allele (assuming biallelic) meets the specified condition during the calculation of the statistics. Default: None.",
    )
    # Allele frequency thresholds: `--w` for the reference group,
    # `--y` for each source population (parsed into (operator, value) tuples).
    parser.add_argument(
        "--w",
        type=between_zero_and_one,
        default=0.01,
        help="Frequency threshold for variants in the reference population; only variants with frequencies below this threshold are included in the analysis. Default: 0.01.",
    )
    parser.add_argument(
        "--y",
        type=_parse_y_thresholds,
        nargs="+",
        default=[("=", 1.0)],
        help="List of allele frequency conditions for the source populations. "
        "Each value must be in the form =X, >X, <X, >=X, or <=X "
        "(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2). "
        "The number of values must match the number of source populations in the file specified by `--src`; "
        "the order of the allele frequency conditions should also correspond to the order of source populations in that file. Default: =1",
    )
    parser.add_argument(
        "--output",
        type=str,
        required=True,
        help="Output file path for saving results.",
    )
    parser.add_argument(
        "--stat",
        type=validate_stat_type,
        required=True,
        help="Type of statistic to compute: UXX or QXX, where XX is a percentage-like index indicating a threshold in the target population. For example, `U50` means the allele frequency is greater than 0.5, and `Q95` means the allele frequency is greater than or equal to the 95th percentile among sites meeting the specified conditions.",
    )
    # Dispatch: the main CLI looks up `args.runner` and calls it.
    parser.set_defaults(runner=_run_score)
sai/sai.py ADDED
@@ -0,0 +1,315 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ import os
22
+ import warnings
23
+ import pandas as pd
24
+ import matplotlib.pyplot as plt
25
+ from matplotlib.ticker import MaxNLocator
26
+ from sai.utils.generators import ChunkGenerator
27
+ from sai.utils.preprocessors import ChunkPreprocessor
28
+ from sai.utils.utils import natsorted_df
29
+
30
+
31
def score(
    vcf_file: str,
    chr_name: str,
    ref_ind_file: str,
    tgt_ind_file: str,
    src_ind_file: str,
    win_len: int,
    win_step: int,
    num_src: int,
    anc_allele_file: str,
    w: float,
    y: list[tuple[str, float]],
    output_file: str,
    stat_type: str,
    num_workers: int,
) -> None:
    """
    Processes and scores genomic data by generating windowed data and feature vectors.

    Parameters
    ----------
    vcf_file : str
        Path to the VCF file containing variant data.
    chr_name : str
        The chromosome name to be analyzed from the VCF file.
    ref_ind_file : str
        Path to the file containing reference population identifiers.
    tgt_ind_file : str
        Path to the file containing target population identifiers.
    src_ind_file : str
        Path to the file containing source population identifiers.
    win_len : int
        Length of each genomic window in base pairs.
    win_step : int
        Step size in base pairs between consecutive windows.
    num_src : int
        Number of source populations to include in each windowed analysis.
    anc_allele_file : str
        Path to the file containing ancestral allele information.
    w : float
        Frequency threshold for calculating feature vectors.
    y : list[tuple[str, float]]
        List of (operator, threshold) allele frequency conditions for the
        source populations, e.g., [("=", 1.0), (">", 0.8)]; one entry per
        source population. (The body unpacks each entry as `op, val`, so a
        plain list of floats is not accepted.)
    output_file : str
        File path to save the output of processed feature vectors.
    stat_type: str
        Specifies the type of statistic to compute.
    num_workers : int
        Number of parallel processes for multiprocessing.
    """
    generator = ChunkGenerator(
        vcf_file=vcf_file,
        chr_name=chr_name,
        window_size=win_len,
        step_size=win_step,
        # Oversubscribe chunks relative to workers for better load balancing.
        num_chunks=num_workers * 8,
    )

    preprocessor = ChunkPreprocessor(
        vcf_file=vcf_file,
        ref_ind_file=ref_ind_file,
        tgt_ind_file=tgt_ind_file,
        src_ind_file=src_ind_file,
        win_len=win_len,
        win_step=win_step,
        w=w,
        y=y,
        output_file=output_file,
        stat_type=stat_type,
        anc_allele_file=anc_allele_file,
        num_src=num_src,
    )

    # Header column for the statistic records the thresholds used,
    # e.g. "Q95(w<0.01,y=(=1.0))".
    y_desc = ",".join(f"{op}{val}" for op, val in y)
    header = (
        f"Chrom\tStart\tEnd\tRef\tTgt\tSrc\tN(Variants)\t"
        f"{stat_type}(w<{w},y=({y_desc}))\tCandidate\n"
    )

    # Create the output directory if needed, then truncate/write the header;
    # downstream processing appends result rows to this file.
    directory = os.path.dirname(output_file)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(output_file, "w") as f:
        f.write(header)

    items = []

    for params in generator.get():
        items.extend(preprocessor.run(**params))

    preprocessor.process_items(items)
118
+
119
+
120
def outlier(score_file: str, output: str, quantile: float) -> None:
    """
    Writes rows whose statistic exceeds the given quantile to `output`.

    The statistic column is taken to be the second-to-last column of the
    input (the last column is 'Candidate'). Rows are sorted naturally by
    'Chrom', 'Start', 'End' before writing.

    Parameters
    ----------
    score_file : str
        Path to the input file, in tab-separated format.
    output : str
        Path to the output file.
    quantile : float
        Quantile threshold used to filter rows.
    """
    scores = pd.read_csv(
        score_file,
        sep="\t",
        na_values=["nan"],
        dtype={"Candidate": str},
        index_col=False,
    )

    # Second-to-last column holds the statistic (e.g., "U50(...)" or "Q95(...)").
    stat_col = scores.columns[-2]
    scores[stat_col] = pd.to_numeric(scores[stat_col], errors="coerce")

    cutoff = scores[stat_col].quantile(quantile)

    if scores[stat_col].nunique() == 1:
        # A constant column makes quantile filtering meaningless: warn and
        # emit an empty result rather than returning arbitrary rows.
        warnings.warn(
            f"Column '{stat_col}' contains only one unique value ({cutoff}), making quantile filtering meaningless.",
            UserWarning,
        )
        selected = pd.DataFrame(columns=scores.columns)
    else:
        # For Q statistics saturated at 1, keep rows equal to the cutoff;
        # otherwise require strictly greater values.
        if cutoff == 1 and stat_col.startswith("Q"):
            selected = scores[scores[stat_col] >= cutoff]
        else:
            selected = scores[scores[stat_col] > cutoff]

    if selected.empty:
        ordered = selected
    else:
        ordered = natsorted_df(selected.reset_index(drop=True))

    # Stringify every column so the output formatting is uniform.
    ordered.astype(str).to_csv(output, index=False, sep="\t")
174
+
175
+
176
def plot(
    u_file: str,
    q_file: str,
    output: str,
    xlabel: str,
    ylabel: str,
    title: str,
    figsize_x: float = 6,
    figsize_y: float = 6,
    dpi: int = 300,
    alpha: float = 0.6,
    marker_size: float = 20,
    marker_color: str = "blue",
    marker_style: str = "o",
) -> None:
    """
    Reads two score/outlier files (U and Q), finds common candidate positions, and plots U vs. Q.

    Also writes a companion table of the overlapping windows (with their U
    and Q values and any shared candidate variants) next to the plot, as
    `<output without extension>.overlap.tsv`.

    Parameters
    ----------
    u_file : str
        Path to the input file containing U score/outlier data.
    q_file : str
        Path to the input file containing Q score/outlier data.
    output : str
        Path to save the output plot.
    xlabel : str
        Label for the X-axis.
    ylabel : str
        Label for the Y-axis.
    title : str
        Title of the plot.
    figsize_x : float, optional
        Width of the figure (default: 6).
    figsize_y : float, optional
        Height of the figure (default: 6).
    dpi : int, optional
        Resolution of the saved plot (default: 300).
    alpha : float, optional
        Transparency level of scatter points (default: 0.6).
    marker_size : float, optional
        Size of the scatter plot markers (default: 20).
    marker_color : str, optional
        Color of the markers (default: "blue").
    marker_style : str, optional
        Shape of the marker (default: "o").

    Raises
    ------
    ValueError
        If the two files share no genomic interval.
    """
    u_data = pd.read_csv(u_file, sep="\t")
    q_data = pd.read_csv(q_file, sep="\t")

    # The statistic column is second-to-last; the last column is "Candidate"
    # (matching the layout written by `score`/`outlier`).
    u_column = u_data.columns[-2]
    q_column = q_data.columns[-2]

    # Build "chrom:start-end" keys so windows can be matched across files.
    u_data["interval"] = (
        u_data["Chrom"].astype(str)
        + ":"
        + u_data["Start"].astype(str)
        + "-"
        + u_data["End"].astype(str)
    )
    q_data["interval"] = (
        q_data["Chrom"].astype(str)
        + ":"
        + q_data["Start"].astype(str)
        + "-"
        + q_data["End"].astype(str)
    )

    # Drop windows whose statistic is missing/non-numeric.
    u_data[u_column] = pd.to_numeric(u_data[u_column], errors="coerce")
    q_data[q_column] = pd.to_numeric(q_data[q_column], errors="coerce")
    u_data = u_data.dropna(subset=[u_column])
    q_data = q_data.dropna(subset=[q_column])

    # Interval -> statistic value; if an interval appears more than once,
    # the last row wins.
    u_interval_dict = {row["interval"]: row[u_column] for _, row in u_data.iterrows()}
    q_interval_dict = {row["interval"]: row[q_column] for _, row in q_data.iterrows()}
    # Interval -> set of candidate IDs (the "Candidate" column is assumed to
    # be comma-separated — TODO confirm against the writer).
    u_candidate_dict = {
        row["interval"]: set(str(row["Candidate"]).split(","))
        for _, row in u_data.iterrows()
    }
    q_candidate_dict = {
        row["interval"]: set(str(row["Candidate"]).split(","))
        for _, row in q_data.iterrows()
    }

    common_intervals = set(u_interval_dict.keys()) & set(q_interval_dict.keys())
    if not common_intervals:
        raise ValueError(
            "No common genomic intervals found between U and Q score/outlier files."
        )

    # Helper: candidate IDs present in both windows, joined by commas,
    # or "NA" if there is no overlap.
    def get_candidate_overlap(interval):
        u_set = u_candidate_dict.get(interval, set())
        q_set = q_candidate_dict.get(interval, set())
        overlap = sorted(u_set & q_set)
        return ",".join(overlap) if overlap else "NA"

    # One row per shared window. All list comprehensions iterate the same
    # set object within this call, so the columns stay aligned.
    overlap_df = pd.DataFrame(
        {
            "Chrom": [interval.split(":")[0] for interval in common_intervals],
            "Start": [
                int(interval.split(":")[1].split("-")[0])
                for interval in common_intervals
            ],
            "End": [
                int(interval.split(":")[1].split("-")[1])
                for interval in common_intervals
            ],
            u_column: [u_interval_dict[c] for c in common_intervals],
            q_column: [q_interval_dict[c] for c in common_intervals],
            "Overlapping Candidate": [
                get_candidate_overlap(c) for c in common_intervals
            ],
        }
    )

    # Save the naturally sorted overlap table alongside the plot.
    overlap_df_sorted = natsorted_df(overlap_df)
    overlap_output = os.path.splitext(output)[0] + ".overlap.tsv"
    pd.DataFrame(overlap_df_sorted).to_csv(overlap_output, sep="\t", index=False)

    # Scatter Q (x) against U (y); U counts are integers, hence the
    # integer-only y-axis locator.
    plt.figure(figsize=(figsize_x, figsize_y))
    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
    plt.scatter(
        x=overlap_df[q_column],
        y=overlap_df[u_column],
        alpha=alpha,
        s=marker_size,
        c=marker_color,
        marker=marker_style,
    )
    # Clamp both axes to start at zero or above (statistics are non-negative).
    xmin, xmax = plt.gca().get_xlim()
    ymin, ymax = plt.gca().get_ylim()
    plt.xlim(left=max(0, xmin))
    plt.ylim(bottom=max(0, ymin))
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(alpha=0.5, linestyle="--")
    plt.savefig(output, bbox_inches="tight", dpi=dpi)
    plt.close()
sai/stats/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html