sai-pg 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sai/__init__.py +2 -0
- sai/__main__.py +6 -3
- sai/configs/__init__.py +24 -0
- sai/configs/global_config.py +83 -0
- sai/configs/ploidy_config.py +94 -0
- sai/configs/pop_config.py +82 -0
- sai/configs/stat_config.py +220 -0
- sai/{utils/generators → generators}/chunk_generator.py +2 -8
- sai/{utils/generators → generators}/window_generator.py +82 -37
- sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
- sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
- sai/parsers/outlier_parser.py +4 -3
- sai/parsers/score_parser.py +8 -119
- sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
- sai/preprocessors/feature_preprocessor.py +236 -0
- sai/registries/__init__.py +22 -0
- sai/registries/generic_registry.py +89 -0
- sai/registries/stat_registry.py +30 -0
- sai/sai.py +124 -220
- sai/stats/__init__.py +11 -0
- sai/stats/danc_statistic.py +83 -0
- sai/stats/dd_statistic.py +77 -0
- sai/stats/df_statistic.py +84 -0
- sai/stats/dplus_statistic.py +86 -0
- sai/stats/fd_statistic.py +92 -0
- sai/stats/generic_statistic.py +93 -0
- sai/stats/q_statistic.py +104 -0
- sai/stats/stat_utils.py +259 -0
- sai/stats/u_statistic.py +99 -0
- sai/utils/utils.py +220 -143
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
- sai_pg-1.1.0.dist-info/RECORD +70 -0
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
- sai_pg-1.1.0.dist-info/top_level.txt +2 -0
- tests/configs/test_global_config.py +163 -0
- tests/configs/test_ploidy_config.py +93 -0
- tests/configs/test_pop_config.py +90 -0
- tests/configs/test_stat_config.py +171 -0
- tests/generators/test_chunk_generator.py +51 -0
- tests/generators/test_window_generator.py +164 -0
- tests/multiprocessing/test_mp_manager.py +92 -0
- tests/multiprocessing/test_mp_pool.py +79 -0
- tests/parsers/test_argument_validation.py +133 -0
- tests/parsers/test_outlier_parser.py +53 -0
- tests/parsers/test_score_parser.py +63 -0
- tests/preprocessors/test_chunk_preprocessor.py +79 -0
- tests/preprocessors/test_feature_preprocessor.py +223 -0
- tests/registries/test_registries.py +74 -0
- tests/stats/test_danc_statistic.py +51 -0
- tests/stats/test_dd_statistic.py +45 -0
- tests/stats/test_df_statistic.py +73 -0
- tests/stats/test_dplus_statistic.py +79 -0
- tests/stats/test_fd_statistic.py +68 -0
- tests/stats/test_q_statistic.py +268 -0
- tests/stats/test_stat_utils.py +354 -0
- tests/stats/test_u_statistic.py +233 -0
- tests/test___main__.py +51 -0
- tests/test_sai.py +102 -0
- tests/utils/test_utils.py +511 -0
- sai/parsers/plot_parser.py +0 -152
- sai/stats/features.py +0 -302
- sai/utils/preprocessors/feature_preprocessor.py +0 -211
- sai_pg-1.0.0.dist-info/RECORD +0 -30
- sai_pg-1.0.0.dist-info/top_level.txt +0 -1
- /sai/{utils/generators → generators}/__init__.py +0 -0
- /sai/{utils/generators → generators}/data_generator.py +0 -0
- /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
sai/stats/features.py
DELETED
@@ -1,302 +0,0 @@
|
|
1
|
-
# Copyright 2025 Xin Huang
|
2
|
-
#
|
3
|
-
# GNU General Public License v3.0
|
4
|
-
#
|
5
|
-
# This program is free software: you can redistribute it and/or modify
|
6
|
-
# it under the terms of the GNU General Public License as published by
|
7
|
-
# the Free Software Foundation, either version 3 of the License, or
|
8
|
-
# (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program. If not, please see
|
17
|
-
#
|
18
|
-
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
-
|
20
|
-
|
21
|
-
import numpy as np
|
22
|
-
|
23
|
-
|
24
|
-
def calc_freq(gts: np.ndarray, ploidy: int = 1) -> np.ndarray:
    """
    Compute the per-locus allele frequency of a genotype matrix.

    Parameters
    ----------
    gts : np.ndarray
        A 2D numpy array; rows are loci and columns are individuals.
    ploidy : int, optional
        Number of allele copies carried by each column. With ``ploidy=1``
        (phased data) the result is simply the row mean; for unphased data
        the row sum is divided by the total number of allele copies
        (``number of columns * ploidy``). Default is 1.

    Returns
    -------
    np.ndarray
        A 1D array holding one allele frequency per locus.
    """
    # Total number of allele copies sampled at every locus.
    n_alleles = gts.shape[1] * ploidy
    # Row-wise allele count.
    allele_counts = np.sum(gts, axis=1)
    return allele_counts / n_alleles
|
43
|
-
|
44
|
-
|
45
|
-
def compute_matching_loci(
    ref_gts: np.ndarray,
    tgt_gts: np.ndarray,
    src_gts_list: list[np.ndarray],
    w: float,
    y_list: list[tuple[str, float]],
    ploidy: int,
    anc_allele_available: bool,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Computes loci that meet specified allele frequency conditions across reference, target, and source genotypes.

    Parameters
    ----------
    ref_gts : np.ndarray
        A 2D numpy array where each row represents a locus and each column represents an individual in the reference group.
    tgt_gts : np.ndarray
        A 2D numpy array where each row represents a locus and each column represents an individual in the target group.
    src_gts_list : list of np.ndarray
        A list of 2D numpy arrays, one per source population, with the same row/column layout.
    w : float
        Threshold for the allele frequency in `ref_gts`. Only loci with frequencies less than `w` are kept.
        Must be within the range [0, 1].
    y_list : list of tuple[str, float]
        One (operator, threshold) condition per source population in `src_gts_list`, where
        - `operator` is one of '=', '<', '>', '<=', '>='
        - `threshold` is a float within [0, 1]
        The length must match `src_gts_list`.
    ploidy : int
        The ploidy level of the organism.
    anc_allele_available : bool
        If True, only matches against `y` are checked (assuming `1` represents the derived allele).
        If False, matches against both `y` and `1 - y` are checked; loci matching `1 - y` have their
        reference and target frequencies inverted.

    Returns
    -------
    tuple[np.ndarray, np.ndarray, np.ndarray]
        - Adjusted reference allele frequencies (`ref_freq`).
        - Adjusted target allele frequencies (`tgt_freq`).
        - Boolean array indicating loci that meet the specified frequency conditions (`condition`).

    Raises
    ------
    ValueError
        If `w` or any threshold in `y_list` is outside [0, 1], if an operator is not supported,
        or if `src_gts_list` and `y_list` differ in length.
    """
    # Validate input parameters
    if not (0 <= w <= 1):
        raise ValueError("Parameter w must be within the range [0, 1].")

    for op, y in y_list:
        if not (0 <= y <= 1):
            raise ValueError(
                f"Invalid value in y_list: {y}. Must be within the range [0, 1]."
            )
        if op not in ("=", "<", ">", "<=", ">="):
            raise ValueError(
                f"Invalid operator in y_list: {op}. Must be '=', '<', '>', '<=', or '>='."
            )

    if len(src_gts_list) != len(y_list):
        raise ValueError("The length of src_gts_list and y_list must match.")

    # Compute allele frequencies
    ref_freq = calc_freq(ref_gts, ploidy)
    tgt_freq = calc_freq(tgt_gts, ploidy)
    src_freq_list = [calc_freq(src_gts, ploidy) for src_gts in src_gts_list]

    # Comparison used for each operator. Equality is tested with a tight
    # absolute tolerance: `1 - y` is generally not exactly representable
    # (e.g. 1 - 0.9 == 0.09999999999999998), so an exact `==` would silently
    # miss loci whose frequency is mathematically equal to the threshold.
    op_funcs = {
        "=": lambda src_freq, y: np.isclose(src_freq, y, rtol=0.0, atol=1e-9),
        "<": lambda src_freq, y: src_freq < y,
        ">": lambda src_freq, y: src_freq > y,
        "<=": lambda src_freq, y: src_freq <= y,
        ">=": lambda src_freq, y: src_freq >= y,
    }

    # A locus matches only if every source population satisfies its own condition.
    match_conditions = [
        op_funcs[op](src_freq, y) for src_freq, (op, y) in zip(src_freq_list, y_list)
    ]
    all_match_y = np.all(match_conditions, axis=0)

    if not anc_allele_available:
        # Without ancestral-allele information the allele polarity is unknown,
        # so also accept loci where every source matches the mirrored threshold.
        # NOTE(review): the same operator is applied to `1 - y`; for the
        # inequality operators the mirrored condition would arguably need the
        # operator flipped as well -- confirm the intended semantics.
        match_conditions_1_minus_y = [
            op_funcs[op](src_freq, 1 - y)
            for src_freq, (op, y) in zip(src_freq_list, y_list)
        ]
        all_match_1_minus_y = np.all(match_conditions_1_minus_y, axis=0)
        all_match = all_match_y | all_match_1_minus_y

        # Loci where all sources match `1 - y` get their frequencies inverted
        inverted = all_match_1_minus_y
        ref_freq[inverted] = 1 - ref_freq[inverted]
        tgt_freq[inverted] = 1 - tgt_freq[inverted]
    else:
        all_match = all_match_y

    # Final condition: locus must satisfy source matching and have `ref_freq < w`
    condition = all_match & (ref_freq < w)

    return ref_freq, tgt_freq, condition
|
144
|
-
|
145
|
-
|
146
|
-
def calc_u(
    ref_gts: np.ndarray,
    tgt_gts: np.ndarray,
    src_gts_list: list[np.ndarray],
    pos: np.ndarray,
    w: float,
    x: float,
    y_list: list[tuple[str, float]],
    ploidy: int = 1,
    anc_allele_available: bool = False,
) -> tuple[int, np.ndarray]:
    """
    Counts the loci that meet the allele frequency conditions across the
    reference, target, and source genotypes, and reports their positions.

    Parameters
    ----------
    ref_gts : np.ndarray
        A 2D numpy array where each row represents a locus and each column represents an individual in the reference group.
    tgt_gts : np.ndarray
        A 2D numpy array where each row represents a locus and each column represents an individual in the target group.
    src_gts_list : list of np.ndarray
        A list of 2D numpy arrays, one per source population, with the same row/column layout.
    pos : np.ndarray
        A 1D numpy array where each element represents the genomic position of a locus.
    w : float
        Threshold for the allele frequency in `ref_gts`. Only loci with frequencies less than `w` are counted.
        Must be within the range [0, 1].
    x : float
        Threshold for the allele frequency in `tgt_gts`. Only loci with frequencies greater than `x` are counted.
        Must be within the range [0, 1].
    y_list : list of tuple[str, float]
        One (operator, threshold) condition per source population; forwarded unchanged to
        `compute_matching_loci`. Must have the same length as `src_gts_list`.
    ploidy : int, optional
        The ploidy level of the organism. Default is 1, which assumes phased data.
    anc_allele_available : bool
        If True, checks only for matches with `y` (assuming `1` represents the derived allele).
        If False, checks both matches with `y` and `1 - y`.

    Returns
    -------
    tuple[int, np.ndarray]
        - The count of loci that meet all specified frequency conditions.
        - A 1D numpy array containing the genomic positions of those loci.

    Raises
    ------
    ValueError
        If `x` is outside the range [0, 1], or if `compute_matching_loci` rejects `w` or `y_list`.
    """
    # Validate input parameters
    if not (0 <= x <= 1):
        raise ValueError("Parameter x must be within the range [0, 1].")

    # The (possibly inverted) reference frequencies are already folded into
    # `condition`, so only the target frequencies are needed here.
    _, tgt_freq, condition = compute_matching_loci(
        ref_gts,
        tgt_gts,
        src_gts_list,
        w,
        y_list,
        ploidy,
        anc_allele_available,
    )

    # Additionally require the target frequency to exceed `x`
    selected = condition & (tgt_freq > x)

    loci_indices = np.where(selected)[0]
    loci_positions = pos[loci_indices]
    count = loci_indices.size

    return count, loci_positions
|
221
|
-
|
222
|
-
|
223
|
-
def calc_q(
    ref_gts: np.ndarray,
    tgt_gts: np.ndarray,
    src_gts_list: list[np.ndarray],
    pos: np.ndarray,
    w: float,
    y_list: list[tuple[str, float]],
    quantile: float = 0.95,
    ploidy: int = 1,
    anc_allele_available: bool = False,
) -> tuple[float, np.ndarray]:
    """
    Calculates a specified quantile of the allele frequencies in `tgt_gts` over the loci that meet
    the reference and source frequency conditions, and reports the loci at or above that quantile.

    Parameters
    ----------
    ref_gts : np.ndarray
        A 2D numpy array where each row represents a locus and each column represents an individual in the reference group.
    tgt_gts : np.ndarray
        A 2D numpy array where each row represents a locus and each column represents an individual in the target group.
    src_gts_list : list of np.ndarray
        A list of 2D numpy arrays, one per source population, with the same row/column layout.
    pos : np.ndarray
        A 1D numpy array where each element represents the genomic position of a locus.
    w : float
        Frequency threshold for `ref_gts`. Only loci with frequencies lower than `w` are included.
        Must be within the range [0, 1].
    y_list : list of tuple[str, float]
        One (operator, threshold) condition per source population; forwarded unchanged to
        `compute_matching_loci`. Must have the same length as `src_gts_list`.
    quantile : float, optional
        The quantile to compute for the filtered `tgt_gts` frequencies. Must be within the range [0, 1].
        Default is 0.95 (95% quantile).
    ploidy : int, optional
        The ploidy level of the organism. Default is 1, which assumes phased data.
    anc_allele_available : bool
        If True, checks only for matches with `y` (assuming `1` represents the derived allele).
        If False, checks both matches with `y` and `1 - y`.

    Returns
    -------
    tuple[float, np.ndarray]
        - The requested quantile of the target frequencies over the loci meeting the conditions,
          or NaN if no loci meet the criteria.
        - A 1D numpy array with the positions of the qualifying loci whose target frequency is
          greater than or equal to that quantile (empty if no loci meet the criteria).

    Raises
    ------
    ValueError
        If `quantile` is outside the range [0, 1], or if `compute_matching_loci` rejects `w` or `y_list`.
    """
    # Validate input parameters
    if not (0 <= quantile <= 1):
        raise ValueError("Parameter quantile must be within the range [0, 1].")

    # Reference frequencies are already accounted for in `condition`.
    _, tgt_freq, condition = compute_matching_loci(
        ref_gts,
        tgt_gts,
        src_gts_list,
        w,
        y_list,
        ploidy,
        anc_allele_available,
    )

    # Keep only target frequencies / positions of loci passing the combined condition
    filtered_tgt_freq = tgt_freq[condition]
    filtered_positions = pos[condition]

    # Return NaN and no candidates if no loci meet the criteria
    if filtered_tgt_freq.size == 0:
        return np.nan, np.array([])

    threshold = np.nanquantile(filtered_tgt_freq, quantile)
    # Candidate loci are those at or above the quantile value
    loci_positions = filtered_positions[filtered_tgt_freq >= threshold]

    return threshold, loci_positions
|
sai/utils/preprocessors/feature_preprocessor.py
DELETED
@@ -1,211 +0,0 @@
|
|
1
|
-
# Copyright 2025 Xin Huang
|
2
|
-
#
|
3
|
-
# GNU General Public License v3.0
|
4
|
-
#
|
5
|
-
# This program is free software: you can redistribute it and/or modify
|
6
|
-
# it under the terms of the GNU General Public License as published by
|
7
|
-
# the Free Software Foundation, either version 3 of the License, or
|
8
|
-
# (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program. If not, please see
|
17
|
-
#
|
18
|
-
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
-
|
20
|
-
|
21
|
-
import numpy as np
|
22
|
-
from typing import Any
|
23
|
-
from sai.stats.features import calc_u, calc_q
|
24
|
-
from sai.utils.preprocessors import DataPreprocessor
|
25
|
-
|
26
|
-
|
27
|
-
class FeaturePreprocessor(DataPreprocessor):
    """
    A preprocessor subclass for generating feature vectors from genomic data.

    For each genomic window it computes either the U or the Q statistic
    (selected by `stat_type`) and appends the result as a tab-separated line
    to the output file.
    """

    def __init__(
        self,
        w: float,
        y: list[tuple[str, float]],
        output_file: str,
        stat_type: str,
        anc_allele_available: bool = False,
    ):
        """
        Initializes FeaturePreprocessor with specific frequency thresholds
        and the output file for storing generated feature vectors.

        Parameters
        ----------
        w : float
            Reference-population frequency threshold passed to `calc_u` and `calc_q`.
        y : list[tuple[str, float]]
            Source-population conditions passed as `y_list` to `calc_u` and `calc_q`
            (one (operator, threshold) entry per source population).
        output_file : str
            Path to the output file to which feature vectors are appended.
        stat_type : str
            Specifies the type of statistic to compute.
            - "UXX" (e.g., "U50", "U90") : Compute the U statistic using `calc_u()`
              with target-frequency threshold XX / 100.
            - "QXX" (e.g., "Q95", "Q50") : Compute the Q statistic using `calc_q()`
              with quantile XX / 100.
        anc_allele_available : bool, optional
            If True, ancestral allele information is available.
            If False, ancestral allele information is unavailable.
            Default is False.

        Raises
        ------
        ValueError
            If `stat_type` is not in a valid format. Must be either 'UXX' or 'QXX'.
        """
        self.w = w
        self.y = y
        self.output_file = output_file
        self.anc_allele_available = anc_allele_available
        if not (
            len(stat_type) == 3
            and stat_type[0] in {"U", "Q"}
            and stat_type[1:].isdigit()
        ):
            raise ValueError(
                f"Invalid stat_type format: {stat_type}. Expected format 'UXX' or 'QXX' (e.g., 'U50' or 'Q95')."
            )
        # Keep the raw value: `run` reports it in its fallback error message.
        self.stat_type = stat_type
        self.stat_prefix = stat_type[0]
        # Two-digit suffix interpreted as a percentage (e.g. "95" -> 0.95).
        self.threshold = int(stat_type[1:]) / 100

    def run(
        self,
        chr_name: str,
        ref_pop: str,
        tgt_pop: str,
        src_pop_list: list[str],
        start: int,
        end: int,
        pos: np.ndarray,
        ref_gts: np.ndarray,
        tgt_gts: np.ndarray,
        src_gts_list: list[np.ndarray],
        ploidy: int,
    ) -> list[dict[str, Any]]:
        """
        Generates feature vectors for a specified genomic window.

        Parameters
        ----------
        chr_name : str
            Chromosome name.
        ref_pop : str
            Reference population name.
        tgt_pop : str
            Target population name.
        src_pop_list : list[str]
            List of source population names.
        start : int
            Start position of the genomic window.
        end : int
            End position of the genomic window.
        pos : np.ndarray
            A 1D numpy array where each element represents the genomic position.
        ref_gts : np.ndarray
            Genotype data for the reference population.
        tgt_gts : np.ndarray
            Genotype data for the target population.
        src_gts_list : list[np.ndarray]
            List of genotype arrays for each source population.
        ploidy : int
            Ploidy of the genome.

        Returns
        -------
        list[dict[str, Any]]
            A single-element list holding the dictionary of results for the window
            (window metadata plus the "statistic" and "candidates" entries).

        Raises
        ------
        ValueError
            If the stored statistic prefix is neither "U" nor "Q".
        """
        items = {
            "chr_name": chr_name,
            "start": start,
            "end": end,
            "ref_pop": ref_pop,
            "tgt_pop": tgt_pop,
            "src_pop_list": src_pop_list,
            "nsnps": len(pos),
        }

        if (
            (ref_gts is None)
            or (tgt_gts is None)
            or (src_gts_list is None)
            or (ploidy is None)
        ):
            # No genotype data for this window: report a missing statistic.
            items["statistic"] = np.nan
            items["candidates"] = np.array([])
        elif self.stat_prefix == "U":
            items["statistic"], items["candidates"] = calc_u(
                ref_gts=ref_gts,
                tgt_gts=tgt_gts,
                src_gts_list=src_gts_list,
                pos=pos,
                w=self.w,
                x=self.threshold,
                y_list=self.y,
                ploidy=ploidy,
                anc_allele_available=self.anc_allele_available,
            )
        elif self.stat_prefix == "Q":
            items["statistic"], items["candidates"] = calc_q(
                ref_gts=ref_gts,
                tgt_gts=tgt_gts,
                src_gts_list=src_gts_list,
                pos=pos,
                w=self.w,
                y_list=self.y,
                quantile=self.threshold,
                ploidy=ploidy,
                anc_allele_available=self.anc_allele_available,
            )
        else:
            # Only reachable if `stat_prefix` was changed after construction.
            raise ValueError(
                f"Invalid stat_type: {self.stat_type}. Must be 'UXX' or 'QXX' (e.g., 'U50' or 'Q95')."
            )

        return [items]

    def process_items(self, items: list[dict[str, Any]]) -> None:
        """
        Formats the given window results and appends them to the output file.

        Parameters
        ----------
        items : list[dict[str, Any]]
            Result dictionaries as produced by `run`, one per genomic window.
            Each is written as one tab-separated line; an empty candidate
            array is written as "NA".
        """
        with open(
            self.output_file, "a"
        ) as f:  # Open in append mode for continuous writing
            lines = []
            for item in items:
                src_pop_str = ",".join(item["src_pop_list"])
                candidates = (
                    "NA"
                    if item["candidates"].size == 0
                    else ",".join(
                        f"{item['chr_name']}:{pos}" for pos in item["candidates"]
                    )
                )

                line = (
                    f"{item['chr_name']}\t{item['start']}\t{item['end']}\t"
                    f"{item['ref_pop']}\t{item['tgt_pop']}\t{src_pop_str}\t"
                    f"{item['nsnps']}\t{item['statistic']}\t{candidates}\n"
                )
                lines.append(line)

            f.writelines(lines)
|
sai_pg-1.0.0.dist-info/RECORD
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
sai/__init__.py,sha256=ZLPiBk86c9R8ZFx9y5VF7Up2v4JS8WsEMfsVNlj7nXY,724
|
2
|
-
sai/__main__.py,sha256=WJMDikIfZh-g9j9pl0o6sZqVb4llQpUhBcJbn4-9pvE,2058
|
3
|
-
sai/sai.py,sha256=ffPTVxKMoLFX-zr9lsiU-p5KDict9glSTMiNcgWVtsw,9969
|
4
|
-
sai/parsers/__init__.py,sha256=ZLPiBk86c9R8ZFx9y5VF7Up2v4JS8WsEMfsVNlj7nXY,724
|
5
|
-
sai/parsers/argument_validation.py,sha256=OzkE9Ayr3KCB8rSP8mXnfx4aNOeJuEEFfit9q7RbURU,4374
|
6
|
-
sai/parsers/outlier_parser.py,sha256=Qw0E_1zFxfYoynnYJ1scbQHEpPVaHwTCPtPe3n643I4,2354
|
7
|
-
sai/parsers/plot_parser.py,sha256=C4uGH96th8Yj7KN7onkE6xTqYa8K7EiU-21xgZYDNZE,4556
|
8
|
-
sai/parsers/score_parser.py,sha256=qZ6mLBHd2QwAb2V0QaOgzBkgpb6mZv_rdDckwzN0e38,8604
|
9
|
-
sai/stats/__init__.py,sha256=ZLPiBk86c9R8ZFx9y5VF7Up2v4JS8WsEMfsVNlj7nXY,724
|
10
|
-
sai/stats/features.py,sha256=7CI9i3csVH9sXryWGB0uLSJB6BbM9gJtA1IPN2EeT0k,11844
|
11
|
-
sai/utils/__init__.py,sha256=B3ZcC1ALSWieGHPiqXKBFQRTrnlTX4TaHc3tCx9fj0w,782
|
12
|
-
sai/utils/genomic_dataclasses.py,sha256=HBYp2dehdW_y3Pd6Un8XFMnN1Odg1EiZb9ci1syIibU,1443
|
13
|
-
sai/utils/utils.py,sha256=Nnb1tPHWk9vmlRmUJ65_FEmLfDEsT41HZ9WvJ0ps7Vs,23127
|
14
|
-
sai/utils/generators/__init__.py,sha256=hpE4PUQIOZQXzdpSx7dEllecDoOfxIWXNu1-WHa_VcM,858
|
15
|
-
sai/utils/generators/chunk_generator.py,sha256=Jh0wDOR6Z5PXlzOTncmY-pr9PotnmfiWTZQL2aaVGAs,4565
|
16
|
-
sai/utils/generators/data_generator.py,sha256=bVz9KPjJSL4becTsVZ-zH4i40y5UDA0JJOtzbwf5n84,1635
|
17
|
-
sai/utils/generators/window_generator.py,sha256=G8aMmJbB8uWLNaoJAGSeBFTCvAib7DdWvbXDk9JvDdM,9066
|
18
|
-
sai/utils/multiprocessing/__init__.py,sha256=mI_iVjOJschKjIcneo99utCMVjR0K3UpEjA_PMcmLm4,790
|
19
|
-
sai/utils/multiprocessing/mp_manager.py,sha256=xbgje4Fi4U9WD7eqMlUkTLKLQ9aNV_hIG7nTYvniGFk,10263
|
20
|
-
sai/utils/multiprocessing/mp_pool.py,sha256=wqgp7E7fU2MXKsv7d8kpIp40SLjqS8voxbB_FkVwczA,2273
|
21
|
-
sai/utils/preprocessors/__init__.py,sha256=q25uXnq4jTrIVBbl1HFv3ZgEV0d79fCMp4tTZrq4vsM,878
|
22
|
-
sai/utils/preprocessors/chunk_preprocessor.py,sha256=mNmc22PmIRZZjYoyktxKSRGSGzN_e1Cyi392VxTbtWc,5204
|
23
|
-
sai/utils/preprocessors/data_preprocessor.py,sha256=XSQN_kXUfyCtIpqIf7TcqWIPcSk8kynHPH-KY9gh5VY,3595
|
24
|
-
sai/utils/preprocessors/feature_preprocessor.py,sha256=Kmc7VsO9YJcbFwWk-Jha5Rzq7gZou1pwYh93Wpd9r5o,7143
|
25
|
-
sai_pg-1.0.0.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
26
|
-
sai_pg-1.0.0.dist-info/METADATA,sha256=e16SXHsIZBXK5EtJDbDhyPJdEXcKes88sCNPls_s3y4,1789
|
27
|
-
sai_pg-1.0.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
28
|
-
sai_pg-1.0.0.dist-info/entry_points.txt,sha256=uK34frE2UhEDNa5ISbGOtjF2HGAXF5uG_EgMocesEPs,42
|
29
|
-
sai_pg-1.0.0.dist-info/top_level.txt,sha256=Kvf-0z0sZYQbLuM6ta183EOzD37k3VZbRH-h7HXntR0,4
|
30
|
-
sai_pg-1.0.0.dist-info/RECORD,,
|
sai_pg-1.0.0.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
sai
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|