sai-pg 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sai/__init__.py +2 -0
- sai/__main__.py +6 -3
- sai/configs/__init__.py +24 -0
- sai/configs/global_config.py +83 -0
- sai/configs/ploidy_config.py +94 -0
- sai/configs/pop_config.py +82 -0
- sai/configs/stat_config.py +220 -0
- sai/{utils/generators → generators}/chunk_generator.py +1 -1
- sai/{utils/generators → generators}/window_generator.py +81 -37
- sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
- sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
- sai/parsers/outlier_parser.py +4 -3
- sai/parsers/score_parser.py +8 -119
- sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
- sai/preprocessors/feature_preprocessor.py +236 -0
- sai/registries/__init__.py +22 -0
- sai/registries/generic_registry.py +89 -0
- sai/registries/stat_registry.py +30 -0
- sai/sai.py +124 -220
- sai/stats/__init__.py +11 -0
- sai/stats/danc_statistic.py +83 -0
- sai/stats/dd_statistic.py +77 -0
- sai/stats/df_statistic.py +84 -0
- sai/stats/dplus_statistic.py +86 -0
- sai/stats/fd_statistic.py +92 -0
- sai/stats/generic_statistic.py +93 -0
- sai/stats/q_statistic.py +104 -0
- sai/stats/stat_utils.py +259 -0
- sai/stats/u_statistic.py +99 -0
- sai/utils/utils.py +213 -142
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
- sai_pg-1.1.0.dist-info/RECORD +70 -0
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
- sai_pg-1.1.0.dist-info/top_level.txt +2 -0
- tests/configs/test_global_config.py +163 -0
- tests/configs/test_ploidy_config.py +93 -0
- tests/configs/test_pop_config.py +90 -0
- tests/configs/test_stat_config.py +171 -0
- tests/generators/test_chunk_generator.py +51 -0
- tests/generators/test_window_generator.py +164 -0
- tests/multiprocessing/test_mp_manager.py +92 -0
- tests/multiprocessing/test_mp_pool.py +79 -0
- tests/parsers/test_argument_validation.py +133 -0
- tests/parsers/test_outlier_parser.py +53 -0
- tests/parsers/test_score_parser.py +63 -0
- tests/preprocessors/test_chunk_preprocessor.py +79 -0
- tests/preprocessors/test_feature_preprocessor.py +223 -0
- tests/registries/test_registries.py +74 -0
- tests/stats/test_danc_statistic.py +51 -0
- tests/stats/test_dd_statistic.py +45 -0
- tests/stats/test_df_statistic.py +73 -0
- tests/stats/test_dplus_statistic.py +79 -0
- tests/stats/test_fd_statistic.py +68 -0
- tests/stats/test_q_statistic.py +268 -0
- tests/stats/test_stat_utils.py +354 -0
- tests/stats/test_u_statistic.py +233 -0
- tests/test___main__.py +51 -0
- tests/test_sai.py +102 -0
- tests/utils/test_utils.py +511 -0
- sai/parsers/plot_parser.py +0 -152
- sai/stats/features.py +0 -302
- sai/utils/preprocessors/feature_preprocessor.py +0 -211
- sai_pg-1.0.1.dist-info/RECORD +0 -30
- sai_pg-1.0.1.dist-info/top_level.txt +0 -1
- /sai/{utils/generators → generators}/__init__.py +0 -0
- /sai/{utils/generators → generators}/data_generator.py +0 -0
- /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -18,10 +18,12 @@
|
|
18
18
|
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
19
|
|
20
20
|
|
21
|
+
import numpy as np
|
21
22
|
from itertools import combinations, product
|
22
23
|
from typing import Iterator, Any
|
23
24
|
from sai.utils import read_data, split_genome
|
24
|
-
from sai.
|
25
|
+
from sai.generators import DataGenerator
|
26
|
+
from sai.configs import PloidyConfig
|
25
27
|
|
26
28
|
|
27
29
|
class WindowGenerator(DataGenerator):
|
@@ -37,8 +39,10 @@ class WindowGenerator(DataGenerator):
|
|
37
39
|
ref_ind_file: str,
|
38
40
|
tgt_ind_file: str,
|
39
41
|
src_ind_file: str,
|
42
|
+
out_ind_file: str,
|
40
43
|
win_len: int,
|
41
44
|
win_step: int,
|
45
|
+
ploidy_config: PloidyConfig,
|
42
46
|
start: int = None,
|
43
47
|
end: int = None,
|
44
48
|
anc_allele_file: str = None,
|
@@ -59,10 +63,14 @@ class WindowGenerator(DataGenerator):
|
|
59
63
|
The path to the file containing identifiers for target populations.
|
60
64
|
src_ind_file : str
|
61
65
|
The path to the file containing identifiers for source populations.
|
66
|
+
out_ind_file : str
|
67
|
+
The path to the file containing identifiers for outgroup populations.
|
62
68
|
win_len : int
|
63
69
|
The length of each window in base pairs.
|
64
70
|
win_step : int
|
65
71
|
The step size between windows in base pairs.
|
72
|
+
ploidy_config : PloidyConfig
|
73
|
+
Configuration specifying ploidy levels for each population involved in the analysis.
|
66
74
|
start: int, optional
|
67
75
|
The starting position (1-based, inclusive) on the chromosome. Default: None.
|
68
76
|
end: int, optional
|
@@ -88,17 +96,10 @@ class WindowGenerator(DataGenerator):
|
|
88
96
|
self.win_step = win_step
|
89
97
|
self.num_src = num_src
|
90
98
|
self.chr_name = chr_name
|
99
|
+
self.ploidy_config = ploidy_config
|
91
100
|
|
92
101
|
# Load data
|
93
|
-
(
|
94
|
-
self.ref_data,
|
95
|
-
self.ref_samples,
|
96
|
-
self.tgt_data,
|
97
|
-
self.tgt_samples,
|
98
|
-
self.src_data,
|
99
|
-
self.src_samples,
|
100
|
-
self.ploidy,
|
101
|
-
) = read_data(
|
102
|
+
results = read_data(
|
102
103
|
vcf_file=vcf_file,
|
103
104
|
chr_name=self.chr_name,
|
104
105
|
start=start,
|
@@ -106,13 +107,25 @@ class WindowGenerator(DataGenerator):
|
|
106
107
|
ref_ind_file=ref_ind_file,
|
107
108
|
tgt_ind_file=tgt_ind_file,
|
108
109
|
src_ind_file=src_ind_file,
|
110
|
+
out_ind_file=out_ind_file,
|
111
|
+
ploidy_config=ploidy_config,
|
109
112
|
anc_allele_file=anc_allele_file,
|
110
113
|
is_phased=False,
|
111
114
|
filter_ref=False,
|
112
115
|
filter_tgt=False,
|
113
116
|
filter_src=False,
|
117
|
+
filter_missing=True,
|
114
118
|
)
|
115
119
|
|
120
|
+
self.ref_data = results["ref"][0]
|
121
|
+
self.tgt_data = results["tgt"][0]
|
122
|
+
self.src_data = results["src"][0]
|
123
|
+
self.out_data = results["outgroup"][0]
|
124
|
+
self.ref_samples = results["ref"][1]
|
125
|
+
self.tgt_samples = results["tgt"][1]
|
126
|
+
self.src_samples = results["src"][1]
|
127
|
+
self.out_samples = results["outgroup"][1]
|
128
|
+
|
116
129
|
self.src_combinations = list(
|
117
130
|
combinations(self.src_samples.keys(), self.num_src)
|
118
131
|
)
|
@@ -149,39 +162,70 @@ class WindowGenerator(DataGenerator):
|
|
149
162
|
for ref_pop, tgt_pop, src_comb in product(
|
150
163
|
self.ref_samples, self.tgt_samples, self.src_combinations
|
151
164
|
):
|
152
|
-
tgt_pos = self.tgt_data[tgt_pop].POS
|
153
165
|
for start, end in self.tgt_windows[tgt_pop]:
|
154
|
-
|
155
|
-
|
156
|
-
|
166
|
+
ref_data = self.ref_data[ref_pop]
|
167
|
+
tgt_data = self.tgt_data[tgt_pop]
|
168
|
+
src_data_list = [self.src_data[src_pop] for src_pop in src_comb]
|
169
|
+
|
170
|
+
ref_mask = (ref_data.POS >= start) & (ref_data.POS <= end)
|
171
|
+
tgt_mask = (tgt_data.POS >= start) & (tgt_data.POS <= end)
|
172
|
+
src_masks = [
|
173
|
+
(src_data.POS >= start) & (src_data.POS <= end)
|
174
|
+
for src_data in src_data_list
|
157
175
|
]
|
158
|
-
|
159
|
-
|
160
|
-
|
176
|
+
|
177
|
+
ref_pos = ref_data.POS[ref_mask]
|
178
|
+
tgt_pos = tgt_data.POS[tgt_mask]
|
179
|
+
src_pos_list = [
|
180
|
+
src_data.POS[mask]
|
181
|
+
for src_data, mask in zip(src_data_list, src_masks)
|
161
182
|
]
|
183
|
+
|
184
|
+
common_pos = np.intersect1d(ref_pos, tgt_pos)
|
185
|
+
for src_pos in src_pos_list:
|
186
|
+
common_pos = np.intersect1d(common_pos, src_pos)
|
187
|
+
|
188
|
+
ref_gts = ref_data.GT.compress(
|
189
|
+
np.isin(ref_data.POS, common_pos), axis=0
|
190
|
+
)
|
191
|
+
tgt_gts = tgt_data.GT.compress(
|
192
|
+
np.isin(tgt_data.POS, common_pos), axis=0
|
193
|
+
)
|
162
194
|
src_gts_list = [
|
163
|
-
|
164
|
-
|
165
|
-
& (self.src_data[src_pop].POS <= end)
|
166
|
-
]
|
167
|
-
for src_pop in src_comb
|
195
|
+
src_data.GT.compress(np.isin(src_data.POS, common_pos), axis=0)
|
196
|
+
for src_data in src_data_list
|
168
197
|
]
|
169
198
|
|
170
|
-
sub_pos =
|
199
|
+
sub_pos = common_pos
|
171
200
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
201
|
+
if len(sub_pos) == 0:
|
202
|
+
yield {
|
203
|
+
"chr_name": self.chr_name,
|
204
|
+
"ref_pop": ref_pop,
|
205
|
+
"tgt_pop": tgt_pop,
|
206
|
+
"src_pop_list": src_comb,
|
207
|
+
"start": start,
|
208
|
+
"end": end,
|
209
|
+
"pos": [],
|
210
|
+
"ref_gts": None,
|
211
|
+
"tgt_gts": None,
|
212
|
+
"src_gts_list": None,
|
213
|
+
"ploidy_config": self.ploidy_config,
|
214
|
+
}
|
215
|
+
else:
|
216
|
+
yield {
|
217
|
+
"chr_name": self.chr_name,
|
218
|
+
"ref_pop": ref_pop,
|
219
|
+
"tgt_pop": tgt_pop,
|
220
|
+
"src_pop_list": src_comb, # List of source populations in this combination
|
221
|
+
"start": start,
|
222
|
+
"end": end,
|
223
|
+
"pos": sub_pos,
|
224
|
+
"ref_gts": ref_gts,
|
225
|
+
"tgt_gts": tgt_gts,
|
226
|
+
"src_gts_list": src_gts_list, # List of genotypes for each source population in src_comb
|
227
|
+
"ploidy_config": self.ploidy_config,
|
228
|
+
}
|
185
229
|
|
186
230
|
def _none_window_generator(self) -> Iterator[dict[str, Any]]:
|
187
231
|
"""
|
@@ -218,7 +262,7 @@ class WindowGenerator(DataGenerator):
|
|
218
262
|
"ref_gts": None,
|
219
263
|
"tgt_gts": None,
|
220
264
|
"src_gts_list": None,
|
221
|
-
"
|
265
|
+
"ploidy_config": self.ploidy_config,
|
222
266
|
}
|
223
267
|
|
224
268
|
def get(self) -> Iterator[dict[str, Any]]:
|
@@ -25,8 +25,8 @@ from multiprocessing import current_process
|
|
25
25
|
from multiprocessing import Manager
|
26
26
|
from multiprocessing import Process
|
27
27
|
from threading import Thread
|
28
|
-
from sai.
|
29
|
-
from sai.
|
28
|
+
from sai.generators import DataGenerator
|
29
|
+
from sai.preprocessors import DataPreprocessor
|
30
30
|
|
31
31
|
|
32
32
|
def monitor(shared_dict: dict, workers: list[multiprocessing.Process]) -> None:
|
@@ -20,8 +20,8 @@
|
|
20
20
|
|
21
21
|
from multiprocessing import Pool
|
22
22
|
from typing import Any
|
23
|
-
from sai.
|
24
|
-
from sai.
|
23
|
+
from sai.generators import DataGenerator
|
24
|
+
from sai.preprocessors import DataPreprocessor
|
25
25
|
|
26
26
|
|
27
27
|
def mp_worker(params: tuple[DataPreprocessor, dict]) -> Any:
|
sai/parsers/outlier_parser.py
CHANGED
@@ -37,7 +37,7 @@ def _run_outlier(args: argparse.Namespace) -> None:
|
|
37
37
|
# Call the outlier function with parsed arguments
|
38
38
|
outlier(
|
39
39
|
score_file=args.score,
|
40
|
-
|
40
|
+
output_prefix=args.output_prefix,
|
41
41
|
quantile=args.quantile,
|
42
42
|
)
|
43
43
|
|
@@ -62,10 +62,11 @@ def add_outlier_parser(subparsers: argparse.ArgumentParser) -> None:
|
|
62
62
|
help="Path to the input score file.",
|
63
63
|
)
|
64
64
|
parser.add_argument(
|
65
|
-
"--output",
|
65
|
+
"--output-prefix",
|
66
|
+
dest="output_prefix",
|
66
67
|
type=str,
|
67
68
|
required=True,
|
68
|
-
help="
|
69
|
+
help="Prefix of the output files.",
|
69
70
|
)
|
70
71
|
parser.add_argument(
|
71
72
|
"--quantile",
|
sai/parsers/score_parser.py
CHANGED
@@ -19,13 +19,9 @@
|
|
19
19
|
|
20
20
|
|
21
21
|
import argparse
|
22
|
-
import re
|
23
22
|
from sai.parsers.argument_validation import positive_int
|
24
23
|
from sai.parsers.argument_validation import existed_file
|
25
|
-
from sai.parsers.argument_validation import between_zero_and_one
|
26
|
-
from sai.parsers.argument_validation import validate_stat_type
|
27
24
|
from sai.sai import score
|
28
|
-
from sai.utils.utils import parse_ind_file
|
29
25
|
|
30
26
|
|
31
27
|
def _run_score(args: argparse.Namespace) -> None:
|
@@ -42,108 +38,36 @@ def _run_score(args: argparse.Namespace) -> None:
|
|
42
38
|
Path to the VCF file containing variant data.
|
43
39
|
- chr_name : str
|
44
40
|
Name of the chromosome to be analyzed.
|
45
|
-
- ref : str
|
46
|
-
Path to the reference group individual file.
|
47
|
-
- tgt : str
|
48
|
-
Path to the target group individual file.
|
49
|
-
- src : str
|
50
|
-
Path to the source population individual file.
|
51
41
|
- win_len : int
|
52
42
|
Length of each analysis window.
|
53
43
|
- win_step : int
|
54
44
|
Step size for moving the window along the sequence.
|
55
|
-
- num_src : int
|
56
|
-
Number of source populations. The length of `args.y` should match `num_src`.
|
57
45
|
- anc_alleles : str
|
58
46
|
Path to the ancestral allele file.
|
59
|
-
- w : float
|
60
|
-
Allele frequency threshold for the reference group.
|
61
|
-
- y : list of float
|
62
|
-
List of allele frequency thresholds for each source population. Its length must match `num_src`.
|
63
47
|
- output : str
|
64
48
|
Path to the output file for storing results.
|
65
|
-
-
|
66
|
-
|
49
|
+
- stat_config: str
|
50
|
+
Path to the YAML configuration file specifying the statistics, ploidy levels, and populations to compute.
|
67
51
|
|
68
52
|
Raises
|
69
53
|
------
|
70
54
|
ValueError
|
71
|
-
If
|
55
|
+
If fewer than three ploidy values are provided,
|
56
|
+
or if the number of ploidy values for source populations does not match `num_src`.
|
72
57
|
or if other input parameters do not meet expected conditions.
|
73
58
|
"""
|
74
|
-
src_samples = parse_ind_file(args.src)
|
75
|
-
num_src = len(src_samples.keys())
|
76
|
-
if len(args.y) != num_src:
|
77
|
-
raise ValueError(
|
78
|
-
f"The length of y ({len(args.y)}) does not match the number of source populations ({num_src}) found in {args.src}."
|
79
|
-
)
|
80
|
-
|
81
59
|
score(
|
82
60
|
vcf_file=args.vcf,
|
83
61
|
chr_name=args.chr_name,
|
84
|
-
ref_ind_file=args.ref,
|
85
|
-
tgt_ind_file=args.tgt,
|
86
|
-
src_ind_file=args.src,
|
87
62
|
win_len=args.win_len,
|
88
63
|
win_step=args.win_step,
|
89
|
-
num_src=num_src,
|
90
64
|
anc_allele_file=args.anc_alleles,
|
91
|
-
w=args.w,
|
92
|
-
y=args.y,
|
93
65
|
output_file=args.output,
|
94
|
-
|
66
|
+
config=args.config,
|
95
67
|
num_workers=1,
|
96
68
|
)
|
97
69
|
|
98
70
|
|
99
|
-
def _parse_y_thresholds(value: str) -> tuple[str, float]:
|
100
|
-
"""
|
101
|
-
Parses the --y parameter value to extract an operator and a numerical threshold.
|
102
|
-
|
103
|
-
This function ensures that the input is correctly formatted as one of the following:
|
104
|
-
- `=X` (equality condition)
|
105
|
-
- `>X` (greater than condition)
|
106
|
-
- `<X` (less than condition)
|
107
|
-
- `>=X` (greater than or equal to condition)
|
108
|
-
- `<=X` (less than or equal to condition)
|
109
|
-
|
110
|
-
The numerical value `X` must be within the range [0, 1].
|
111
|
-
|
112
|
-
Parameters
|
113
|
-
----------
|
114
|
-
value : str
|
115
|
-
A string representing the allele frequency threshold condition, e.g., "=0.7", ">0.8", "<=0.2".
|
116
|
-
|
117
|
-
Returns
|
118
|
-
-------
|
119
|
-
tuple[str, float]
|
120
|
-
A tuple containing:
|
121
|
-
- A string representing the comparison operator (`=`, `<`, `>`, `<=`, `>=`).
|
122
|
-
- A float representing the threshold value.
|
123
|
-
|
124
|
-
Raises
|
125
|
-
------
|
126
|
-
argparse.ArgumentTypeError
|
127
|
-
If the input format is invalid or the numerical threshold is outside the range [0, 1].
|
128
|
-
"""
|
129
|
-
match = re.match(r"^(=|<|>|<=|>=)(\d*\.?\d+)$", value)
|
130
|
-
if not match:
|
131
|
-
raise argparse.ArgumentTypeError(
|
132
|
-
f"Invalid format for --y: {value}. Must be in the form =X, >X, <X, >=X, or <=X "
|
133
|
-
f"(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2)."
|
134
|
-
)
|
135
|
-
|
136
|
-
operator, num_str = match.groups()
|
137
|
-
num = float(num_str)
|
138
|
-
|
139
|
-
if not (0 <= num <= 1):
|
140
|
-
raise argparse.ArgumentTypeError(
|
141
|
-
f"Value for --y must be between 0 and 1, got {num}."
|
142
|
-
)
|
143
|
-
|
144
|
-
return operator, num
|
145
|
-
|
146
|
-
|
147
71
|
def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
|
148
72
|
"""
|
149
73
|
Initializes and configures the command-line interface parser
|
@@ -170,24 +94,6 @@ def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
|
|
170
94
|
required=True,
|
171
95
|
help="Chromosome name to analyze from the VCF file.",
|
172
96
|
)
|
173
|
-
parser.add_argument(
|
174
|
-
"--ref",
|
175
|
-
type=existed_file,
|
176
|
-
required=True,
|
177
|
-
help="Path to the file with reference population identifiers.",
|
178
|
-
)
|
179
|
-
parser.add_argument(
|
180
|
-
"--tgt",
|
181
|
-
type=existed_file,
|
182
|
-
required=True,
|
183
|
-
help="Path to the file with target population identifiers.",
|
184
|
-
)
|
185
|
-
parser.add_argument(
|
186
|
-
"--src",
|
187
|
-
type=existed_file,
|
188
|
-
required=True,
|
189
|
-
help="Path to the file with source population identifiers.",
|
190
|
-
)
|
191
97
|
parser.add_argument(
|
192
98
|
"--win-len",
|
193
99
|
dest="win_len",
|
@@ -209,23 +115,6 @@ def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
|
|
209
115
|
default=None,
|
210
116
|
help="Path to the BED file with ancestral allele information. If ancestral allele information is not provided, filtering will be performed for each variant based on whether the allele frequency of any allele (assuming biallelic) meets the specified condition during the calculation of the statistics. Default: None.",
|
211
117
|
)
|
212
|
-
parser.add_argument(
|
213
|
-
"--w",
|
214
|
-
type=between_zero_and_one,
|
215
|
-
default=0.01,
|
216
|
-
help="Frequency threshold for variants in the reference population; only variants with frequencies below this threshold are included in the analysis. Default: 0.01.",
|
217
|
-
)
|
218
|
-
parser.add_argument(
|
219
|
-
"--y",
|
220
|
-
type=_parse_y_thresholds,
|
221
|
-
nargs="+",
|
222
|
-
default=[("=", 1.0)],
|
223
|
-
help="List of allele frequency conditions for the source populations. "
|
224
|
-
"Each value must be in the form =X, >X, <X, >=X, or <=X "
|
225
|
-
"(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2). "
|
226
|
-
"The number of values must match the number of source populations in the file specified by `--src`; "
|
227
|
-
"the order of the allele frequency conditions should also correspond to the order of source populations in that file. Default: =1",
|
228
|
-
)
|
229
118
|
parser.add_argument(
|
230
119
|
"--output",
|
231
120
|
type=str,
|
@@ -233,9 +122,9 @@ def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
|
|
233
122
|
help="Output file path for saving results.",
|
234
123
|
)
|
235
124
|
parser.add_argument(
|
236
|
-
"--
|
237
|
-
type=
|
125
|
+
"--config",
|
126
|
+
type=existed_file,
|
238
127
|
required=True,
|
239
|
-
help="
|
128
|
+
help="Path to the YAML configuration file specifying the statistics to compute, ploidy settings, and population group file paths.",
|
240
129
|
)
|
241
130
|
parser.set_defaults(runner=_run_score)
|
@@ -19,8 +19,10 @@
|
|
19
19
|
|
20
20
|
|
21
21
|
from typing import Any
|
22
|
-
from sai.utils
|
23
|
-
from sai.
|
22
|
+
from sai.utils import parse_ind_file
|
23
|
+
from sai.generators import WindowGenerator
|
24
|
+
from sai.preprocessors import DataPreprocessor
|
25
|
+
from sai.configs import PloidyConfig, StatConfig
|
24
26
|
from .feature_preprocessor import FeaturePreprocessor
|
25
27
|
|
26
28
|
|
@@ -39,12 +41,12 @@ class ChunkPreprocessor(DataPreprocessor):
|
|
39
41
|
ref_ind_file: str,
|
40
42
|
tgt_ind_file: str,
|
41
43
|
src_ind_file: str,
|
44
|
+
out_ind_file: str,
|
42
45
|
win_len: int,
|
43
46
|
win_step: int,
|
44
|
-
w: float,
|
45
|
-
y: list[float],
|
46
47
|
output_file: str,
|
47
|
-
|
48
|
+
ploidy_config: PloidyConfig,
|
49
|
+
stat_config: StatConfig,
|
48
50
|
anc_allele_file: str = None,
|
49
51
|
num_src: int = 1,
|
50
52
|
):
|
@@ -61,18 +63,18 @@ class ChunkPreprocessor(DataPreprocessor):
|
|
61
63
|
Path to the file containing target individual IDs.
|
62
64
|
src_ind_file : str
|
63
65
|
Path to the file containing source individual IDs.
|
66
|
+
out_ind_file : str
|
67
|
+
Path to the file containing outgroup individual IDs.
|
64
68
|
win_len : int
|
65
69
|
Window length for generating genomic windows.
|
66
70
|
win_step : int
|
67
71
|
Step size for sliding windows across the genome.
|
68
|
-
w : float
|
69
|
-
Parameter w for feature vector computation.
|
70
|
-
y : list of float
|
71
|
-
List of y parameters for feature vector computation.
|
72
72
|
output_file : str
|
73
73
|
Path to the output file for storing feature vectors.
|
74
|
-
|
75
|
-
|
74
|
+
ploidy_config : PloidyConfig
|
75
|
+
Configuration specifying ploidy levels for each population involved in the analysis.
|
76
|
+
stat_config : StatConfig
|
77
|
+
Configuration of statistics to compute for feature vectors.
|
76
78
|
anc_allele_file : str, optional
|
77
79
|
Path to the ancestral allele file. If None, ancestral allele
|
78
80
|
information is considered unavailable.
|
@@ -83,18 +85,20 @@ class ChunkPreprocessor(DataPreprocessor):
|
|
83
85
|
self.ref_ind_file = ref_ind_file
|
84
86
|
self.tgt_ind_file = tgt_ind_file
|
85
87
|
self.src_ind_file = src_ind_file
|
88
|
+
self.out_ind_file = out_ind_file
|
86
89
|
self.win_len = win_len
|
87
90
|
self.win_step = win_step
|
91
|
+
self.ploidy_config = ploidy_config
|
88
92
|
self.anc_allele_file = anc_allele_file
|
89
|
-
|
93
|
+
|
94
|
+
src_samples = parse_ind_file(src_ind_file)
|
95
|
+
self.num_src = len(src_samples.keys())
|
90
96
|
|
91
97
|
anc_allele_available = anc_allele_file is not None
|
92
98
|
|
93
99
|
self.feature_preprocessor = FeaturePreprocessor(
|
94
|
-
w=w,
|
95
|
-
y=y,
|
96
100
|
output_file=output_file,
|
97
|
-
|
101
|
+
stat_config=stat_config,
|
98
102
|
anc_allele_available=anc_allele_available,
|
99
103
|
)
|
100
104
|
|
@@ -127,8 +131,10 @@ class ChunkPreprocessor(DataPreprocessor):
|
|
127
131
|
ref_ind_file=self.ref_ind_file,
|
128
132
|
tgt_ind_file=self.tgt_ind_file,
|
129
133
|
src_ind_file=self.src_ind_file,
|
134
|
+
out_ind_file=self.out_ind_file,
|
130
135
|
win_len=self.win_len,
|
131
136
|
win_step=self.win_step,
|
137
|
+
ploidy_config=self.ploidy_config,
|
132
138
|
anc_allele_file=self.anc_allele_file,
|
133
139
|
num_src=self.num_src,
|
134
140
|
)
|