sai-pg 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sai/__init__.py +2 -0
- sai/__main__.py +6 -3
- sai/configs/__init__.py +24 -0
- sai/configs/global_config.py +83 -0
- sai/configs/ploidy_config.py +94 -0
- sai/configs/pop_config.py +82 -0
- sai/configs/stat_config.py +220 -0
- sai/{utils/generators → generators}/chunk_generator.py +2 -8
- sai/{utils/generators → generators}/window_generator.py +82 -37
- sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
- sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
- sai/parsers/outlier_parser.py +4 -3
- sai/parsers/score_parser.py +8 -119
- sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
- sai/preprocessors/feature_preprocessor.py +236 -0
- sai/registries/__init__.py +22 -0
- sai/registries/generic_registry.py +89 -0
- sai/registries/stat_registry.py +30 -0
- sai/sai.py +124 -220
- sai/stats/__init__.py +11 -0
- sai/stats/danc_statistic.py +83 -0
- sai/stats/dd_statistic.py +77 -0
- sai/stats/df_statistic.py +84 -0
- sai/stats/dplus_statistic.py +86 -0
- sai/stats/fd_statistic.py +92 -0
- sai/stats/generic_statistic.py +93 -0
- sai/stats/q_statistic.py +104 -0
- sai/stats/stat_utils.py +259 -0
- sai/stats/u_statistic.py +99 -0
- sai/utils/utils.py +220 -143
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
- sai_pg-1.1.0.dist-info/RECORD +70 -0
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
- sai_pg-1.1.0.dist-info/top_level.txt +2 -0
- tests/configs/test_global_config.py +163 -0
- tests/configs/test_ploidy_config.py +93 -0
- tests/configs/test_pop_config.py +90 -0
- tests/configs/test_stat_config.py +171 -0
- tests/generators/test_chunk_generator.py +51 -0
- tests/generators/test_window_generator.py +164 -0
- tests/multiprocessing/test_mp_manager.py +92 -0
- tests/multiprocessing/test_mp_pool.py +79 -0
- tests/parsers/test_argument_validation.py +133 -0
- tests/parsers/test_outlier_parser.py +53 -0
- tests/parsers/test_score_parser.py +63 -0
- tests/preprocessors/test_chunk_preprocessor.py +79 -0
- tests/preprocessors/test_feature_preprocessor.py +223 -0
- tests/registries/test_registries.py +74 -0
- tests/stats/test_danc_statistic.py +51 -0
- tests/stats/test_dd_statistic.py +45 -0
- tests/stats/test_df_statistic.py +73 -0
- tests/stats/test_dplus_statistic.py +79 -0
- tests/stats/test_fd_statistic.py +68 -0
- tests/stats/test_q_statistic.py +268 -0
- tests/stats/test_stat_utils.py +354 -0
- tests/stats/test_u_statistic.py +233 -0
- tests/test___main__.py +51 -0
- tests/test_sai.py +102 -0
- tests/utils/test_utils.py +511 -0
- sai/parsers/plot_parser.py +0 -152
- sai/stats/features.py +0 -302
- sai/utils/preprocessors/feature_preprocessor.py +0 -211
- sai_pg-1.0.0.dist-info/RECORD +0 -30
- sai_pg-1.0.0.dist-info/top_level.txt +0 -1
- /sai/{utils/generators → generators}/__init__.py +0 -0
- /sai/{utils/generators → generators}/data_generator.py +0 -0
- /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
- {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,236 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import numpy as np
|
22
|
+
from pathlib import Path
|
23
|
+
from typing import Any
|
24
|
+
from sai.preprocessors import DataPreprocessor
|
25
|
+
from sai.registries.stat_registry import STAT_REGISTRY
|
26
|
+
from sai.configs import PloidyConfig, StatConfig
|
27
|
+
|
28
|
+
|
29
|
+
class FeaturePreprocessor(DataPreprocessor):
    """
    A preprocessor subclass for generating feature vectors from genomic data.

    For each genomic window, ``run`` evaluates every statistic named in the
    statistic configuration and ``process_items`` appends the results to a
    tab-separated output file. For the statistics listed in
    ``_CDD_TGT_KWARG`` ("U" and "Q"), the positions returned under
    ``"cdd_pos"`` are additionally appended to a per-statistic log file
    derived from the output file name.
    """

    # Statistics whose compute() result also carries "cdd_pos", mapped to the
    # keyword under which compute() receives the target-population parameter.
    # Insertion order ("U" then "Q") is the order the log files are written in.
    _CDD_TGT_KWARG = {"U": "x", "Q": "quantile"}

    def __init__(
        self,
        output_file: str,
        stat_config: StatConfig,
        anc_allele_available: bool = False,
    ):
        """
        Initializes FeaturePreprocessor with the statistic configuration
        and the output file for storing generated feature vectors.

        Parameters
        ----------
        output_file : str
            Path to the output file to save processed feature vectors.
        stat_config : StatConfig
            Specifies the configuration of statistics to compute.
        anc_allele_available : bool, optional
            If True, ancestral allele information is available.
            If False, ancestral allele information is unavailable.
            Default is False.
        """
        self.output_file = output_file
        self.anc_allele_available = anc_allele_available
        self.stat_config = stat_config

    def run(
        self,
        chr_name: str,
        ref_pop: str,
        tgt_pop: str,
        src_pop_list: list[str],
        start: int,
        end: int,
        pos: np.ndarray,
        ref_gts: np.ndarray,
        tgt_gts: np.ndarray,
        src_gts_list: list[np.ndarray],
        ploidy_config: PloidyConfig,
    ) -> list[dict[str, Any]]:
        """
        Generates feature vectors for a specified genomic window.

        If any of ``ref_gts``, ``tgt_gts``, ``src_gts_list`` or
        ``ploidy_config`` is None, no statistic is computed and NaN
        placeholders are stored instead.

        Parameters
        ----------
        chr_name : str
            Chromosome name.
        ref_pop : str
            Reference population name.
        tgt_pop : str
            Target population name.
        src_pop_list : list[str]
            List of source population names.
        start : int
            Start position of the genomic window.
        end : int
            End position of the genomic window.
        pos : np.ndarray
            A 1D numpy array where each element represents the genomic position.
        ref_gts : np.ndarray
            Genotype data for the reference population.
        tgt_gts : np.ndarray
            Genotype data for the target population.
        src_gts_list : list[np.ndarray]
            List of genotype arrays for each source population.
        ploidy_config : PloidyConfig
            Configuration specifying ploidy levels for each population involved in the analysis.

        Returns
        -------
        list[dict[str, Any]]
            A single-element list. The dictionary holds the window metadata
            ("chr_name", "start", "end", "ref_pop", "tgt_pop", "src_pop_list",
            "nsnps"), one entry per configured statistic, and "cdd_pos", a
            dictionary with one entry for each configured "U"/"Q" statistic.
        """
        items = {
            "chr_name": chr_name,
            "start": start,
            "end": end,
            "ref_pop": ref_pop,
            "tgt_pop": tgt_pop,
            "src_pop_list": src_pop_list,
            "nsnps": len(pos),
            "cdd_pos": {},
        }

        data_missing = any(
            arg is None for arg in (ref_gts, tgt_gts, src_gts_list, ploidy_config)
        )

        for stat_name in self.stat_config.root.keys():
            if data_missing:
                self._fill_missing(items, stat_name, src_pop_list)
                continue

            stat_cls = STAT_REGISTRY.get(stat_name)
            stat = stat_cls(
                ref_gts=ref_gts,
                tgt_gts=tgt_gts,
                src_gts_list=src_gts_list,
                ref_ploidy=ploidy_config.get_ploidy("ref", ref_pop),
                tgt_ploidy=ploidy_config.get_ploidy("tgt", tgt_pop),
                src_ploidy_list=ploidy_config.get_ploidy("src"),
            )

            tgt_kwarg = self._CDD_TGT_KWARG.get(stat_name)
            if tgt_kwarg is None:
                results = stat.compute()
            else:
                # "U" and "Q" take the same arguments apart from the keyword
                # that carries the target-population parameter.
                params = self.stat_config.get_parameters(stat_name)
                results = stat.compute(
                    pos=pos,
                    w=params["ref"][ref_pop],
                    y_list=list(params["src"].values()),
                    anc_allele_available=self.anc_allele_available,
                    **{tgt_kwarg: params["tgt"][tgt_pop]},
                )
                items["cdd_pos"][stat_name] = results["cdd_pos"]

            items[stat_name] = results["value"]

        return [items]

    def _fill_missing(
        self, items: dict[str, Any], stat_name: str, src_pop_list: list[str]
    ) -> None:
        """Stores the NaN placeholder(s) for one statistic of a window without data."""
        if stat_name in self._CDD_TGT_KWARG:
            # "U"/"Q" always get one scalar plus an empty position array.
            items[stat_name] = np.nan
            items["cdd_pos"][stat_name] = np.array([])
        elif len(src_pop_list) > 1:
            # One placeholder per source population.
            items[stat_name] = [np.nan] * len(src_pop_list)
        else:
            items[stat_name] = np.nan

    @staticmethod
    def _stat_columns(val: Any, n_src: int) -> list[str]:
        """Converts one statistic value into its output column(s); None becomes ""."""
        if isinstance(val, list):
            if len(val) == n_src:
                # One column per source population, in source order.
                return ["" if v is None else str(v) for v in val]
            # Length mismatch: fall back to the first element only.
            val = val[0] if val else ""
        return ["" if val is None else str(val)]

    @staticmethod
    def _format_cdd(chr_name: str, cdd_pos: Any) -> str:
        """Formats positions as comma-joined "chr:pos" entries, or "NA" when empty."""
        if np.size(cdd_pos) == 0:
            return "NA"
        return ",".join(f"{chr_name}:{p}" for p in cdd_pos)

    def process_items(self, items: list[dict[str, Any]]) -> None:
        """
        Appends the feature vectors of one or more genomic windows to the output file.

        Each item becomes one tab-separated line: chromosome, start, end,
        reference population, target population, comma-joined source
        populations, number of SNPs, then the statistic columns in
        configuration order.

        For each of "U" and "Q" present in the configuration, one line per
        item (chromosome, start, end, positions or "NA") is appended to a
        log file whose name is the output file name with its last suffix
        replaced by ``.<stat>.log``.

        Parameters
        ----------
        items : list[dict[str, Any]]
            Dictionaries as returned by ``run``, one per genomic window.
        """
        with open(
            self.output_file, "a"
        ) as f:  # Open in append mode for continuous writing
            lines = []
            for item in items:
                src_pop = item["src_pop_list"]
                src_pop_str = ",".join(src_pop)

                stats_parts = []
                for stat_name in self.stat_config.root.keys():
                    stats_parts.extend(
                        self._stat_columns(item.get(stat_name), len(src_pop))
                    )
                stats = "\t".join(stats_parts)

                lines.append(
                    f"{item['chr_name']}\t{item['start']}\t{item['end']}\t"
                    f"{item['ref_pop']}\t{item['tgt_pop']}\t{src_pop_str}\t"
                    f"{item['nsnps']}\t{stats}\n"
                )

            f.writelines(lines)

        out_path = Path(self.output_file)
        for key in self._CDD_TGT_KWARG:
            if key not in self.stat_config.root:
                continue
            with open(out_path.with_suffix(f".{key}.log"), "a") as f:
                f.writelines(
                    f"{item['chr_name']}\t{item['start']}\t{item['end']}\t"
                    f"{self._format_cdd(item['chr_name'], item['cdd_pos'][key])}\n"
                    for item in items
                )
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from .generic_registry import GenericRegistry
|
22
|
+
from .stat_registry import StatRegistry
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from abc import ABC, abstractmethod
|
22
|
+
from typing import Any, Callable
|
23
|
+
|
24
|
+
|
25
|
+
class GenericRegistry(ABC):
    """
    Base class shared by all registries.

    Maps string names to registered components (classes or other
    callables) and offers a uniform way to add, fetch, and enumerate them.
    """

    def __init__(self):
        # name -> registered component
        self._registry = {}

    def register(self, name: str) -> Callable:
        """
        Builds a decorator that stores the decorated object under `name`.

        Parameters
        ----------
        name : str
            The key under which the component is registered.

        Returns
        -------
        Callable
            A decorator that registers its argument and returns it unchanged.
        """

        def _decorate(component: Any) -> Any:
            self._register(name, component)
            return component

        return _decorate

    def _register(self, name: str, obj: Any) -> None:
        # A name can be bound only once; a second attempt is an error.
        if name not in self._registry:
            self._registry[name] = obj
            return
        raise ValueError(f"{name!r} is already registered.")

    def get(self, name: str) -> Any:
        """
        Looks up the component registered under `name`.

        Parameters
        ----------
        name : str
            The key of the registered component.

        Returns
        -------
        Any
            The registered component.

        Raises
        ------
        KeyError
            If nothing has been registered under `name`.
        """
        # A private sentinel distinguishes "absent" from any stored value.
        absent = object()
        component = self._registry.get(name, absent)
        if component is absent:
            raise KeyError(f"No component registered under name '{name}'")
        return component

    def list_registered(self) -> list[str]:
        """
        Returns the names of all registered components.

        Returns
        -------
        list of str
            Registered names, in registration order.
        """
        return [*self._registry]
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from sai.registries.generic_registry import GenericRegistry
|
22
|
+
|
23
|
+
|
24
|
+
class StatRegistry(GenericRegistry):
    """
    Registry specialization for statistic classes.

    Adds no behaviour of its own; registration and lookup are inherited
    unchanged from GenericRegistry.
    """


# Module-level registry instance through which statistic classes are
# registered and later looked up by name.
STAT_REGISTRY = StatRegistry()
|