sai-pg 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71):
  1. sai/__init__.py +2 -0
  2. sai/__main__.py +6 -3
  3. sai/configs/__init__.py +24 -0
  4. sai/configs/global_config.py +83 -0
  5. sai/configs/ploidy_config.py +94 -0
  6. sai/configs/pop_config.py +82 -0
  7. sai/configs/stat_config.py +220 -0
  8. sai/{utils/generators → generators}/chunk_generator.py +1 -1
  9. sai/{utils/generators → generators}/window_generator.py +81 -37
  10. sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
  11. sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
  12. sai/parsers/outlier_parser.py +4 -3
  13. sai/parsers/score_parser.py +8 -119
  14. sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
  15. sai/preprocessors/feature_preprocessor.py +236 -0
  16. sai/registries/__init__.py +22 -0
  17. sai/registries/generic_registry.py +89 -0
  18. sai/registries/stat_registry.py +30 -0
  19. sai/sai.py +124 -220
  20. sai/stats/__init__.py +11 -0
  21. sai/stats/danc_statistic.py +83 -0
  22. sai/stats/dd_statistic.py +77 -0
  23. sai/stats/df_statistic.py +84 -0
  24. sai/stats/dplus_statistic.py +86 -0
  25. sai/stats/fd_statistic.py +92 -0
  26. sai/stats/generic_statistic.py +93 -0
  27. sai/stats/q_statistic.py +104 -0
  28. sai/stats/stat_utils.py +259 -0
  29. sai/stats/u_statistic.py +99 -0
  30. sai/utils/utils.py +213 -142
  31. {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
  32. sai_pg-1.1.0.dist-info/RECORD +70 -0
  33. {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
  34. sai_pg-1.1.0.dist-info/top_level.txt +2 -0
  35. tests/configs/test_global_config.py +163 -0
  36. tests/configs/test_ploidy_config.py +93 -0
  37. tests/configs/test_pop_config.py +90 -0
  38. tests/configs/test_stat_config.py +171 -0
  39. tests/generators/test_chunk_generator.py +51 -0
  40. tests/generators/test_window_generator.py +164 -0
  41. tests/multiprocessing/test_mp_manager.py +92 -0
  42. tests/multiprocessing/test_mp_pool.py +79 -0
  43. tests/parsers/test_argument_validation.py +133 -0
  44. tests/parsers/test_outlier_parser.py +53 -0
  45. tests/parsers/test_score_parser.py +63 -0
  46. tests/preprocessors/test_chunk_preprocessor.py +79 -0
  47. tests/preprocessors/test_feature_preprocessor.py +223 -0
  48. tests/registries/test_registries.py +74 -0
  49. tests/stats/test_danc_statistic.py +51 -0
  50. tests/stats/test_dd_statistic.py +45 -0
  51. tests/stats/test_df_statistic.py +73 -0
  52. tests/stats/test_dplus_statistic.py +79 -0
  53. tests/stats/test_fd_statistic.py +68 -0
  54. tests/stats/test_q_statistic.py +268 -0
  55. tests/stats/test_stat_utils.py +354 -0
  56. tests/stats/test_u_statistic.py +233 -0
  57. tests/test___main__.py +51 -0
  58. tests/test_sai.py +102 -0
  59. tests/utils/test_utils.py +511 -0
  60. sai/parsers/plot_parser.py +0 -152
  61. sai/stats/features.py +0 -302
  62. sai/utils/preprocessors/feature_preprocessor.py +0 -211
  63. sai_pg-1.0.1.dist-info/RECORD +0 -30
  64. sai_pg-1.0.1.dist-info/top_level.txt +0 -1
  65. /sai/{utils/generators → generators}/__init__.py +0 -0
  66. /sai/{utils/generators → generators}/data_generator.py +0 -0
  67. /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
  68. /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
  69. /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
  70. {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
  71. {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,236 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ import numpy as np
22
+ from pathlib import Path
23
+ from typing import Any
24
+ from sai.preprocessors import DataPreprocessor
25
+ from sai.registries.stat_registry import STAT_REGISTRY
26
+ from sai.configs import PloidyConfig, StatConfig
27
+
28
+
29
class FeaturePreprocessor(DataPreprocessor):
    """
    Preprocessor that turns per-window genotype data into statistic records.

    For every statistic named in the supplied StatConfig, the matching class is
    looked up in STAT_REGISTRY and evaluated on one genomic window. The resulting
    records are appended to a tab-separated output file, and the candidate
    positions reported by the "U" and "Q" statistics go to per-statistic logs.
    """

    def __init__(
        self,
        output_file: str,
        stat_config: StatConfig,
        anc_allele_available: bool = False,
    ):
        """
        Stores the output location and the statistic configuration.

        Parameters
        ----------
        output_file : str
            Path of the file that receives the formatted records.
        stat_config : StatConfig
            Configuration naming the statistics to compute and their parameters.
        anc_allele_available : bool, optional
            Whether ancestral allele information is available.
            Default is False.
        """
        self.stat_config = stat_config
        self.anc_allele_available = anc_allele_available
        self.output_file = output_file

    def run(
        self,
        chr_name: str,
        ref_pop: str,
        tgt_pop: str,
        src_pop_list: list[str],
        start: int,
        end: int,
        pos: np.ndarray,
        ref_gts: np.ndarray,
        tgt_gts: np.ndarray,
        src_gts_list: list[np.ndarray],
        ploidy_config: PloidyConfig,
    ) -> list[dict[str, Any]]:
        """
        Computes every configured statistic for one genomic window.

        Parameters
        ----------
        chr_name : str
            Chromosome name.
        ref_pop : str
            Reference population name.
        tgt_pop : str
            Target population name.
        src_pop_list : list[str]
            Source population names.
        start : int
            Start position of the window.
        end : int
            End position of the window.
        pos : np.ndarray
            Positions of the variants in the window.
        ref_gts : np.ndarray
            Genotypes of the reference population, or None.
        tgt_gts : np.ndarray
            Genotypes of the target population, or None.
        src_gts_list : list[np.ndarray]
            One genotype array per source population, or None.
        ploidy_config : PloidyConfig
            Ploidy settings for the populations, or None.

        Returns
        -------
        list[dict[str, Any]]
            A one-element list holding the record for this window. When any of
            `ref_gts`, `tgt_gts`, `src_gts_list` or `ploidy_config` is None the
            record carries NaN placeholders instead of computed values.
        """
        record = {
            "chr_name": chr_name,
            "start": start,
            "end": end,
            "ref_pop": ref_pop,
            "tgt_pop": tgt_pop,
            "src_pop_list": src_pop_list,
            "nsnps": len(pos),
            "cdd_pos": {},
        }
        with_positions = ("U", "Q")  # statistics that also report candidate positions

        inputs_missing = any(
            arg is None for arg in (ref_gts, tgt_gts, src_gts_list, ploidy_config)
        )

        if inputs_missing:
            for name in self.stat_config.root:
                if len(src_pop_list) > 1:
                    placeholder = [np.nan] * len(src_pop_list)
                else:
                    placeholder = np.nan
                if name in with_positions:
                    # U and Q always get a scalar placeholder and an empty position array.
                    placeholder = np.nan
                    record["cdd_pos"][name] = np.array([])
                record[name] = placeholder
            return [record]

        for name in self.stat_config.root:
            statistic = STAT_REGISTRY.get(name)(
                ref_gts=ref_gts,
                tgt_gts=tgt_gts,
                src_gts_list=src_gts_list,
                ref_ploidy=ploidy_config.get_ploidy("ref", ref_pop),
                tgt_ploidy=ploidy_config.get_ploidy("tgt", tgt_pop),
                src_ploidy_list=ploidy_config.get_ploidy("src"),
            )
            if name in with_positions:
                params = self.stat_config.get_parameters(name)
                # The target-side parameter is passed as `x` for U and as `quantile` for Q.
                tgt_keyword = "x" if name == "U" else "quantile"
                outcome = statistic.compute(
                    pos=pos,
                    w=params["ref"][ref_pop],
                    **{tgt_keyword: params["tgt"][tgt_pop]},
                    y_list=list(params["src"].values()),
                    anc_allele_available=self.anc_allele_available,
                )
                record["cdd_pos"][name] = outcome["cdd_pos"]
            else:
                outcome = statistic.compute()
            record[name] = outcome["value"]

        return [record]

    def _format_row(self, item: dict[str, Any]) -> str:
        """Builds the tab-separated output line for one window record."""
        src_names = item["src_pop_list"]
        joined_src = ",".join(src_names)

        columns = []
        for name in self.stat_config.root:
            value = item.get(name)
            if not isinstance(value, list):
                cells = [value]
            elif len(value) == len(src_names):
                # one column per source population, in src_pop_list order
                cells = value
            else:
                # length mismatch: fall back to the first entry, or an empty cell
                cells = value[:1] or [""]
            columns.extend("" if cell is None else str(cell) for cell in cells)

        stats = "\t".join(columns)
        return (
            f"{item['chr_name']}\t{item['start']}\t{item['end']}\t"
            f"{item['ref_pop']}\t{item['tgt_pop']}\t{joined_src}\t"
            f"{item['nsnps']}\t{stats}\n"
        )

    def process_items(self, items: list[dict[str, Any]]) -> None:
        """
        Appends window records to the output file and the U/Q position logs.

        Parameters
        ----------
        items : list[dict[str, Any]]
            Window records as returned by `run`.
        """
        # Append mode: successive calls accumulate rows in the same file.
        with open(self.output_file, "a") as table:
            table.writelines([self._format_row(item) for item in items])

        for key in ("U", "Q"):
            if key not in self.stat_config.root:
                continue
            # The log sits next to the output file, with its suffix replaced.
            log_file = Path(self.output_file).with_suffix(f".{key}.log")
            with open(log_file, "a") as log:
                entries = []
                for item in items:
                    candidates = item["cdd_pos"][key]
                    if candidates.size == 0:
                        cdd = "NA"
                    else:
                        cdd = ",".join(
                            f"{item['chr_name']}:{site}" for site in candidates
                        )
                    entries.append(
                        f"{item['chr_name']}\t{item['start']}\t{item['end']}\t{cdd}\n"
                    )
                log.writelines(entries)
@@ -0,0 +1,22 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ from .generic_registry import GenericRegistry
22
+ from .stat_registry import StatRegistry
@@ -0,0 +1,89 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ from abc import ABC, abstractmethod
22
+ from typing import Any, Callable
23
+
24
+
25
class GenericRegistry(ABC):
    """
    Base class shared by all registries.

    Maps string names to arbitrary components (classes or callables) and
    offers a decorator-based way to add them, a lookup by name, and a
    listing of the registered names.
    """

    def __init__(self):
        self._registry = {}

    def register(self, name: str) -> Callable:
        """
        Builds a decorator that stores the decorated object under `name`.

        Parameters
        ----------
        name : str
            Key under which the decorated object is stored.

        Returns
        -------
        Callable
            Decorator that registers its argument and returns it unchanged.

        Raises
        ------
        ValueError
            Raised when the decorator is applied and `name` is already taken.
        """

        def wrap(component: Any) -> Any:
            self._register(name, component)
            return component

        return wrap

    def _register(self, name: str, obj: Any) -> None:
        """Stores `obj` under `name`, refusing to overwrite an existing entry."""
        name_taken = name in self._registry
        if name_taken:
            raise ValueError(f"{name!r} is already registered.")
        self._registry[name] = obj

    def get(self, name: str) -> Any:
        """
        Looks up a registered component.

        Parameters
        ----------
        name : str
            Key of the component.

        Returns
        -------
        Any
            The object registered under `name`.

        Raises
        ------
        KeyError
            If nothing is registered under `name`.
        """
        try:
            return self._registry[name]
        except KeyError:
            raise KeyError(f"No component registered under name '{name}'") from None

    def list_registered(self) -> list[str]:
        """
        Returns the names of all registered components.

        Returns
        -------
        list of str
            Registered names, in registration order.
        """
        return list(self._registry)
@@ -0,0 +1,30 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ from sai.registries.generic_registry import GenericRegistry
22
+
23
+
24
class StatRegistry(GenericRegistry):
    """
    Concrete registry for statistic classes.

    Adds no attributes or methods of its own; registration and lookup
    behavior is inherited unchanged from GenericRegistry.
    """


# Module-level registry instance, created once when this module is imported.
STAT_REGISTRY = StatRegistry()