sai-pg 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,152 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ from typing import Any
22
+ from sai.utils.generators import WindowGenerator
23
+ from sai.utils.preprocessors import DataPreprocessor
24
+ from .feature_preprocessor import FeaturePreprocessor
25
+
26
+
27
class ChunkPreprocessor(DataPreprocessor):
    """
    Drives feature computation over the genomic windows of a VCF region.

    The instance stores the input file paths and windowing settings, and
    owns a ``FeaturePreprocessor`` that performs the per-window computation
    and the output writing. ``run`` builds a ``WindowGenerator`` for the
    requested region and passes every window it yields to that feature
    preprocessor.
    """

    def __init__(
        self,
        vcf_file: str,
        ref_ind_file: str,
        tgt_ind_file: str,
        src_ind_file: str,
        win_len: int,
        win_step: int,
        w: float,
        y: list[float],
        output_file: str,
        stat_type: str,
        anc_allele_file: str = None,
        num_src: int = 1,
    ):
        """
        Stores the window-generation settings and builds the feature preprocessor.

        Parameters
        ----------
        vcf_file : str
            Path to the VCF file, forwarded to ``WindowGenerator``.
        ref_ind_file : str
            Path to the file listing the reference individuals.
        tgt_ind_file : str
            Path to the file listing the target individuals.
        src_ind_file : str
            Path to the file listing the source individuals.
        win_len : int
            Window length handed to ``WindowGenerator``.
        win_step : int
            Window step size handed to ``WindowGenerator``.
        w : float
            Forwarded as ``w`` to ``FeaturePreprocessor``.
        y : list of float
            Forwarded as ``y`` to ``FeaturePreprocessor``.
        output_file : str
            Output path forwarded to ``FeaturePreprocessor``.
        stat_type : str
            Statistic selector forwarded to ``FeaturePreprocessor``.
        anc_allele_file : str, optional
            Path to the ancestral allele file. When None, the feature
            preprocessor is told that ancestral alleles are unavailable.
        num_src : int, optional
            Number of source populations. Default is 1.
        """
        # Settings replayed into a fresh WindowGenerator on every run() call.
        self.vcf_file = vcf_file
        self.ref_ind_file = ref_ind_file
        self.tgt_ind_file = tgt_ind_file
        self.src_ind_file = src_ind_file
        self.win_len = win_len
        self.win_step = win_step
        self.anc_allele_file = anc_allele_file
        self.num_src = num_src

        # Ancestral alleles count as available exactly when a file was given.
        self.feature_preprocessor = FeaturePreprocessor(
            w=w,
            y=y,
            output_file=output_file,
            stat_type=stat_type,
            anc_allele_available=anc_allele_file is not None,
        )

    def run(self, chr_name: str, start: int, end: int) -> list[dict[str, Any]]:
        """
        Computes feature records for every window of one chromosome region.

        Parameters
        ----------
        chr_name : str
            Name of the chromosome to process.
        start : int
            Start position of the region, forwarded unchanged to
            ``WindowGenerator``.
        end : int
            End position of the region, forwarded unchanged to
            ``WindowGenerator``.

        Returns
        -------
        list of dict of {str: Any}
            The records returned by the feature preprocessor for each
            window, concatenated in the order the windows were generated.
        """
        windows = WindowGenerator(
            vcf_file=self.vcf_file,
            chr_name=chr_name,
            start=start,
            end=end,
            ref_ind_file=self.ref_ind_file,
            tgt_ind_file=self.tgt_ind_file,
            src_ind_file=self.src_ind_file,
            win_len=self.win_len,
            win_step=self.win_step,
            anc_allele_file=self.anc_allele_file,
            num_src=self.num_src,
        )

        # Each window is a keyword mapping for FeaturePreprocessor.run, which
        # returns a list of records; flatten those lists into one.
        return [
            record
            for window in windows.get()
            for record in self.feature_preprocessor.run(**window)
        ]

    def process_items(self, items: list[dict[str, Any]]) -> None:
        """
        Hands computed records to the feature preprocessor for output.

        Parameters
        ----------
        items : list of dict of {str: Any}
            Records as returned by ``run``.
        """
        self.feature_preprocessor.process_items(items)
@@ -0,0 +1,94 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ from typing import Any
22
+ from abc import ABC, abstractmethod
23
+
24
+
25
class DataPreprocessor(ABC):
    """
    Abstract interface shared by the data preprocessors.

    A concrete preprocessor must implement two methods: ``run``, which
    performs the processing and returns its result, and ``process_items``,
    which consumes that result (for example by writing it out). The class
    cannot be instantiated until both are overridden.
    """

    @abstractmethod
    def run(self, **kwargs) -> Any:
        """
        Performs the preprocessing task.

        Parameters
        ----------
        **kwargs : dict
            Keyword arguments required by the concrete implementation.

        Returns
        -------
        Any
            The processed data, intended to be passed on to
            ``process_items``. This base implementation returns None.
        """

    @abstractmethod
    def process_items(self, items: Any, **kwargs) -> None:
        """
        Handles the data produced by ``run``.

        Parameters
        ----------
        items : Any
            The processed data returned by ``run``.
        **kwargs : dict
            Extra options for the concrete implementation, such as where
            or how the data should be written.
        """
@@ -0,0 +1,211 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ import numpy as np
22
+ from typing import Any
23
+ from sai.stats.features import calc_u, calc_q
24
+ from sai.utils.preprocessors import DataPreprocessor
25
+
26
+
27
class FeaturePreprocessor(DataPreprocessor):
    """
    Computes a U or Q statistic per genomic window and writes the results.

    ``run`` builds one record per window by calling ``calc_u`` or ``calc_q``
    (selected by the prefix of ``stat_type``), and ``process_items`` appends
    such records to ``output_file`` as tab-separated lines.
    """

    def __init__(
        self,
        w: float,
        y: list[float],
        output_file: str,
        stat_type: str,
        anc_allele_available: bool = False,
    ):
        """
        Initializes FeaturePreprocessor with its thresholds and output file.

        Parameters
        ----------
        w : float
            Passed as ``w`` to `calc_u` and `calc_q`.
        y : list[float]
            Passed as ``y_list`` to `calc_u` and `calc_q`.
        output_file : str
            Path to the output file the records are appended to.
        stat_type : str
            Three-character statistic selector: a prefix letter followed by
            two digits that are divided by 100 to form the threshold.
            - "UXX" (e.g., "U50", "U90") : use `calc_u()` with ``x=XX/100``.
            - "QXX" (e.g., "Q95", "Q50") : use `calc_q()` with ``quantile=XX/100``.
        anc_allele_available : bool, optional
            Whether ancestral allele information is available; forwarded
            to `calc_u` and `calc_q`. Default is False.

        Raises
        ------
        ValueError
            If `stat_type` is not in a valid format. Must be either: 'UXX' or 'QXX'.
        """
        self.w = w
        self.y = y
        self.output_file = output_file
        self.anc_allele_available = anc_allele_available
        # str.isdecimal() accepts exactly the characters int() can parse.
        # str.isdigit() also accepts e.g. superscript digits, which would
        # pass the format check and then fail inside int() with an
        # unrelated message.
        if not (
            len(stat_type) == 3
            and stat_type[0] in {"U", "Q"}
            and stat_type[1:].isdecimal()
        ):
            raise ValueError(
                f"Invalid stat_type format: {stat_type}. Expected format 'UXX' or 'QXX' (e.g., 'U50' or 'Q95')."
            )
        self.stat_type = stat_type
        self.stat_prefix = stat_type[0]
        self.threshold = int(stat_type[1:]) / 100

    def run(
        self,
        chr_name: str,
        ref_pop: str,
        tgt_pop: str,
        src_pop_list: list[str],
        start: int,
        end: int,
        pos: np.ndarray,
        ref_gts: np.ndarray,
        tgt_gts: np.ndarray,
        src_gts_list: list[np.ndarray],
        ploidy: int,
    ) -> list[dict[str, Any]]:
        """
        Builds the record for a single genomic window.

        Parameters
        ----------
        chr_name : str
            Chromosome name.
        ref_pop : str
            Reference population name.
        tgt_pop : str
            Target population name.
        src_pop_list : list[str]
            List of source population names.
        start : int
            Start position of the genomic window.
        end : int
            End position of the genomic window.
        pos : np.ndarray
            Variant positions in the window; its length is reported as
            ``nsnps``. None is treated as zero variants.
        ref_gts : np.ndarray
            Genotype data for the reference population, or None.
        tgt_gts : np.ndarray
            Genotype data for the target population, or None.
        src_gts_list : list[np.ndarray]
            List of genotype arrays for each source population, or None.
        ploidy : int
            Ploidy of the genome, or None.

        Returns
        -------
        list[dict[str, Any]]
            A one-element list holding the record for this window. The
            record contains the window metadata, ``nsnps``, ``statistic``
            and ``candidates``. If any of the genotype inputs or ``ploidy``
            is None, ``statistic`` is NaN and ``candidates`` is an empty array.

        Raises
        ------
        ValueError
            If the stored statistic prefix is neither 'U' nor 'Q'.
        """
        items = {
            "chr_name": chr_name,
            "start": start,
            "end": end,
            "ref_pop": ref_pop,
            "tgt_pop": tgt_pop,
            "src_pop_list": src_pop_list,
            # Missing genotype data is tolerated below, so a missing
            # position array is tolerated here as well.
            "nsnps": 0 if pos is None else len(pos),
        }

        data_missing = (
            (ref_gts is None)
            or (tgt_gts is None)
            or (src_gts_list is None)
            or (ploidy is None)
        )

        if data_missing:
            items["statistic"] = np.nan
            items["candidates"] = np.array([])
        elif self.stat_prefix == "U":
            items["statistic"], items["candidates"] = calc_u(
                ref_gts=ref_gts,
                tgt_gts=tgt_gts,
                src_gts_list=src_gts_list,
                pos=pos,
                w=self.w,
                x=self.threshold,
                y_list=self.y,
                ploidy=ploidy,
                anc_allele_available=self.anc_allele_available,
            )
        elif self.stat_prefix == "Q":
            items["statistic"], items["candidates"] = calc_q(
                ref_gts=ref_gts,
                tgt_gts=tgt_gts,
                src_gts_list=src_gts_list,
                pos=pos,
                w=self.w,
                y_list=self.y,
                quantile=self.threshold,
                ploidy=ploidy,
                anc_allele_available=self.anc_allele_available,
            )
        else:
            # Only reachable if stat_prefix was changed after construction.
            # Report the attribute this branch actually tested.
            raise ValueError(
                f"Invalid stat_type prefix: {self.stat_prefix!r}. "
                "Must be 'U' or 'Q' (e.g., 'U50' or 'Q95')."
            )

        return [items]

    def process_items(self, items: list[dict[str, Any]]) -> None:
        """
        Appends one tab-separated line per record to the output file.

        Each line holds, in order: chromosome, start, end, reference
        population, target population, comma-joined source populations,
        number of SNPs, statistic, and the candidate positions formatted as
        comma-joined ``chr:pos`` entries ("NA" when there are none).

        Parameters
        ----------
        items : list[dict[str, Any]]
            Records as produced by ``run``.
        """
        with open(
            self.output_file, "a"
        ) as f:  # Open in append mode for continuous writing
            lines = []
            for item in items:
                src_pop_str = ",".join(item["src_pop_list"])
                candidates = (
                    "NA"
                    if item["candidates"].size == 0
                    else ",".join(
                        f"{item['chr_name']}:{pos}" for pos in item["candidates"]
                    )
                )

                line = (
                    f"{item['chr_name']}\t{item['start']}\t{item['end']}\t"
                    f"{item['ref_pop']}\t{item['tgt_pop']}\t{src_pop_str}\t"
                    f"{item['nsnps']}\t{item['statistic']}\t{candidates}\n"
                )
                lines.append(line)

            f.writelines(lines)