sai-pg 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sai/__init__.py +18 -0
- sai/__main__.py +73 -0
- sai/parsers/__init__.py +18 -0
- sai/parsers/argument_validation.py +169 -0
- sai/parsers/outlier_parser.py +76 -0
- sai/parsers/plot_parser.py +152 -0
- sai/parsers/score_parser.py +241 -0
- sai/sai.py +315 -0
- sai/stats/__init__.py +18 -0
- sai/stats/features.py +302 -0
- sai/utils/__init__.py +22 -0
- sai/utils/generators/__init__.py +23 -0
- sai/utils/generators/chunk_generator.py +148 -0
- sai/utils/generators/data_generator.py +49 -0
- sai/utils/generators/window_generator.py +250 -0
- sai/utils/genomic_dataclasses.py +46 -0
- sai/utils/multiprocessing/__init__.py +22 -0
- sai/utils/multiprocessing/mp_manager.py +251 -0
- sai/utils/multiprocessing/mp_pool.py +73 -0
- sai/utils/preprocessors/__init__.py +23 -0
- sai/utils/preprocessors/chunk_preprocessor.py +152 -0
- sai/utils/preprocessors/data_preprocessor.py +94 -0
- sai/utils/preprocessors/feature_preprocessor.py +211 -0
- sai/utils/utils.py +689 -0
- sai_pg-1.0.0.dist-info/METADATA +44 -0
- sai_pg-1.0.0.dist-info/RECORD +30 -0
- sai_pg-1.0.0.dist-info/WHEEL +5 -0
- sai_pg-1.0.0.dist-info/entry_points.txt +2 -0
- sai_pg-1.0.0.dist-info/licenses/LICENSE +674 -0
- sai_pg-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,152 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from typing import Any
|
22
|
+
from sai.utils.generators import WindowGenerator
|
23
|
+
from sai.utils.preprocessors import DataPreprocessor
|
24
|
+
from .feature_preprocessor import FeaturePreprocessor
|
25
|
+
|
26
|
+
|
27
|
+
class ChunkPreprocessor(DataPreprocessor):
    """
    Window-based preprocessor for one region of a VCF file.

    For a requested chromosome region this class builds a ``WindowGenerator``
    over the configured VCF and individual files, hands every generated
    window to a ``FeaturePreprocessor``, and collects the resulting
    feature-vector dictionaries.
    """

    def __init__(
        self,
        vcf_file: str,
        ref_ind_file: str,
        tgt_ind_file: str,
        src_ind_file: str,
        win_len: int,
        win_step: int,
        w: float,
        y: list[float],
        output_file: str,
        stat_type: str,
        anc_allele_file: str = None,
        num_src: int = 1,
    ):
        """
        Stores the windowing configuration and builds the feature preprocessor.

        Parameters
        ----------
        vcf_file : str
            Path to the VCF file to process.
        ref_ind_file : str
            Path to the file containing reference individual IDs.
        tgt_ind_file : str
            Path to the file containing target individual IDs.
        src_ind_file : str
            Path to the file containing source individual IDs.
        win_len : int
            Window length for generating genomic windows.
        win_step : int
            Step size for sliding windows across the genome.
        w : float
            Parameter w, forwarded to the feature preprocessor.
        y : list of float
            List of y parameters, forwarded to the feature preprocessor.
        output_file : str
            Path to the output file, forwarded to the feature preprocessor.
        stat_type : str
            Type of statistic to compute, forwarded to the feature preprocessor.
        anc_allele_file : str, optional
            Path to the ancestral allele file. If None, ancestral allele
            information is considered unavailable.
        num_src : int, optional
            Number of source populations to use. Default is 1.
        """
        # Settings consumed later by WindowGenerator in `run`.
        self.vcf_file = vcf_file
        self.ref_ind_file = ref_ind_file
        self.tgt_ind_file = tgt_ind_file
        self.src_ind_file = src_ind_file
        self.win_len = win_len
        self.win_step = win_step
        self.anc_allele_file = anc_allele_file
        self.num_src = num_src

        # Ancestral allele information counts as available exactly when a
        # file path was supplied.
        self.feature_preprocessor = FeaturePreprocessor(
            w=w,
            y=y,
            output_file=output_file,
            stat_type=stat_type,
            anc_allele_available=anc_allele_file is not None,
        )

    def run(self, chr_name: str, start: int, end: int) -> list[dict[str, Any]]:
        """
        Computes feature vectors for every window in a chromosome region.

        Parameters
        ----------
        chr_name : str
            Name of the chromosome to process.
        start : int
            Start position (1-based, inclusive) of the region to process.
        end : int
            End position (1-based, exclusive) of the region to process.

        Returns
        -------
        list of dict of {str: Any}
            The feature-vector dictionaries produced for the generated
            windows, in the order the windows were generated.
        """
        windows = WindowGenerator(
            vcf_file=self.vcf_file,
            chr_name=chr_name,
            start=start,
            end=end,
            ref_ind_file=self.ref_ind_file,
            tgt_ind_file=self.tgt_ind_file,
            src_ind_file=self.src_ind_file,
            win_len=self.win_len,
            win_step=self.win_step,
            anc_allele_file=self.anc_allele_file,
            num_src=self.num_src,
        )

        # Each window is a mapping of keyword arguments for the feature
        # preprocessor, whose per-window result lists are flattened here.
        return [
            feature
            for window in windows.get()
            for feature in self.feature_preprocessor.run(**window)
        ]

    def process_items(self, items: list[dict[str, Any]]) -> None:
        """
        Delegates output of computed feature vectors to the feature preprocessor.

        Parameters
        ----------
        items : list of dict of {str: Any}
            Feature-vector dictionaries as returned by `run`.
        """
        self.feature_preprocessor.process_items(items)
|
@@ -0,0 +1,94 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from typing import Any
|
22
|
+
from abc import ABC, abstractmethod
|
23
|
+
|
24
|
+
|
25
|
+
class DataPreprocessor(ABC):
    """
    Abstract interface shared by the genomic data preprocessors.

    A concrete preprocessor provides two steps: `run`, which performs the
    actual processing and returns its result, and `process_items`, which
    consumes that result (for example by writing it to a file).

    Methods
    -------
    run(**kwargs) -> Any
        Execute the core processing task and return the processed data.
    process_items(items, **kwargs) -> None
        Handle the data returned by `run`, e.g. save or convert it.
    """

    @abstractmethod
    def run(self, **kwargs) -> Any:
        """
        Runs the preprocessing operation.

        Subclasses must override this method with their specific
        preprocessing task.

        Parameters
        ----------
        **kwargs : dict
            Keyword arguments required by the specific preprocessing
            operation.

        Returns
        -------
        Any
            The processed data, to be handled by `process_items`.
        """

    @abstractmethod
    def process_items(self, items: Any, **kwargs) -> None:
        """
        Handles the output or post-processing of processed data.

        Subclasses must override this method to define what happens to the
        data, such as saving it to a file or converting it to another format.

        Parameters
        ----------
        items : Any
            The processed data returned by `run`.
        **kwargs : dict
            Keyword arguments customizing the output step, for example an
            `output_file` naming where the data should be saved.
        """
|
@@ -0,0 +1,211 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import numpy as np
|
22
|
+
from typing import Any
|
23
|
+
from sai.stats.features import calc_u, calc_q
|
24
|
+
from sai.utils.preprocessors import DataPreprocessor
|
25
|
+
|
26
|
+
|
27
|
+
class FeaturePreprocessor(DataPreprocessor):
    """
    A preprocessor subclass for generating feature vectors from genomic data.

    This class extends DataPreprocessor to compute a window-based statistic
    (via `calc_u` or `calc_q`, selected by `stat_type`) from reference,
    target, and source genotypes, and to append the results to an output file.
    """

    def __init__(
        self,
        w: float,
        y: list[float],
        output_file: str,
        stat_type: str,
        anc_allele_available: bool = False,
    ):
        """
        Initializes FeaturePreprocessor with specific frequency thresholds
        and output file for storing generated feature vectors.

        Parameters
        ----------
        w : float
            Frequency threshold for `calc_u` and `calc_q`.
        y : list[float]
            List of frequency thresholds for `calc_u` and `calc_q`.
        output_file : str
            Path to the output file to save processed feature vectors.
        stat_type : str
            Specifies the type of statistic to compute. The two digits are
            divided by 100 and used as the statistic's threshold.
            - "UXX" (e.g., "U50", "U90") : Compute the U statistic using `calc_u()`.
            - "QXX" (e.g., "Q95", "Q50") : Compute the Q statistic using `calc_q()`.
        anc_allele_available : bool, optional
            If True, ancestral allele information is available.
            If False, ancestral allele information is unavailable.
            Default is False.

        Raises
        ------
        ValueError
            If `stat_type` is not in a valid format. Must be either: 'UXX' or 'QXX'.
        """
        self.w = w
        self.y = y
        self.output_file = output_file
        self.anc_allele_available = anc_allele_available
        if not (
            len(stat_type) == 3
            and stat_type[0] in {"U", "Q"}
            and stat_type[1:].isdigit()
        ):
            raise ValueError(
                f"Invalid stat_type format: {stat_type}. Expected format 'UXX' or 'QXX' (e.g., 'U50' or 'Q95')."
            )
        # Keep the raw value: `run` reports it when the prefix is unusable.
        self.stat_type = stat_type
        self.stat_prefix = stat_type[0]
        # "U50" -> 0.5, "Q95" -> 0.95
        self.threshold = int(stat_type[1:]) / 100

    def run(
        self,
        chr_name: str,
        ref_pop: str,
        tgt_pop: str,
        src_pop_list: list[str],
        start: int,
        end: int,
        pos: np.ndarray,
        ref_gts: np.ndarray,
        tgt_gts: np.ndarray,
        src_gts_list: list[np.ndarray],
        ploidy: int,
    ) -> list[dict[str, Any]]:
        """
        Generates feature vectors for a specified genomic window.

        Parameters
        ----------
        chr_name : str
            Chromosome name.
        ref_pop : str
            Reference population name.
        tgt_pop : str
            Target population name.
        src_pop_list : list[str]
            List of source population names.
        start : int
            Start position of the genomic window.
        end : int
            End position of the genomic window.
        pos : np.ndarray
            A 1D numpy array where each element represents the genomic position.
        ref_gts : np.ndarray
            Genotype data for the reference population.
        tgt_gts : np.ndarray
            Genotype data for the target population.
        src_gts_list : list[np.ndarray]
            List of genotype arrays for each source population.
        ploidy : int
            Ploidy of the genome.

        Returns
        -------
        list[dict[str, Any]]
            A single-element list holding the dictionary for this window.
            Besides the window description it contains "nsnps" (``len(pos)``),
            "statistic" and "candidates". When any of `ref_gts`, `tgt_gts`,
            `src_gts_list` or `ploidy` is None, "statistic" is NaN and
            "candidates" is an empty array.

        Raises
        ------
        ValueError
            If the stored statistic prefix is neither "U" nor "Q".
        """
        items = {
            "chr_name": chr_name,
            "start": start,
            "end": end,
            "ref_pop": ref_pop,
            "tgt_pop": tgt_pop,
            "src_pop_list": src_pop_list,
            "nsnps": len(pos),
        }

        if (
            (ref_gts is None)
            or (tgt_gts is None)
            or (src_gts_list is None)
            or (ploidy is None)
        ):
            # Nothing to compute for this window; emit placeholders so the
            # window still produces an output row.
            items["statistic"] = np.nan
            items["candidates"] = np.array([])
        elif self.stat_prefix == "U":
            items["statistic"], items["candidates"] = calc_u(
                ref_gts=ref_gts,
                tgt_gts=tgt_gts,
                src_gts_list=src_gts_list,
                pos=pos,
                w=self.w,
                x=self.threshold,
                y_list=self.y,
                ploidy=ploidy,
                anc_allele_available=self.anc_allele_available,
            )
        elif self.stat_prefix == "Q":
            items["statistic"], items["candidates"] = calc_q(
                ref_gts=ref_gts,
                tgt_gts=tgt_gts,
                src_gts_list=src_gts_list,
                pos=pos,
                w=self.w,
                y_list=self.y,
                quantile=self.threshold,
                ploidy=ploidy,
                anc_allele_available=self.anc_allele_available,
            )
        else:
            # Only reachable if `stat_prefix` was changed after construction.
            # `stat_type` may be absent on instances built without __init__,
            # so fall back to the prefix rather than raising AttributeError.
            stat_type = getattr(self, "stat_type", self.stat_prefix)
            raise ValueError(
                f"Invalid stat_type: {stat_type}. Must be 'UXX' or 'QXX' (e.g., 'U50' or 'Q95')."
            )

        return [items]

    def process_items(self, items: list[dict[str, Any]]) -> None:
        """
        Appends one tab-separated line per feature-vector dictionary to the output file.

        Each line holds: chromosome, start, end, reference population, target
        population, comma-joined source populations, number of SNPs, statistic,
        and the candidates written as comma-joined ``chr:pos`` entries
        ("NA" when there are none).

        Parameters
        ----------
        items : list[dict[str, Any]]
            Dictionaries as produced by `run`, one per genomic window.
        """
        with open(
            self.output_file, "a"
        ) as f:  # Open in append mode for continuous writing
            lines = []
            for item in items:
                src_pop_str = ",".join(item["src_pop_list"])
                candidates = (
                    "NA"
                    if item["candidates"].size == 0
                    else ",".join(
                        f"{item['chr_name']}:{pos}" for pos in item["candidates"]
                    )
                )

                line = (
                    f"{item['chr_name']}\t{item['start']}\t{item['end']}\t"
                    f"{item['ref_pop']}\t{item['tgt_pop']}\t{src_pop_str}\t"
                    f"{item['nsnps']}\t{item['statistic']}\t{candidates}\n"
                )
                lines.append(line)

            f.writelines(lines)
|