sai-pg 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sai/__init__.py +2 -0
- sai/__main__.py +6 -3
- sai/configs/__init__.py +24 -0
- sai/configs/global_config.py +83 -0
- sai/configs/ploidy_config.py +94 -0
- sai/configs/pop_config.py +82 -0
- sai/configs/stat_config.py +220 -0
- sai/{utils/generators → generators}/chunk_generator.py +1 -1
- sai/{utils/generators → generators}/window_generator.py +81 -37
- sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
- sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
- sai/parsers/outlier_parser.py +4 -3
- sai/parsers/score_parser.py +8 -119
- sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
- sai/preprocessors/feature_preprocessor.py +236 -0
- sai/registries/__init__.py +22 -0
- sai/registries/generic_registry.py +89 -0
- sai/registries/stat_registry.py +30 -0
- sai/sai.py +124 -220
- sai/stats/__init__.py +11 -0
- sai/stats/danc_statistic.py +83 -0
- sai/stats/dd_statistic.py +77 -0
- sai/stats/df_statistic.py +84 -0
- sai/stats/dplus_statistic.py +86 -0
- sai/stats/fd_statistic.py +92 -0
- sai/stats/generic_statistic.py +93 -0
- sai/stats/q_statistic.py +104 -0
- sai/stats/stat_utils.py +259 -0
- sai/stats/u_statistic.py +99 -0
- sai/utils/utils.py +213 -142
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
- sai_pg-1.1.0.dist-info/RECORD +70 -0
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
- sai_pg-1.1.0.dist-info/top_level.txt +2 -0
- tests/configs/test_global_config.py +163 -0
- tests/configs/test_ploidy_config.py +93 -0
- tests/configs/test_pop_config.py +90 -0
- tests/configs/test_stat_config.py +171 -0
- tests/generators/test_chunk_generator.py +51 -0
- tests/generators/test_window_generator.py +164 -0
- tests/multiprocessing/test_mp_manager.py +92 -0
- tests/multiprocessing/test_mp_pool.py +79 -0
- tests/parsers/test_argument_validation.py +133 -0
- tests/parsers/test_outlier_parser.py +53 -0
- tests/parsers/test_score_parser.py +63 -0
- tests/preprocessors/test_chunk_preprocessor.py +79 -0
- tests/preprocessors/test_feature_preprocessor.py +223 -0
- tests/registries/test_registries.py +74 -0
- tests/stats/test_danc_statistic.py +51 -0
- tests/stats/test_dd_statistic.py +45 -0
- tests/stats/test_df_statistic.py +73 -0
- tests/stats/test_dplus_statistic.py +79 -0
- tests/stats/test_fd_statistic.py +68 -0
- tests/stats/test_q_statistic.py +268 -0
- tests/stats/test_stat_utils.py +354 -0
- tests/stats/test_u_statistic.py +233 -0
- tests/test___main__.py +51 -0
- tests/test_sai.py +102 -0
- tests/utils/test_utils.py +511 -0
- sai/parsers/plot_parser.py +0 -152
- sai/stats/features.py +0 -302
- sai/utils/preprocessors/feature_preprocessor.py +0 -211
- sai_pg-1.0.1.dist-info/RECORD +0 -30
- sai_pg-1.0.1.dist-info/top_level.txt +0 -1
- /sai/{utils/generators → generators}/__init__.py +0 -0
- /sai/{utils/generators → generators}/data_generator.py +0 -0
- /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,233 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import pytest
|
22
|
+
import numpy as np
|
23
|
+
from sai.stats import UStatistic
|
24
|
+
|
25
|
+
|
26
|
+
def test_UStatistic_compute_basic():
    """Exercise ``UStatistic.compute`` with a single source population.

    The same statistic object is evaluated twice: first with
    ``anc_allele_available=False`` and the source condition ``("=", 0)``,
    then with ``anc_allele_available=True`` and the source condition
    ``("=", 1)``.
    """
    positions = np.array([0, 1, 2])
    statistic = UStatistic(
        ref_gts=np.array([[0, 0, 1], [0, 0, 0], [1, 1, 1]]),
        tgt_gts=np.array([[1, 1, 1], [1, 0, 0], [0, 1, 0]]),
        src_gts_list=[np.array([[0, 0, 0], [1, 1, 1], [1, 0, 1]])],
        ref_ploidy=1,
        tgt_ploidy=1,
        src_ploidy_list=[1],
    )

    # Scenario 1: only the first site meets the criteria.
    want_value, want_positions = 1, np.array([0])
    outcome = statistic.compute(
        pos=positions,
        w=0.5,
        x=0.5,
        y_list=[("=", 0)],
        anc_allele_available=False,
    )
    assert outcome["name"] == "U"
    got = outcome["value"]
    assert got == want_value, f"Expected {want_value}, got {got}"
    assert np.array_equal(outcome["cdd_pos"], want_positions)

    # Scenario 2: no site is expected to be reported.
    want_value, want_positions = 0, np.array([])
    outcome = statistic.compute(
        pos=positions,
        w=0.5,
        x=0.5,
        y_list=[("=", 1)],
        anc_allele_available=True,
    )
    got = outcome["value"]
    assert got == want_value, f"Expected {want_value}, got {got}"
    assert np.array_equal(outcome["cdd_pos"], want_positions)
|
74
|
+
|
75
|
+
|
76
|
+
def test_UStatistic_compute_no_match():
    """Check that ``UStatistic.compute`` reports nothing when no site qualifies."""
    # Genotype data in which no site meets the criteria.
    genotype_inputs = {
        "ref_gts": np.array([[0, 1, 1], [1, 1, 1]]),
        "tgt_gts": np.array([[0, 0, 0], [1, 0, 1]]),
        "src_gts_list": [np.array([[1, 1, 1], [1, 1, 1]])],
    }
    statistic = UStatistic(
        ref_ploidy=1,
        tgt_ploidy=1,
        src_ploidy_list=[1],
        **genotype_inputs,
    )

    outcome = statistic.compute(
        pos=np.array([0, 1]),
        w=0.3,
        x=0.5,
        y_list=[("=", 0)],
        anc_allele_available=False,
    )

    # No sites meet the criteria, so the value is 0 and no positions are listed.
    want_value = 0
    got = outcome["value"]
    assert got == want_value, f"Expected {want_value}, got {got}"
    assert np.array_equal(outcome["cdd_pos"], np.array([]))
|
109
|
+
|
110
|
+
|
111
|
+
def test_UStatistic_compute_all_match():
    """Check that ``UStatistic.compute`` reports every site when all qualify."""
    positions = np.array([0, 1])

    # Genotype data where all sites meet the criteria.
    statistic = UStatistic(
        ref_gts=np.array([[0, 0, 0], [0, 0, 0]]),
        tgt_gts=np.array([[1, 1, 1], [1, 1, 1]]),
        src_gts_list=[np.array([[0, 0, 0], [0, 0, 0]])],
        ref_ploidy=1,
        tgt_ploidy=1,
        src_ploidy_list=[1],
    )

    thresholds = {"w": 0.5, "x": 0.5, "y_list": [("=", 0)]}
    outcome = statistic.compute(
        pos=positions,
        anc_allele_available=False,
        **thresholds,
    )

    # Both sites are expected, at positions 0 and 1.
    want_value = 2
    got = outcome["value"]
    assert got == want_value, f"Expected {want_value}, got {got}"
    assert np.array_equal(outcome["cdd_pos"], np.array([0, 1]))
|
144
|
+
|
145
|
+
|
146
|
+
def test_UStatistic_compute_with_two_sources():
    """Exercise ``UStatistic.compute`` with two source populations.

    Both sources carry the same genotypes and the same condition
    ``("=", 1.0)``; exactly one site (position 0) is expected.
    """
    # Two separate source arrays with identical contents.
    first_source = np.array([[1, 1, 1], [0, 1, 1], [1, 1, 1]])
    second_source = np.array([[1, 1, 1], [0, 1, 1], [1, 1, 1]])

    statistic = UStatistic(
        ref_gts=np.array([[0, 0, 1], [0, 0, 0], [1, 1, 1]]),
        tgt_gts=np.array([[0, 1, 1], [0, 0, 1], [1, 1, 1]]),
        src_gts_list=[first_source, second_source],
        ref_ploidy=1,
        tgt_ploidy=1,
        src_ploidy_list=[1, 1],
    )

    # Expected result: only loci where ref_freq < w, tgt_freq > x,
    # src_freq1 == y1, and src_freq2 == y2.
    outcome = statistic.compute(
        pos=np.array([0, 1, 2]),
        w=0.5,
        x=0.5,
        y_list=[("=", 1.0), ("=", 1.0)],
        anc_allele_available=False,
    )

    want_value = 1  # Only one locus meets all criteria
    got = outcome["value"]
    assert got == want_value, f"Expected {want_value}, got {got}"
    assert np.array_equal(outcome["cdd_pos"], np.array([0]))
|
179
|
+
|
180
|
+
|
181
|
+
def test_UStatistic_compute_with_mixed_ploidy():
    """Exercise ``UStatistic.compute`` when the populations have different ploidies.

    The reference is declared with ploidy 3, the target with ploidy 1 and
    the single source with ploidy 2; positions 0 and 2 are expected.
    """
    ploidies = {"ref_ploidy": 3, "tgt_ploidy": 1, "src_ploidy_list": [2]}
    statistic = UStatistic(
        ref_gts=np.array([[0, 1, 0], [0, 1, 0], [2, 1, 0]]),
        tgt_gts=np.array([[1, 1, 0], [1, 1, 1], [1, 1, 1]]),
        src_gts_list=[np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]])],
        **ploidies,
    )

    outcome = statistic.compute(
        pos=np.array([0, 1, 2]),
        w=0.5,
        x=0.5,
        y_list=[("=", 0)],
        anc_allele_available=False,
    )

    want_value = 2
    got = outcome["value"]
    assert got == want_value, f"Expected {want_value}, got {got}"
    assert np.array_equal(outcome["cdd_pos"], np.array([0, 2]))
|
210
|
+
|
211
|
+
|
212
|
+
def test_UStatistic_compute_with_missing_keys():
    """Check that ``compute`` raises ``ValueError`` when a keyword is missing.

    The ``compute`` call below deliberately leaves out the
    ``anc_allele_available`` keyword.
    """
    ref_gts = np.array([[0, 1, 0], [0, 1, 0], [2, 1, 0]])
    tgt_gts = np.array([[1, 1, 0], [1, 1, 1], [1, 1, 1]])
    src_gts = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]])
    pos = np.array([0, 1, 2])
    w, x, y = 0.5, 0.5, ("=", 0)

    # Build the statistic outside the ``raises`` block: a ValueError coming
    # from the constructor must fail the test, not satisfy it.
    u_stat = UStatistic(
        ref_gts=ref_gts,
        tgt_gts=tgt_gts,
        src_gts_list=[src_gts],
        ref_ploidy=3,
        tgt_ploidy=1,
        src_ploidy_list=[2],
    )

    # Only the incomplete ``compute`` call is expected to raise.
    with pytest.raises(ValueError):
        u_stat.compute(
            pos=pos,
            w=w,
            x=x,
            y_list=[y],
        )
|
tests/test___main__.py
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
from unittest.mock import patch, MagicMock
|
22
|
+
from sai.__main__ import main
|
23
|
+
|
24
|
+
|
25
|
+
@patch("sai.__main__._sai_cli_parser")  # Outer patch: supplies the last mock argument
@patch("sai.__main__._set_sigpipe_handler")  # Inner patch: supplies the first mock argument
def test_main(mock_set_sigpipe_handler, mock_sai_cli_parser):
    """Check how ``main`` wires the parser, the SIGPIPE handler and the runner.

    Both collaborators are patched, so the test only verifies that the
    handler is installed once, that ``parse_args`` receives the given
    argument list, and that the parsed ``runner`` is invoked with the
    parsed arguments.
    """
    cli_args = ["score", "--vcf", "tests/data/example.vcf", "--chr-name", "chr1"]

    # Stand-in for the parsed namespace; its ``runner`` is what main dispatches to.
    parsed_args = MagicMock()
    parsed_args.runner = MagicMock()

    # Stand-in parser whose ``parse_args`` yields the namespace above.
    fake_parser = MagicMock()
    fake_parser.parse_args.return_value = parsed_args
    mock_sai_cli_parser.return_value = fake_parser

    main(cli_args)

    mock_set_sigpipe_handler.assert_called_once()
    fake_parser.parse_args.assert_called_once_with(cli_args)
    parsed_args.runner.assert_called_once_with(parsed_args)
|
tests/test_sai.py
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import pytest
|
22
|
+
import pandas as pd
|
23
|
+
import sai.stats
|
24
|
+
from sai.sai import score, outlier
|
25
|
+
|
26
|
+
|
27
|
+
@pytest.fixture
def example_data(tmp_path):
    """Provide example input paths and per-test temporary output locations.

    Parameters
    ----------
    tmp_path : pathlib.Path
        pytest's built-in per-test temporary directory.

    Returns
    -------
    dict
        ``vcf_file`` and ``config`` are repository-relative input paths,
        ``output_file`` is the string path of ``output.tsv`` inside
        ``tmp_path``, and ``output_dir`` is ``tmp_path`` itself.
    """
    # The paths are kept as plain values instead of being attached to the
    # ``pytest`` module, so the fixture leaves no global state behind.
    return {
        "vcf_file": "tests/data/example.vcf",
        "output_file": str(tmp_path / "output.tsv"),
        "output_dir": tmp_path,
        "config": "tests/data/test_sai.config.yaml",
    }
|
42
|
+
|
43
|
+
|
44
|
+
def test_score(example_data):
    """Run ``score`` on the example VCF and check the first value of the Q column.

    Parameters
    ----------
    example_data : dict
        Paths supplied by the ``example_data`` fixture.
    """
    score(
        vcf_file=example_data["vcf_file"],
        chr_name="21",
        win_len=6666,
        win_step=6666,
        anc_allele_file=None,
        output_file=example_data["output_file"],
        config=example_data["config"],
        num_workers=1,
    )

    # Read the generated output file and validate its contents.
    df = pd.read_csv(example_data["output_file"], sep="\t")

    # Fail with a readable message, rather than an IndexError, if the
    # output has no column whose name starts with "Q".
    q_columns = [col for col in df.columns if col.startswith("Q")]
    assert q_columns, "No column starting with 'Q' found in the score output"

    # Compare with a tolerance: the value is a float that went through a
    # text round-trip, so exact equality is needlessly brittle.
    assert df[q_columns[0]].iloc[0] == pytest.approx(
        0.9
    ), "Unexpected value in 'Q' column"
|
63
|
+
|
64
|
+
|
65
|
+
def test_score_mixed_ploidy(example_data):
    """Run ``score`` on the mixed-ploidy VCF and check the first two U values.

    Only the temporary output path is taken from the ``example_data``
    fixture; the VCF and config paths are specific to this test.
    """
    result_path = example_data["output_file"]
    score(
        vcf_file="tests/data/test.mixed.ploidy.data.vcf",
        chr_name="21",
        win_len=50000,
        win_step=50000,
        anc_allele_file=None,
        output_file=result_path,
        config="tests/data/test_mixed_ploidy.config.yaml",
        num_workers=1,
    )

    u_column = pd.read_csv(result_path, sep="\t")["U"]

    # Rows 0 and 1 are expected to hold 0 and 1 respectively.
    for row, wanted in enumerate((0, 1)):
        assert u_column.iloc[row] == wanted, "Unexpected value in 'U' column"
|
81
|
+
|
82
|
+
|
83
|
+
def test_outlier(example_data):
    """Run ``outlier`` at two quantiles and check the leading Q values.

    Parameters
    ----------
    example_data : dict
        Paths supplied by the ``example_data`` fixture; only ``output_dir``
        is used here.
    """
    output_prefix = f"{example_data['output_dir']}/outliers"

    # Each quantile is paired with the values expected in the first rows of
    # the file written for that quantile.
    expectations = (
        (0.25, (0.7,)),
        (0.75, (1.0, 1.0)),
    )

    for quantile, leading_values in expectations:
        outlier(
            score_file="tests/data/test.q.scores",
            output_prefix=output_prefix,
            quantile=quantile,
        )

        df = pd.read_csv(f"{output_prefix}.Q.{quantile}.outliers.tsv", sep="\t")
        for row, wanted in enumerate(leading_values):
            # Tolerant comparison: the floats were parsed back from text.
            assert df["Q"].iloc[row] == pytest.approx(wanted)
|