sai-pg 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sai/__init__.py +2 -0
- sai/__main__.py +6 -3
- sai/configs/__init__.py +24 -0
- sai/configs/global_config.py +83 -0
- sai/configs/ploidy_config.py +94 -0
- sai/configs/pop_config.py +82 -0
- sai/configs/stat_config.py +220 -0
- sai/{utils/generators → generators}/chunk_generator.py +1 -1
- sai/{utils/generators → generators}/window_generator.py +81 -37
- sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
- sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
- sai/parsers/outlier_parser.py +4 -3
- sai/parsers/score_parser.py +8 -119
- sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
- sai/preprocessors/feature_preprocessor.py +236 -0
- sai/registries/__init__.py +22 -0
- sai/registries/generic_registry.py +89 -0
- sai/registries/stat_registry.py +30 -0
- sai/sai.py +124 -220
- sai/stats/__init__.py +11 -0
- sai/stats/danc_statistic.py +83 -0
- sai/stats/dd_statistic.py +77 -0
- sai/stats/df_statistic.py +84 -0
- sai/stats/dplus_statistic.py +86 -0
- sai/stats/fd_statistic.py +92 -0
- sai/stats/generic_statistic.py +93 -0
- sai/stats/q_statistic.py +104 -0
- sai/stats/stat_utils.py +259 -0
- sai/stats/u_statistic.py +99 -0
- sai/utils/utils.py +213 -142
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
- sai_pg-1.1.0.dist-info/RECORD +70 -0
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
- sai_pg-1.1.0.dist-info/top_level.txt +2 -0
- tests/configs/test_global_config.py +163 -0
- tests/configs/test_ploidy_config.py +93 -0
- tests/configs/test_pop_config.py +90 -0
- tests/configs/test_stat_config.py +171 -0
- tests/generators/test_chunk_generator.py +51 -0
- tests/generators/test_window_generator.py +164 -0
- tests/multiprocessing/test_mp_manager.py +92 -0
- tests/multiprocessing/test_mp_pool.py +79 -0
- tests/parsers/test_argument_validation.py +133 -0
- tests/parsers/test_outlier_parser.py +53 -0
- tests/parsers/test_score_parser.py +63 -0
- tests/preprocessors/test_chunk_preprocessor.py +79 -0
- tests/preprocessors/test_feature_preprocessor.py +223 -0
- tests/registries/test_registries.py +74 -0
- tests/stats/test_danc_statistic.py +51 -0
- tests/stats/test_dd_statistic.py +45 -0
- tests/stats/test_df_statistic.py +73 -0
- tests/stats/test_dplus_statistic.py +79 -0
- tests/stats/test_fd_statistic.py +68 -0
- tests/stats/test_q_statistic.py +268 -0
- tests/stats/test_stat_utils.py +354 -0
- tests/stats/test_u_statistic.py +233 -0
- tests/test___main__.py +51 -0
- tests/test_sai.py +102 -0
- tests/utils/test_utils.py +511 -0
- sai/parsers/plot_parser.py +0 -152
- sai/stats/features.py +0 -302
- sai/utils/preprocessors/feature_preprocessor.py +0 -211
- sai_pg-1.0.1.dist-info/RECORD +0 -30
- sai_pg-1.0.1.dist-info/top_level.txt +0 -1
- /sai/{utils/generators → generators}/__init__.py +0 -0
- /sai/{utils/generators → generators}/data_generator.py +0 -0
- /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
- /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
- {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,511 @@
|
|
1
|
+
# Copyright 2025 Xin Huang
|
2
|
+
#
|
3
|
+
# GNU General Public License v3.0
|
4
|
+
#
|
5
|
+
# This program is free software: you can redistribute it and/or modify
|
6
|
+
# it under the terms of the GNU General Public License as published by
|
7
|
+
# the Free Software Foundation, either version 3 of the License, or
|
8
|
+
# (at your option) any later version.
|
9
|
+
#
|
10
|
+
# This program is distributed in the hope that it will be useful,
|
11
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
+
# GNU General Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License
|
16
|
+
# along with this program. If not, please see
|
17
|
+
#
|
18
|
+
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
+
|
20
|
+
|
21
|
+
import allel
|
22
|
+
import pytest
|
23
|
+
import numpy as np
|
24
|
+
import pandas as pd
|
25
|
+
from unittest.mock import mock_open, patch
|
26
|
+
from sai.configs import PloidyConfig
|
27
|
+
from sai.utils import ChromosomeData
|
28
|
+
from sai.utils import filter_fixed_variants
|
29
|
+
from sai.utils import filter_geno_data
|
30
|
+
from sai.utils import flip_snps
|
31
|
+
from sai.utils import get_ref_alt_allele
|
32
|
+
from sai.utils import parse_ind_file
|
33
|
+
from sai.utils import read_anc_allele
|
34
|
+
from sai.utils import read_data
|
35
|
+
from sai.utils import read_geno_data
|
36
|
+
from sai.utils import split_genome
|
37
|
+
from sai.utils import natsorted_df
|
38
|
+
|
39
|
+
|
40
|
+
def test_valid_file():
|
41
|
+
# Mock the content of a valid file with categories
|
42
|
+
mock_data = "Category1 Sample1\nCategory1 Sample2\nCategory2 Sample3\n"
|
43
|
+
with patch("builtins.open", mock_open(read_data=mock_data)):
|
44
|
+
samples = parse_ind_file("mock_file.txt")
|
45
|
+
assert samples == {
|
46
|
+
"Category1": ["Sample1", "Sample2"],
|
47
|
+
"Category2": ["Sample3"],
|
48
|
+
}
|
49
|
+
|
50
|
+
|
51
|
+
def test_empty_file():
|
52
|
+
# Mock an empty file
|
53
|
+
mock_data = ""
|
54
|
+
with patch("builtins.open", mock_open(read_data=mock_data)):
|
55
|
+
with pytest.raises(ValueError) as excinfo:
|
56
|
+
parse_ind_file("mock_file.txt")
|
57
|
+
assert (
|
58
|
+
str(excinfo.value)
|
59
|
+
== "No samples found in mock_file.txt. Please check your data."
|
60
|
+
)
|
61
|
+
|
62
|
+
|
63
|
+
def test_file_not_found():
|
64
|
+
# Ensure FileNotFoundError is raised when file does not exist
|
65
|
+
with pytest.raises(FileNotFoundError):
|
66
|
+
parse_ind_file("non_existent_file.txt")
|
67
|
+
|
68
|
+
|
69
|
+
def test_ignores_empty_lines():
|
70
|
+
# Mock a file with empty lines and valid lines
|
71
|
+
mock_data = "Category1 Sample1\n\nCategory1 Sample2\n \nCategory2 Sample3\n"
|
72
|
+
with patch("builtins.open", mock_open(read_data=mock_data)):
|
73
|
+
samples = parse_ind_file("mock_file.txt")
|
74
|
+
assert samples == {
|
75
|
+
"Category1": ["Sample1", "Sample2"],
|
76
|
+
"Category2": ["Sample3"],
|
77
|
+
}
|
78
|
+
|
79
|
+
|
80
|
+
# Test data setup for filter_geno_data
|
81
|
+
@pytest.fixture
|
82
|
+
def sample_genotype_data():
|
83
|
+
return ChromosomeData(
|
84
|
+
POS=np.array([100, 200, 300, 400, 500]),
|
85
|
+
REF=np.array(["A", "T", "G", "C", "A"]),
|
86
|
+
ALT=np.array(["C", "A", "T", "G", "T"]),
|
87
|
+
GT=np.array(
|
88
|
+
[
|
89
|
+
[[0, 1], [1, 1]],
|
90
|
+
[[1, 0], [1, -1]],
|
91
|
+
[[0, 0], [0, 0]],
|
92
|
+
[[1, 1], [1, 1]],
|
93
|
+
[[0, 1], [0, 0]],
|
94
|
+
]
|
95
|
+
),
|
96
|
+
)
|
97
|
+
|
98
|
+
|
99
|
+
def test_filter_geno_data(sample_genotype_data):
|
100
|
+
# Example of filtering out the second and fifth rows (positions 200 and 500)
|
101
|
+
index = np.array([True, False, True, True, False])
|
102
|
+
filtered = filter_geno_data(sample_genotype_data, index)
|
103
|
+
|
104
|
+
# Assertions updated to check ChromosomeData attributes
|
105
|
+
assert filtered.POS.tolist() == [100, 300, 400]
|
106
|
+
assert filtered.REF.tolist() == ["A", "G", "C"]
|
107
|
+
assert filtered.ALT.tolist() == ["C", "T", "G"]
|
108
|
+
assert filtered.GT.shape == (3, 2, 2)
|
109
|
+
|
110
|
+
|
111
|
+
# Test from files
|
112
|
+
@pytest.fixture
|
113
|
+
def data():
|
114
|
+
pytest.ref_ind_list = "./tests/data/test.ref.ind.list"
|
115
|
+
pytest.tgt_ind_list = "./tests/data/test.tgt.ind.list"
|
116
|
+
pytest.vcf = "./tests/data/test.data.vcf"
|
117
|
+
pytest.anc_allele = "./tests/data/test.anc.allele.bed"
|
118
|
+
|
119
|
+
|
120
|
+
def test_parse_ind_file_from_files(data):
|
121
|
+
ref_ind = parse_ind_file(pytest.ref_ind_list)
|
122
|
+
tgt_ind = parse_ind_file(pytest.tgt_ind_list)
|
123
|
+
|
124
|
+
exp_ref_ind = {
|
125
|
+
"ref1": ["ind5", "ind6"],
|
126
|
+
}
|
127
|
+
exp_tgt_ind = {
|
128
|
+
"tgt1": ["ind1", "ind2"],
|
129
|
+
"tgt2": ["ind3", "ind4"],
|
130
|
+
}
|
131
|
+
|
132
|
+
assert ref_ind == exp_ref_ind
|
133
|
+
assert tgt_ind == exp_tgt_ind
|
134
|
+
|
135
|
+
|
136
|
+
def test_read_geno_data_from_file(data):
|
137
|
+
ref_ind = parse_ind_file(pytest.ref_ind_list)
|
138
|
+
d = read_geno_data(
|
139
|
+
vcf=pytest.vcf,
|
140
|
+
ind_samples=ref_ind,
|
141
|
+
chr_name="21",
|
142
|
+
anc_allele_file=None,
|
143
|
+
filter_missing=False,
|
144
|
+
)
|
145
|
+
|
146
|
+
vcf = allel.read_vcf(pytest.vcf, alt_number=1, samples=ref_ind["ref1"], region="21")
|
147
|
+
|
148
|
+
assert np.array_equal(ref_ind["ref1"], vcf["samples"])
|
149
|
+
assert np.array_equal(d["ref1"].POS, vcf["variants/POS"])
|
150
|
+
assert np.array_equal(d["ref1"].REF, vcf["variants/REF"])
|
151
|
+
assert np.array_equal(d["ref1"].ALT, vcf["variants/ALT"])
|
152
|
+
assert np.array_equal(d["ref1"].GT, vcf["calldata/GT"])
|
153
|
+
|
154
|
+
|
155
|
+
def test_read_data_from_file(data):
|
156
|
+
ploidy_config = PloidyConfig(
|
157
|
+
{
|
158
|
+
"ref": {"ref1": 2},
|
159
|
+
"tgt": {"tgt1": 2, "tgt2": 2},
|
160
|
+
"src": {"src1": 2, "src2": 2},
|
161
|
+
}
|
162
|
+
)
|
163
|
+
|
164
|
+
results = read_data(
|
165
|
+
vcf_file=pytest.vcf,
|
166
|
+
chr_name="21",
|
167
|
+
ref_ind_file=pytest.ref_ind_list,
|
168
|
+
tgt_ind_file=pytest.tgt_ind_list,
|
169
|
+
src_ind_file=None,
|
170
|
+
out_ind_file=None,
|
171
|
+
anc_allele_file=None,
|
172
|
+
filter_ref=False,
|
173
|
+
filter_tgt=False,
|
174
|
+
filter_src=False,
|
175
|
+
filter_out=False,
|
176
|
+
ploidy_config=ploidy_config,
|
177
|
+
)
|
178
|
+
|
179
|
+
rs = parse_ind_file(pytest.ref_ind_list)
|
180
|
+
ts = parse_ind_file(pytest.tgt_ind_list)
|
181
|
+
|
182
|
+
assert np.array_equal(rs, results["ref"][1])
|
183
|
+
assert np.array_equal(ts, results["tgt"][1])
|
184
|
+
|
185
|
+
ref_vcf = allel.read_vcf(pytest.vcf, alt_number=1, samples=rs["ref1"], region="21")
|
186
|
+
tgt_vcf = allel.read_vcf(pytest.vcf, alt_number=1, samples=ts["tgt2"], region="21")
|
187
|
+
|
188
|
+
assert np.array_equal(rs["ref1"], ref_vcf["samples"])
|
189
|
+
assert np.array_equal(ts["tgt2"], tgt_vcf["samples"])
|
190
|
+
assert np.array_equal(results["ref"][0]["ref1"].POS, ref_vcf["variants/POS"])
|
191
|
+
assert np.array_equal(results["ref"][0]["ref1"].REF, ref_vcf["variants/REF"])
|
192
|
+
assert np.array_equal(results["ref"][0]["ref1"].ALT, ref_vcf["variants/ALT"])
|
193
|
+
assert np.array_equal(
|
194
|
+
results["ref"][0]["ref1"].GT, ref_vcf["calldata/GT"].reshape(19, 4)
|
195
|
+
)
|
196
|
+
assert np.array_equal(results["tgt"][0]["tgt2"].POS, tgt_vcf["variants/POS"])
|
197
|
+
assert np.array_equal(results["tgt"][0]["tgt2"].REF, tgt_vcf["variants/REF"])
|
198
|
+
assert np.array_equal(results["tgt"][0]["tgt2"].ALT, tgt_vcf["variants/ALT"])
|
199
|
+
assert np.array_equal(
|
200
|
+
results["tgt"][0]["tgt2"].GT, tgt_vcf["calldata/GT"].reshape(19, 4)
|
201
|
+
)
|
202
|
+
|
203
|
+
|
204
|
+
def test_read_anc_allele(data):
|
205
|
+
anc_allele = read_anc_allele(pytest.anc_allele, "21")
|
206
|
+
|
207
|
+
exp_anc_allele = {"21": {2309: "G", 7879: "A", 11484: "-", 48989: "C"}}
|
208
|
+
|
209
|
+
assert anc_allele == exp_anc_allele
|
210
|
+
|
211
|
+
|
212
|
+
def test_get_ref_alt_allele(data):
|
213
|
+
tgt_ind = parse_ind_file(pytest.tgt_ind_list)
|
214
|
+
tgt_vcf = allel.read_vcf(
|
215
|
+
pytest.vcf, alt_number=1, samples=tgt_ind["tgt1"], region="21"
|
216
|
+
)
|
217
|
+
|
218
|
+
ref_allele, alt_allele = get_ref_alt_allele(
|
219
|
+
tgt_vcf["variants/REF"], tgt_vcf["variants/ALT"], tgt_vcf["variants/POS"]
|
220
|
+
)
|
221
|
+
|
222
|
+
exp_ref_allele = {
|
223
|
+
2309: "G",
|
224
|
+
7879: "C",
|
225
|
+
11484: "A",
|
226
|
+
16249: "A",
|
227
|
+
17324: "G",
|
228
|
+
19064: "G",
|
229
|
+
19124: "G",
|
230
|
+
23559: "G",
|
231
|
+
25354: "G",
|
232
|
+
26654: "G",
|
233
|
+
29724: "G",
|
234
|
+
30769: "C",
|
235
|
+
31319: "C",
|
236
|
+
37199: "C",
|
237
|
+
38009: "C",
|
238
|
+
39444: "C",
|
239
|
+
40809: "C",
|
240
|
+
45079: "C",
|
241
|
+
48989: "C",
|
242
|
+
}
|
243
|
+
exp_alt_allele = {
|
244
|
+
2309: "A",
|
245
|
+
7879: "A",
|
246
|
+
11484: "C",
|
247
|
+
16249: "C",
|
248
|
+
17324: "T",
|
249
|
+
19064: "T",
|
250
|
+
19124: "A",
|
251
|
+
23559: "A",
|
252
|
+
25354: "T",
|
253
|
+
26654: "C",
|
254
|
+
29724: "A",
|
255
|
+
30769: "T",
|
256
|
+
31319: "T",
|
257
|
+
37199: "T",
|
258
|
+
38009: "T",
|
259
|
+
39444: "T",
|
260
|
+
40809: "T",
|
261
|
+
45079: "T",
|
262
|
+
48989: "T",
|
263
|
+
}
|
264
|
+
|
265
|
+
assert ref_allele == exp_ref_allele
|
266
|
+
assert alt_allele == exp_alt_allele
|
267
|
+
|
268
|
+
|
269
|
+
def test_check_anc_allele(data):
|
270
|
+
ploidy_config = PloidyConfig(
|
271
|
+
{
|
272
|
+
"ref": {"ref1": 2},
|
273
|
+
"tgt": {"tgt1": 2, "tgt2": 2},
|
274
|
+
"src": {"src1": 2, "src2": 2},
|
275
|
+
}
|
276
|
+
)
|
277
|
+
|
278
|
+
data = read_data(
|
279
|
+
vcf_file=pytest.vcf,
|
280
|
+
chr_name="21",
|
281
|
+
ref_ind_file=pytest.ref_ind_list,
|
282
|
+
tgt_ind_file=pytest.tgt_ind_list,
|
283
|
+
src_ind_file=None,
|
284
|
+
out_ind_file=None,
|
285
|
+
anc_allele_file=pytest.anc_allele,
|
286
|
+
filter_ref=False,
|
287
|
+
filter_tgt=False,
|
288
|
+
ploidy_config=ploidy_config,
|
289
|
+
)
|
290
|
+
|
291
|
+
exp_ref_gt = allel.GenotypeArray(
|
292
|
+
[
|
293
|
+
[[0, 0], [0, 0]],
|
294
|
+
[[1, 1], [1, 1]],
|
295
|
+
[[0, 0], [0, 0]],
|
296
|
+
],
|
297
|
+
)
|
298
|
+
exp_tgt_gt1 = allel.GenotypeArray(
|
299
|
+
[
|
300
|
+
[[1, 0], [0, 0]],
|
301
|
+
[[1, 1], [1, 0]],
|
302
|
+
[[0, 0], [0, 1]],
|
303
|
+
],
|
304
|
+
)
|
305
|
+
exp_tgt_gt2 = allel.GenotypeArray(
|
306
|
+
[
|
307
|
+
[[0, 0], [0, 0]],
|
308
|
+
[[1, 1], [1, 1]],
|
309
|
+
[[0, 0], [0, 0]],
|
310
|
+
],
|
311
|
+
)
|
312
|
+
exp_tgt_pos = [2309, 7879, 48989]
|
313
|
+
|
314
|
+
assert np.array_equal(data["ref"][0]["ref1"].GT, exp_ref_gt.reshape(3, 4))
|
315
|
+
assert np.array_equal(data["tgt"][0]["tgt1"].GT, exp_tgt_gt1.reshape(3, 4))
|
316
|
+
assert np.array_equal(data["tgt"][0]["tgt2"].GT, exp_tgt_gt2.reshape(3, 4))
|
317
|
+
assert np.array_equal(data["tgt"][0]["tgt1"].POS, exp_tgt_pos)
|
318
|
+
assert np.array_equal(data["tgt"][0]["tgt2"].POS, exp_tgt_pos)
|
319
|
+
|
320
|
+
|
321
|
+
# Test data setup for filter_fixed_variants
|
322
|
+
@pytest.fixture
|
323
|
+
def sample_data():
|
324
|
+
# Sample ChromosomeData with mixed fixed and non-fixed variants
|
325
|
+
return {
|
326
|
+
"pop1": ChromosomeData(
|
327
|
+
POS=np.array([100, 200, 300, 400]),
|
328
|
+
REF=np.array(["A", "G", "T", "C"]),
|
329
|
+
ALT=np.array(["C", "A", "G", "T"]),
|
330
|
+
GT=allel.GenotypeArray(
|
331
|
+
[
|
332
|
+
[[0, 0], [0, 0]], # Fixed ref (AA, AA)
|
333
|
+
[[1, 1], [1, 1]], # Fixed alt (AA, AA)
|
334
|
+
[[0, 1], [1, 1]], # Mixed (TG, GG)
|
335
|
+
[[0, 1], [0, 0]], # Mixed (CT, CC)
|
336
|
+
]
|
337
|
+
),
|
338
|
+
),
|
339
|
+
"pop2": ChromosomeData(
|
340
|
+
POS=np.array([150, 250]),
|
341
|
+
REF=np.array(["T", "A"]),
|
342
|
+
ALT=np.array(["G", "C"]),
|
343
|
+
GT=allel.GenotypeArray(
|
344
|
+
[
|
345
|
+
[[0, 0], [0, 0]], # Fixed ref (TT, TT)
|
346
|
+
[[1, 1], [1, 1]], # Fixed alt (CC, CC)
|
347
|
+
]
|
348
|
+
),
|
349
|
+
),
|
350
|
+
}
|
351
|
+
|
352
|
+
|
353
|
+
@pytest.fixture
|
354
|
+
def sample_info():
|
355
|
+
# Sample information: two individuals in each of 'pop1' and 'pop2'
|
356
|
+
return {
|
357
|
+
"pop1": ["sample1", "sample2"],
|
358
|
+
"pop2": ["sample3", "sample4"],
|
359
|
+
}
|
360
|
+
|
361
|
+
|
362
|
+
def test_filter_fixed_variants(sample_data, sample_info):
|
363
|
+
# Apply the filter_fixed_variants function
|
364
|
+
filtered_data = filter_fixed_variants(sample_data, sample_info)
|
365
|
+
|
366
|
+
# Verify that fixed variants are removed
|
367
|
+
assert "pop1" in filtered_data
|
368
|
+
assert "pop2" in filtered_data
|
369
|
+
|
370
|
+
# Check that only non-fixed variants are retained for pop1
|
371
|
+
pop1_data = filtered_data["pop1"]
|
372
|
+
assert pop1_data.POS.tolist() == [300, 400] # Positions with mixed genotypes
|
373
|
+
assert pop1_data.REF.tolist() == ["T", "C"]
|
374
|
+
assert pop1_data.ALT.tolist() == ["G", "T"]
|
375
|
+
assert pop1_data.GT.shape == (2, 2, 2)
|
376
|
+
|
377
|
+
# Verify that all variants in pop2 are filtered out, as they are fixed
|
378
|
+
pop2_data = filtered_data["pop2"]
|
379
|
+
assert pop2_data.POS.size == 0
|
380
|
+
assert pop2_data.REF.size == 0
|
381
|
+
assert pop2_data.ALT.size == 0
|
382
|
+
assert pop2_data.GT.size == 0
|
383
|
+
|
384
|
+
|
385
|
+
# Test data setup for flip_snps
|
386
|
+
@pytest.fixture
|
387
|
+
def sample_chromosome_data():
|
388
|
+
# Sample ChromosomeData with genotypes to test flipping
|
389
|
+
return ChromosomeData(
|
390
|
+
POS=np.array([100, 200, 300, 400]),
|
391
|
+
REF=np.array(["A", "G", "T", "C"]),
|
392
|
+
ALT=np.array(["C", "A", "G", "T"]),
|
393
|
+
GT=allel.GenotypeArray(
|
394
|
+
[
|
395
|
+
[[0, 1], [1, 1]], # Mixed genotype, should be flipped
|
396
|
+
[[1, 1], [1, 1]], # ALT fixed, should be flipped
|
397
|
+
[[0, 0], [0, 1]], # Mixed genotype, should remain unchanged
|
398
|
+
[[1, 0], [0, 0]], # Mixed genotype, should be flipped
|
399
|
+
]
|
400
|
+
),
|
401
|
+
)
|
402
|
+
|
403
|
+
|
404
|
+
def test_flip_snps(sample_chromosome_data):
|
405
|
+
# Define SNP positions to be flipped
|
406
|
+
flipped_snps = [100, 200, 400]
|
407
|
+
|
408
|
+
# Apply the flip_snps function
|
409
|
+
flip_snps(sample_chromosome_data, flipped_snps)
|
410
|
+
|
411
|
+
# Check flipped genotypes
|
412
|
+
# Position 100: original [[0, 1], [1, 1]] -> flipped [[1, 0], [0, 0]]
|
413
|
+
assert sample_chromosome_data.GT[0].tolist() == [[1, 0], [0, 0]]
|
414
|
+
# Position 200: original [[1, 1], [1, 1]] -> flipped [[0, 0], [0, 0]]
|
415
|
+
assert sample_chromosome_data.GT[1].tolist() == [[0, 0], [0, 0]]
|
416
|
+
# Position 300: original [[0, 0], [0, 1]] -> should remain unchanged
|
417
|
+
assert sample_chromosome_data.GT[2].tolist() == [[0, 0], [0, 1]]
|
418
|
+
# Position 400: original [[1, 0], [0, 0]] -> flipped [[0, 1], [1, 1]]
|
419
|
+
assert sample_chromosome_data.GT[3].tolist() == [[0, 1], [1, 1]]
|
420
|
+
|
421
|
+
|
422
|
+
# Sample test function for split_genome
|
423
|
+
def test_split_genome():
|
424
|
+
# Test case 1: Basic case with regular windows
|
425
|
+
pos = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
|
426
|
+
window_size = 30
|
427
|
+
step_size = 20
|
428
|
+
result = split_genome(pos, window_size, step_size)
|
429
|
+
expected = [(1, 30), (21, 50), (41, 70), (61, 90), (81, 110)]
|
430
|
+
assert result == expected, f"Expected {expected}, but got {result}"
|
431
|
+
|
432
|
+
# Test case 2: Step size is larger than window size
|
433
|
+
pos = np.array([0, 10, 20, 30, 40, 50])
|
434
|
+
window_size = 20
|
435
|
+
step_size = 25
|
436
|
+
|
437
|
+
with pytest.raises(
|
438
|
+
ValueError, match="`step_size` cannot be greater than `window_size`"
|
439
|
+
):
|
440
|
+
split_genome(pos, window_size, step_size)
|
441
|
+
|
442
|
+
# Test case 3: Handle empty `pos` array
|
443
|
+
pos = np.array([])
|
444
|
+
window_size = 30
|
445
|
+
step_size = 10
|
446
|
+
with pytest.raises(ValueError, match="`pos` array must not be empty"):
|
447
|
+
split_genome(pos, window_size, step_size)
|
448
|
+
|
449
|
+
|
450
|
+
def test_natsorted_df_correct_order():
|
451
|
+
df = pd.DataFrame(
|
452
|
+
{
|
453
|
+
"Chrom": ["1", "10", "2", "X", "1"],
|
454
|
+
"Start": [300, 50, 150, 10, 100],
|
455
|
+
"End": [400, 100, 200, 50, 200],
|
456
|
+
}
|
457
|
+
)
|
458
|
+
|
459
|
+
sorted_df = natsorted_df(df)
|
460
|
+
|
461
|
+
expected_df = pd.DataFrame(
|
462
|
+
{
|
463
|
+
"Chrom": ["1", "1", "2", "10", "X"],
|
464
|
+
"Start": [100, 300, 150, 50, 10],
|
465
|
+
"End": [200, 400, 200, 100, 50],
|
466
|
+
}
|
467
|
+
).reset_index(drop=True)
|
468
|
+
|
469
|
+
pd.testing.assert_frame_equal(sorted_df, expected_df)
|
470
|
+
|
471
|
+
|
472
|
+
def test_natsorted_df_missing_columns():
|
473
|
+
df_missing = pd.DataFrame(
|
474
|
+
{
|
475
|
+
"Chrom": ["1", "2", "X"],
|
476
|
+
"Start": [100, 200, 300],
|
477
|
+
}
|
478
|
+
)
|
479
|
+
|
480
|
+
with pytest.raises(ValueError, match="Missing required columns: End"):
|
481
|
+
natsorted_df(df_missing)
|
482
|
+
|
483
|
+
|
484
|
+
def test_natsorted_df_empty_dataframe():
|
485
|
+
df_empty = pd.DataFrame(columns=["Chrom", "Start", "End"])
|
486
|
+
sorted_df = natsorted_df(df_empty)
|
487
|
+
|
488
|
+
assert sorted_df.empty
|
489
|
+
|
490
|
+
|
491
|
+
def test_natsorted_df_single_row():
|
492
|
+
df_single = pd.DataFrame({"Chrom": ["1"], "Start": [100], "End": [200]})
|
493
|
+
|
494
|
+
sorted_df = natsorted_df(df_single)
|
495
|
+
|
496
|
+
pd.testing.assert_frame_equal(sorted_df, df_single)
|
497
|
+
|
498
|
+
|
499
|
+
def test_natsorted_df_integer_start_end():
|
500
|
+
df_mixed_types = pd.DataFrame(
|
501
|
+
{
|
502
|
+
"Chrom": ["1", "2", "X"],
|
503
|
+
"Start": ["100", "200", "300"],
|
504
|
+
"End": ["150", "250", "350"],
|
505
|
+
}
|
506
|
+
)
|
507
|
+
|
508
|
+
sorted_df = natsorted_df(df_mixed_types)
|
509
|
+
|
510
|
+
assert sorted_df["Start"].dtype == int
|
511
|
+
assert sorted_df["End"].dtype == int
|
sai/parsers/plot_parser.py
DELETED
@@ -1,152 +0,0 @@
|
|
1
|
-
# Copyright 2025 Xin Huang
|
2
|
-
#
|
3
|
-
# GNU General Public License v3.0
|
4
|
-
#
|
5
|
-
# This program is free software: you can redistribute it and/or modify
|
6
|
-
# it under the terms of the GNU General Public License as published by
|
7
|
-
# the Free Software Foundation, either version 3 of the License, or
|
8
|
-
# (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program. If not, please see
|
17
|
-
#
|
18
|
-
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
19
|
-
|
20
|
-
|
21
|
-
import argparse
|
22
|
-
from sai.parsers.argument_validation import positive_int
|
23
|
-
from sai.parsers.argument_validation import positive_number
|
24
|
-
from sai.parsers.argument_validation import existed_file
|
25
|
-
from sai.sai import plot
|
26
|
-
|
27
|
-
|
28
|
-
def _run_plot(args: argparse.Namespace) -> None:
|
29
|
-
"""
|
30
|
-
Runs the plotting process based on command-line arguments.
|
31
|
-
|
32
|
-
Parameters
|
33
|
-
----------
|
34
|
-
args : argparse.Namespace
|
35
|
-
Parsed command-line arguments containing input files, output file,
|
36
|
-
xlabel, ylabel, title, figsize_x, figsize_y, dpi, alpha,
|
37
|
-
marker_size, marker_color, and marker_style.
|
38
|
-
"""
|
39
|
-
plot(
|
40
|
-
u_file=args.u_file,
|
41
|
-
q_file=args.q_file,
|
42
|
-
output=args.output,
|
43
|
-
xlabel=args.xlabel,
|
44
|
-
ylabel=args.ylabel,
|
45
|
-
title=args.title,
|
46
|
-
figsize_x=args.figsize_x,
|
47
|
-
figsize_y=args.figsize_y,
|
48
|
-
dpi=args.dpi,
|
49
|
-
alpha=args.alpha,
|
50
|
-
marker_size=args.marker_size,
|
51
|
-
marker_color=args.marker_color,
|
52
|
-
marker_style=args.marker_style,
|
53
|
-
)
|
54
|
-
|
55
|
-
|
56
|
-
def add_plot_parser(subparsers: argparse.ArgumentParser) -> None:
|
57
|
-
"""
|
58
|
-
Initializes and configures the command-line interface parser
|
59
|
-
for the plot subcommand.
|
60
|
-
|
61
|
-
Parameters
|
62
|
-
----------
|
63
|
-
subparsers : argparse.ArgumentParser
|
64
|
-
A command-line interface parser to be configured.
|
65
|
-
"""
|
66
|
-
parser = subparsers.add_parser(
|
67
|
-
"plot", help="Generate a scatter plot of U vs Q statistics."
|
68
|
-
)
|
69
|
-
parser.add_argument(
|
70
|
-
"--u-file",
|
71
|
-
dest="u_file",
|
72
|
-
type=existed_file,
|
73
|
-
required=True,
|
74
|
-
help="Path to the U score/outlier file.",
|
75
|
-
)
|
76
|
-
parser.add_argument(
|
77
|
-
"--q-file",
|
78
|
-
dest="q_file",
|
79
|
-
type=existed_file,
|
80
|
-
required=True,
|
81
|
-
help="Path to the Q score/outlier file.",
|
82
|
-
)
|
83
|
-
parser.add_argument(
|
84
|
-
"--output",
|
85
|
-
type=str,
|
86
|
-
required=True,
|
87
|
-
help="Path to save the output plot file. The format depends on the file extension (e.g., `.png`, `.pdf`).",
|
88
|
-
)
|
89
|
-
parser.add_argument(
|
90
|
-
"--xlabel",
|
91
|
-
type=str,
|
92
|
-
default="Q Statistic",
|
93
|
-
help="Label for the X-axis. Default: Q Statistic.",
|
94
|
-
)
|
95
|
-
parser.add_argument(
|
96
|
-
"--ylabel",
|
97
|
-
type=str,
|
98
|
-
default="U Statistic",
|
99
|
-
help="Label for the Y-axis. Default: U Statistic.",
|
100
|
-
)
|
101
|
-
parser.add_argument(
|
102
|
-
"--title",
|
103
|
-
type=str,
|
104
|
-
default="Scatter Plot of U vs Q",
|
105
|
-
help="Title of the plot. Default: Scatter Plot of U vs Q.",
|
106
|
-
)
|
107
|
-
parser.add_argument(
|
108
|
-
"--figsize-x",
|
109
|
-
type=positive_number,
|
110
|
-
default=6,
|
111
|
-
help="Width of the figure (in inches). Default: 6.",
|
112
|
-
)
|
113
|
-
parser.add_argument(
|
114
|
-
"--figsize-y",
|
115
|
-
type=positive_number,
|
116
|
-
default=6,
|
117
|
-
help="Height of the figure (in inches). Default: 6.",
|
118
|
-
)
|
119
|
-
parser.add_argument(
|
120
|
-
"--dpi",
|
121
|
-
type=positive_int,
|
122
|
-
default=300,
|
123
|
-
help="Resolution of the saved plot. Default: 300.",
|
124
|
-
)
|
125
|
-
parser.add_argument(
|
126
|
-
"--alpha",
|
127
|
-
type=positive_number,
|
128
|
-
default=0.6,
|
129
|
-
help="Transparency level of scatter points. Default: 0.6.",
|
130
|
-
)
|
131
|
-
parser.add_argument(
|
132
|
-
"--marker-size",
|
133
|
-
dest="marker_size",
|
134
|
-
type=positive_number,
|
135
|
-
default=20,
|
136
|
-
help="Size of the scatter plot markers. See matplotlib.pyplot.scatter. Default: 20.",
|
137
|
-
)
|
138
|
-
parser.add_argument(
|
139
|
-
"--marker-color",
|
140
|
-
dest="marker_color",
|
141
|
-
type=str,
|
142
|
-
default="blue",
|
143
|
-
help="Color of the markers. See matplotlib.pyplot.scatter. Default: blue.",
|
144
|
-
)
|
145
|
-
parser.add_argument(
|
146
|
-
"--marker-style",
|
147
|
-
dest="marker_style",
|
148
|
-
type=str,
|
149
|
-
default="o",
|
150
|
-
help="Shape of the markers. See matplotlib.pyplot.scatter. Default: o.",
|
151
|
-
)
|
152
|
-
parser.set_defaults(runner=_run_plot)
|