sai-pg 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. sai/__init__.py +2 -0
  2. sai/__main__.py +6 -3
  3. sai/configs/__init__.py +24 -0
  4. sai/configs/global_config.py +83 -0
  5. sai/configs/ploidy_config.py +94 -0
  6. sai/configs/pop_config.py +82 -0
  7. sai/configs/stat_config.py +220 -0
  8. sai/{utils/generators → generators}/chunk_generator.py +2 -8
  9. sai/{utils/generators → generators}/window_generator.py +82 -37
  10. sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
  11. sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
  12. sai/parsers/outlier_parser.py +4 -3
  13. sai/parsers/score_parser.py +8 -119
  14. sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
  15. sai/preprocessors/feature_preprocessor.py +236 -0
  16. sai/registries/__init__.py +22 -0
  17. sai/registries/generic_registry.py +89 -0
  18. sai/registries/stat_registry.py +30 -0
  19. sai/sai.py +124 -220
  20. sai/stats/__init__.py +11 -0
  21. sai/stats/danc_statistic.py +83 -0
  22. sai/stats/dd_statistic.py +77 -0
  23. sai/stats/df_statistic.py +84 -0
  24. sai/stats/dplus_statistic.py +86 -0
  25. sai/stats/fd_statistic.py +92 -0
  26. sai/stats/generic_statistic.py +93 -0
  27. sai/stats/q_statistic.py +104 -0
  28. sai/stats/stat_utils.py +259 -0
  29. sai/stats/u_statistic.py +99 -0
  30. sai/utils/utils.py +220 -143
  31. {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
  32. sai_pg-1.1.0.dist-info/RECORD +70 -0
  33. {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
  34. sai_pg-1.1.0.dist-info/top_level.txt +2 -0
  35. tests/configs/test_global_config.py +163 -0
  36. tests/configs/test_ploidy_config.py +93 -0
  37. tests/configs/test_pop_config.py +90 -0
  38. tests/configs/test_stat_config.py +171 -0
  39. tests/generators/test_chunk_generator.py +51 -0
  40. tests/generators/test_window_generator.py +164 -0
  41. tests/multiprocessing/test_mp_manager.py +92 -0
  42. tests/multiprocessing/test_mp_pool.py +79 -0
  43. tests/parsers/test_argument_validation.py +133 -0
  44. tests/parsers/test_outlier_parser.py +53 -0
  45. tests/parsers/test_score_parser.py +63 -0
  46. tests/preprocessors/test_chunk_preprocessor.py +79 -0
  47. tests/preprocessors/test_feature_preprocessor.py +223 -0
  48. tests/registries/test_registries.py +74 -0
  49. tests/stats/test_danc_statistic.py +51 -0
  50. tests/stats/test_dd_statistic.py +45 -0
  51. tests/stats/test_df_statistic.py +73 -0
  52. tests/stats/test_dplus_statistic.py +79 -0
  53. tests/stats/test_fd_statistic.py +68 -0
  54. tests/stats/test_q_statistic.py +268 -0
  55. tests/stats/test_stat_utils.py +354 -0
  56. tests/stats/test_u_statistic.py +233 -0
  57. tests/test___main__.py +51 -0
  58. tests/test_sai.py +102 -0
  59. tests/utils/test_utils.py +511 -0
  60. sai/parsers/plot_parser.py +0 -152
  61. sai/stats/features.py +0 -302
  62. sai/utils/preprocessors/feature_preprocessor.py +0 -211
  63. sai_pg-1.0.0.dist-info/RECORD +0 -30
  64. sai_pg-1.0.0.dist-info/top_level.txt +0 -1
  65. /sai/{utils/generators → generators}/__init__.py +0 -0
  66. /sai/{utils/generators → generators}/data_generator.py +0 -0
  67. /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
  68. /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
  69. /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
  70. {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
  71. {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
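The renames in the list above (e.g. sai/{utils/generators → generators}/chunk_generator.py) indicate that the generators, multiprocessing, and preprocessors subpackages moved from sai.utils to the top level of the sai package, and that a new sai.configs package was added. A minimal before/after sketch of the import paths implied by those renames follows; the module paths are taken from the file list and from the new tests shown below, and no other API is implied.

# Before (sai-pg 1.0.0): these modules lived under sai.utils
# from sai.utils.generators import chunk_generator, window_generator
# from sai.utils.preprocessors import chunk_preprocessor

# After (sai-pg 1.1.0): the same modules are imported from the top-level package,
# and configuration objects come from the new sai.configs package.
from sai.generators import chunk_generator, window_generator
from sai.multiprocessing import mp_manager, mp_pool
from sai.preprocessors import chunk_preprocessor
from sai.configs import PloidyConfig  # used by the new tests shown below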
tests/utils/test_utils.py
@@ -0,0 +1,511 @@
+ # Copyright 2025 Xin Huang
+ #
+ # GNU General Public License v3.0
+ #
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program. If not, please see
+ #
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
+
+
+ import allel
+ import pytest
+ import numpy as np
+ import pandas as pd
+ from unittest.mock import mock_open, patch
+ from sai.configs import PloidyConfig
+ from sai.utils import ChromosomeData
+ from sai.utils import filter_fixed_variants
+ from sai.utils import filter_geno_data
+ from sai.utils import flip_snps
+ from sai.utils import get_ref_alt_allele
+ from sai.utils import parse_ind_file
+ from sai.utils import read_anc_allele
+ from sai.utils import read_data
+ from sai.utils import read_geno_data
+ from sai.utils import split_genome
+ from sai.utils import natsorted_df
+
+
+ def test_valid_file():
+     # Mock the content of a valid file with categories
+     mock_data = "Category1 Sample1\nCategory1 Sample2\nCategory2 Sample3\n"
+     with patch("builtins.open", mock_open(read_data=mock_data)):
+         samples = parse_ind_file("mock_file.txt")
+         assert samples == {
+             "Category1": ["Sample1", "Sample2"],
+             "Category2": ["Sample3"],
+         }
+
+
+ def test_empty_file():
+     # Mock an empty file
+     mock_data = ""
+     with patch("builtins.open", mock_open(read_data=mock_data)):
+         with pytest.raises(ValueError) as excinfo:
+             parse_ind_file("mock_file.txt")
+         assert (
+             str(excinfo.value)
+             == "No samples found in mock_file.txt. Please check your data."
+         )
+
+
+ def test_file_not_found():
+     # Ensure FileNotFoundError is raised when file does not exist
+     with pytest.raises(FileNotFoundError):
+         parse_ind_file("non_existent_file.txt")
+
+
+ def test_ignores_empty_lines():
+     # Mock a file with empty lines and valid lines
+     mock_data = "Category1 Sample1\n\nCategory1 Sample2\n \nCategory2 Sample3\n"
+     with patch("builtins.open", mock_open(read_data=mock_data)):
+         samples = parse_ind_file("mock_file.txt")
+         assert samples == {
+             "Category1": ["Sample1", "Sample2"],
+             "Category2": ["Sample3"],
+         }
+
+
+ # Test data setup for filter_geno_data
+ @pytest.fixture
+ def sample_genotype_data():
+     return ChromosomeData(
+         POS=np.array([100, 200, 300, 400, 500]),
+         REF=np.array(["A", "T", "G", "C", "A"]),
+         ALT=np.array(["C", "A", "T", "G", "T"]),
+         GT=np.array(
+             [
+                 [[0, 1], [1, 1]],
+                 [[1, 0], [1, -1]],
+                 [[0, 0], [0, 0]],
+                 [[1, 1], [1, 1]],
+                 [[0, 1], [0, 0]],
+             ]
+         ),
+     )
+
+
+ def test_filter_geno_data(sample_genotype_data):
+     # Example of filtering out the third row
+     index = np.array([True, False, True, True, False])
+     filtered = filter_geno_data(sample_genotype_data, index)
+
+     # Assertions updated to check ChromosomeData attributes
+     assert filtered.POS.tolist() == [100, 300, 400]
+     assert filtered.REF.tolist() == ["A", "G", "C"]
+     assert filtered.ALT.tolist() == ["C", "T", "G"]
+     assert filtered.GT.shape == (3, 2, 2)
+
+
+ # Test from files
+ @pytest.fixture
+ def data():
+     pytest.ref_ind_list = "./tests/data/test.ref.ind.list"
+     pytest.tgt_ind_list = "./tests/data/test.tgt.ind.list"
+     pytest.vcf = "./tests/data/test.data.vcf"
+     pytest.anc_allele = "./tests/data/test.anc.allele.bed"
+
+
+ def test_parse_ind_file_from_files(data):
+     ref_ind = parse_ind_file(pytest.ref_ind_list)
+     tgt_ind = parse_ind_file(pytest.tgt_ind_list)
+
+     exp_ref_ind = {
+         "ref1": ["ind5", "ind6"],
+     }
+     exp_tgt_ind = {
+         "tgt1": ["ind1", "ind2"],
+         "tgt2": ["ind3", "ind4"],
+     }
+
+     assert ref_ind == exp_ref_ind
+     assert tgt_ind == exp_tgt_ind
+
+
+ def test_read_geno_data_from_file(data):
+     ref_ind = parse_ind_file(pytest.ref_ind_list)
+     d = read_geno_data(
+         vcf=pytest.vcf,
+         ind_samples=ref_ind,
+         chr_name="21",
+         anc_allele_file=None,
+         filter_missing=False,
+     )
+
+     vcf = allel.read_vcf(pytest.vcf, alt_number=1, samples=ref_ind["ref1"], region="21")
+
+     assert np.array_equal(ref_ind["ref1"], vcf["samples"])
+     assert np.array_equal(d["ref1"].POS, vcf["variants/POS"])
+     assert np.array_equal(d["ref1"].REF, vcf["variants/REF"])
+     assert np.array_equal(d["ref1"].ALT, vcf["variants/ALT"])
+     assert np.array_equal(d["ref1"].GT, vcf["calldata/GT"])
+
+
+ def test_read_data_from_file(data):
+     ploidy_config = PloidyConfig(
+         {
+             "ref": {"ref1": 2},
+             "tgt": {"tgt1": 2, "tgt2": 2},
+             "src": {"src1": 2, "src2": 2},
+         }
+     )
+
+     results = read_data(
+         vcf_file=pytest.vcf,
+         chr_name="21",
+         ref_ind_file=pytest.ref_ind_list,
+         tgt_ind_file=pytest.tgt_ind_list,
+         src_ind_file=None,
+         out_ind_file=None,
+         anc_allele_file=None,
+         filter_ref=False,
+         filter_tgt=False,
+         filter_src=False,
+         filter_out=False,
+         ploidy_config=ploidy_config,
+     )
+
+     rs = parse_ind_file(pytest.ref_ind_list)
+     ts = parse_ind_file(pytest.tgt_ind_list)
+
+     assert np.array_equal(rs, results["ref"][1])
+     assert np.array_equal(ts, results["tgt"][1])
+
+     ref_vcf = allel.read_vcf(pytest.vcf, alt_number=1, samples=rs["ref1"], region="21")
+     tgt_vcf = allel.read_vcf(pytest.vcf, alt_number=1, samples=ts["tgt2"], region="21")
+
+     assert np.array_equal(rs["ref1"], ref_vcf["samples"])
+     assert np.array_equal(ts["tgt2"], tgt_vcf["samples"])
+     assert np.array_equal(results["ref"][0]["ref1"].POS, ref_vcf["variants/POS"])
+     assert np.array_equal(results["ref"][0]["ref1"].REF, ref_vcf["variants/REF"])
+     assert np.array_equal(results["ref"][0]["ref1"].ALT, ref_vcf["variants/ALT"])
+     assert np.array_equal(
+         results["ref"][0]["ref1"].GT, ref_vcf["calldata/GT"].reshape(19, 4)
+     )
+     assert np.array_equal(results["tgt"][0]["tgt2"].POS, tgt_vcf["variants/POS"])
+     assert np.array_equal(results["tgt"][0]["tgt2"].REF, tgt_vcf["variants/REF"])
+     assert np.array_equal(results["tgt"][0]["tgt2"].ALT, tgt_vcf["variants/ALT"])
+     assert np.array_equal(
+         results["tgt"][0]["tgt2"].GT, tgt_vcf["calldata/GT"].reshape(19, 4)
+     )
+
+
+ def test_read_anc_allele(data):
+     anc_allele = read_anc_allele(pytest.anc_allele, "21")
+
+     exp_anc_allele = {"21": {2309: "G", 7879: "A", 11484: "-", 48989: "C"}}
+
+     assert anc_allele == exp_anc_allele
+
+
+ def test_get_ref_alt_allele(data):
+     tgt_ind = parse_ind_file(pytest.tgt_ind_list)
+     tgt_vcf = allel.read_vcf(
+         pytest.vcf, alt_number=1, samples=tgt_ind["tgt1"], region="21"
+     )
+
+     ref_allele, alt_allele = get_ref_alt_allele(
+         tgt_vcf["variants/REF"], tgt_vcf["variants/ALT"], tgt_vcf["variants/POS"]
+     )
+
+     exp_ref_allele = {
+         2309: "G",
+         7879: "C",
+         11484: "A",
+         16249: "A",
+         17324: "G",
+         19064: "G",
+         19124: "G",
+         23559: "G",
+         25354: "G",
+         26654: "G",
+         29724: "G",
+         30769: "C",
+         31319: "C",
+         37199: "C",
+         38009: "C",
+         39444: "C",
+         40809: "C",
+         45079: "C",
+         48989: "C",
+     }
+     exp_alt_allele = {
+         2309: "A",
+         7879: "A",
+         11484: "C",
+         16249: "C",
+         17324: "T",
+         19064: "T",
+         19124: "A",
+         23559: "A",
+         25354: "T",
+         26654: "C",
+         29724: "A",
+         30769: "T",
+         31319: "T",
+         37199: "T",
+         38009: "T",
+         39444: "T",
+         40809: "T",
+         45079: "T",
+         48989: "T",
+     }
+
+     assert ref_allele == exp_ref_allele
+     assert alt_allele == exp_alt_allele
+
+
+ def test_check_anc_allele(data):
+     ploidy_config = PloidyConfig(
+         {
+             "ref": {"ref1": 2},
+             "tgt": {"tgt1": 2, "tgt2": 2},
+             "src": {"src1": 2, "src2": 2},
+         }
+     )
+
+     data = read_data(
+         vcf_file=pytest.vcf,
+         chr_name="21",
+         ref_ind_file=pytest.ref_ind_list,
+         tgt_ind_file=pytest.tgt_ind_list,
+         src_ind_file=None,
+         out_ind_file=None,
+         anc_allele_file=pytest.anc_allele,
+         filter_ref=False,
+         filter_tgt=False,
+         ploidy_config=ploidy_config,
+     )
+
+     exp_ref_gt = allel.GenotypeArray(
+         [
+             [[0, 0], [0, 0]],
+             [[1, 1], [1, 1]],
+             [[0, 0], [0, 0]],
+         ],
+     )
+     exp_tgt_gt1 = allel.GenotypeArray(
+         [
+             [[1, 0], [0, 0]],
+             [[1, 1], [1, 0]],
+             [[0, 0], [0, 1]],
+         ],
+     )
+     exp_tgt_gt2 = allel.GenotypeArray(
+         [
+             [[0, 0], [0, 0]],
+             [[1, 1], [1, 1]],
+             [[0, 0], [0, 0]],
+         ],
+     )
+     exp_tgt_pos = [2309, 7879, 48989]
+
+     assert np.array_equal(data["ref"][0]["ref1"].GT, exp_ref_gt.reshape(3, 4))
+     assert np.array_equal(data["tgt"][0]["tgt1"].GT, exp_tgt_gt1.reshape(3, 4))
+     assert np.array_equal(data["tgt"][0]["tgt2"].GT, exp_tgt_gt2.reshape(3, 4))
+     assert np.array_equal(data["tgt"][0]["tgt1"].POS, exp_tgt_pos)
+     assert np.array_equal(data["tgt"][0]["tgt2"].POS, exp_tgt_pos)
+
+
+ # Test data setup for filter_fixed_variants
+ @pytest.fixture
+ def sample_data():
+     # Sample ChromosomeData with mixed fixed and non-fixed variants
+     return {
+         "pop1": ChromosomeData(
+             POS=np.array([100, 200, 300, 400]),
+             REF=np.array(["A", "G", "T", "C"]),
+             ALT=np.array(["C", "A", "G", "T"]),
+             GT=allel.GenotypeArray(
+                 [
+                     [[0, 0], [0, 0]],  # Fixed ref (AA, AA)
+                     [[1, 1], [1, 1]],  # Fixed alt (CC, CC)
+                     [[0, 1], [1, 1]],  # Mixed (AG, GG)
+                     [[0, 1], [0, 0]],  # Mixed (AC, AA)
+                 ]
+             ),
+         ),
+         "pop2": ChromosomeData(
+             POS=np.array([150, 250]),
+             REF=np.array(["T", "A"]),
+             ALT=np.array(["G", "C"]),
+             GT=allel.GenotypeArray(
+                 [
+                     [[0, 0], [0, 0]],  # Fixed ref (TT, TT)
+                     [[1, 1], [1, 1]],  # Fixed alt (CC, CC)
+                 ]
+             ),
+         ),
+     }
+
+
+ @pytest.fixture
+ def sample_info():
+     # Sample information for two individuals in 'pop1'
+     return {
+         "pop1": ["sample1", "sample2"],
+         "pop2": ["sample3", "sample4"],
+     }
+
+
+ def test_filter_fixed_variants(sample_data, sample_info):
+     # Apply the filter_fixed_variants function
+     filtered_data = filter_fixed_variants(sample_data, sample_info)
+
+     # Verify that fixed variants are removed
+     assert "pop1" in filtered_data
+     assert "pop2" in filtered_data
+
+     # Check that only non-fixed variants are retained for chr1
+     pop1_data = filtered_data["pop1"]
+     assert pop1_data.POS.tolist() == [300, 400]  # Positions with mixed genotypes
+     assert pop1_data.REF.tolist() == ["T", "C"]
+     assert pop1_data.ALT.tolist() == ["G", "T"]
+     assert pop1_data.GT.shape == (2, 2, 2)
+
+     # Verify that all variants in chr2 are filtered out, as they are fixed
+     pop2_data = filtered_data["pop2"]
+     assert pop2_data.POS.size == 0
+     assert pop2_data.REF.size == 0
+     assert pop2_data.ALT.size == 0
+     assert pop2_data.GT.size == 0
+
+
+ # Test data setup for flip_snps
+ @pytest.fixture
+ def sample_chromosome_data():
+     # Sample ChromosomeData with genotypes to test flipping
+     return ChromosomeData(
+         POS=np.array([100, 200, 300, 400]),
+         REF=np.array(["A", "G", "T", "C"]),
+         ALT=np.array(["C", "A", "G", "T"]),
+         GT=allel.GenotypeArray(
+             [
+                 [[0, 1], [1, 1]],  # Mixed genotype, should be flipped
+                 [[1, 1], [1, 1]],  # ALT fixed, should be flipped
+                 [[0, 0], [0, 1]],  # Mixed genotype, should remain unchanged
+                 [[1, 0], [0, 0]],  # Mixed genotype, should be flipped
+             ]
+         ),
+     )
+
+
+ def test_flip_snps(sample_chromosome_data):
+     # Define SNP positions to be flipped
+     flipped_snps = [100, 200, 400]
+
+     # Apply the flip_snps function
+     flip_snps(sample_chromosome_data, flipped_snps)
+
+     # Check flipped genotypes
+     # Position 100: original [[0, 1], [1, 1]] -> flipped [[1, 0], [0, 0]]
+     assert sample_chromosome_data.GT[0].tolist() == [[1, 0], [0, 0]]
+     # Position 200: original [[1, 1], [1, 1]] -> flipped [[0, 0], [0, 0]]
+     assert sample_chromosome_data.GT[1].tolist() == [[0, 0], [0, 0]]
+     # Position 300: original [[0, 0], [0, 1]] -> should remain unchanged
+     assert sample_chromosome_data.GT[2].tolist() == [[0, 0], [0, 1]]
+     # Position 400: original [[1, 0], [0, 0]] -> flipped [[0, 1], [1, 1]]
+     assert sample_chromosome_data.GT[3].tolist() == [[0, 1], [1, 1]]
+
+
+ # Sample test function for split_genome
+ def test_split_genome():
+     # Test case 1: Basic case with regular windows
+     pos = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
+     window_size = 30
+     step_size = 20
+     result = split_genome(pos, window_size, step_size)
+     expected = [(1, 30), (21, 50), (41, 70), (61, 90), (81, 110)]
+     assert result == expected, f"Expected {expected}, but got {result}"
+
+     # Test case 2: Step size is larger than window size
+     pos = np.array([0, 10, 20, 30, 40, 50])
+     window_size = 20
+     step_size = 25
+
+     with pytest.raises(
+         ValueError, match="`step_size` cannot be greater than `window_size`"
+     ):
+         split_genome(pos, window_size, step_size)
+
+     # Test case 3: Handle empty `pos` array
+     pos = np.array([])
+     window_size = 30
+     step_size = 10
+     with pytest.raises(ValueError, match="`pos` array must not be empty"):
+         split_genome(pos, window_size, step_size)
+
+
+ def test_natsorted_df_correct_order():
+     df = pd.DataFrame(
+         {
+             "Chrom": ["1", "10", "2", "X", "1"],
+             "Start": [300, 50, 150, 10, 100],
+             "End": [400, 100, 200, 50, 200],
+         }
+     )
+
+     sorted_df = natsorted_df(df)
+
+     expected_df = pd.DataFrame(
+         {
+             "Chrom": ["1", "1", "2", "10", "X"],
+             "Start": [100, 300, 150, 50, 10],
+             "End": [200, 400, 200, 100, 50],
+         }
+     ).reset_index(drop=True)
+
+     pd.testing.assert_frame_equal(sorted_df, expected_df)
+
+
+ def test_natsorted_df_missing_columns():
+     df_missing = pd.DataFrame(
+         {
+             "Chrom": ["1", "2", "X"],
+             "Start": [100, 200, 300],
+         }
+     )
+
+     with pytest.raises(ValueError, match="Missing required columns: End"):
+         natsorted_df(df_missing)
+
+
+ def test_natsorted_df_empty_dataframe():
+     df_empty = pd.DataFrame(columns=["Chrom", "Start", "End"])
+     sorted_df = natsorted_df(df_empty)
+
+     assert sorted_df.empty
+
+
+ def test_natsorted_df_single_row():
+     df_single = pd.DataFrame({"Chrom": ["1"], "Start": [100], "End": [200]})
+
+     sorted_df = natsorted_df(df_single)
+
+     pd.testing.assert_frame_equal(sorted_df, df_single)
+
+
+ def test_natsorted_df_integer_start_end():
+     df_mixed_types = pd.DataFrame(
+         {
+             "Chrom": ["1", "2", "X"],
+             "Start": ["100", "200", "300"],
+             "End": ["150", "250", "350"],
+         }
+     )
+
+     sorted_df = natsorted_df(df_mixed_types)
+
+     assert sorted_df["Start"].dtype == int
+     assert sorted_df["End"].dtype == int
sai/parsers/plot_parser.py
@@ -1,152 +0,0 @@
- # Copyright 2025 Xin Huang
- #
- # GNU General Public License v3.0
- #
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU General Public License as published by
- # the Free Software Foundation, either version 3 of the License, or
- # (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with this program. If not, please see
- #
- # https://www.gnu.org/licenses/gpl-3.0.en.html
-
-
- import argparse
- from sai.parsers.argument_validation import positive_int
- from sai.parsers.argument_validation import positive_number
- from sai.parsers.argument_validation import existed_file
- from sai.sai import plot
-
-
- def _run_plot(args: argparse.Namespace) -> None:
-     """
-     Runs the plotting process based on command-line arguments.
-
-     Parameters
-     ----------
-     args : argparse.Namespace
-         Parsed command-line arguments containing input files, output file,
-         xlabel, ylabel, title, figsize_x, figsize_y, dpi, alpha,
-         marker_size, marker_color, and marker_style.
-     """
-     plot(
-         u_file=args.u_file,
-         q_file=args.q_file,
-         output=args.output,
-         xlabel=args.xlabel,
-         ylabel=args.ylabel,
-         title=args.title,
-         figsize_x=args.figsize_x,
-         figsize_y=args.figsize_y,
-         dpi=args.dpi,
-         alpha=args.alpha,
-         marker_size=args.marker_size,
-         marker_color=args.marker_color,
-         marker_style=args.marker_style,
-     )
-
-
- def add_plot_parser(subparsers: argparse.ArgumentParser) -> None:
-     """
-     Initializes and configures the command-line interface parser
-     for the plot subcommand.
-
-     Parameters
-     ----------
-     subparsers : argparse.ArgumentParser
-         A command-line interface parser to be configured.
-     """
-     parser = subparsers.add_parser(
-         "plot", help="Generate a scatter plot of U vs Q statistics."
-     )
-     parser.add_argument(
-         "--u-file",
-         dest="u_file",
-         type=existed_file,
-         required=True,
-         help="Path to the U score/outlier file.",
-     )
-     parser.add_argument(
-         "--q-file",
-         dest="q_file",
-         type=existed_file,
-         required=True,
-         help="Path to the Q score/outlier file.",
-     )
-     parser.add_argument(
-         "--output",
-         type=str,
-         required=True,
-         help="Path to save the output plot file. The format depends on the file extension (e.g., `.png`, `.pdf`).",
-     )
-     parser.add_argument(
-         "--xlabel",
-         type=str,
-         default="Q Statistic",
-         help="Label for the X-axis. Default: Q Statistic.",
-     )
-     parser.add_argument(
-         "--ylabel",
-         type=str,
-         default="U Statistic",
-         help="Label for the Y-axis. Default: U Statistic.",
-     )
-     parser.add_argument(
-         "--title",
-         type=str,
-         default="Scatter Plot of U vs Q",
-         help="Title of the plot. Default: Scatter Plot of U vs Q.",
-     )
-     parser.add_argument(
-         "--figsize-x",
-         type=positive_number,
-         default=6,
-         help="Width of the figure (in inches). Default: 6.",
-     )
-     parser.add_argument(
-         "--figsize-y",
-         type=positive_number,
-         default=6,
-         help="Height of the figure (in inches). Default: 6.",
-     )
-     parser.add_argument(
-         "--dpi",
-         type=positive_int,
-         default=300,
-         help="Resolution of the saved plot. Default: 300.",
-     )
-     parser.add_argument(
-         "--alpha",
-         type=positive_number,
-         default=0.6,
-         help="Transparency level of scatter points. Default: 0.6.",
-     )
-     parser.add_argument(
-         "--marker-size",
-         dest="marker_size",
-         type=positive_number,
-         default=20,
-         help="Size of the scatter plot markers. See matplotlib.pyplot.scatter. Default: 20.",
-     )
-     parser.add_argument(
-         "--marker-color",
-         dest="marker_color",
-         type=str,
-         default="blue",
-         help="Color of the markers. See matplotlib.pyplot.scatter. Default: blue.",
-     )
-     parser.add_argument(
-         "--marker-style",
-         dest="marker_style",
-         type=str,
-         default="o",
-         help="Shape of the markers. See matplotlib.pyplot.scatter. Default: o.",
-     )
-     parser.set_defaults(runner=_run_plot)