sai-pg 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. sai/__init__.py +2 -0
  2. sai/__main__.py +6 -3
  3. sai/configs/__init__.py +24 -0
  4. sai/configs/global_config.py +83 -0
  5. sai/configs/ploidy_config.py +94 -0
  6. sai/configs/pop_config.py +82 -0
  7. sai/configs/stat_config.py +220 -0
  8. sai/{utils/generators → generators}/chunk_generator.py +1 -1
  9. sai/{utils/generators → generators}/window_generator.py +81 -37
  10. sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
  11. sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
  12. sai/parsers/outlier_parser.py +4 -3
  13. sai/parsers/score_parser.py +8 -119
  14. sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
  15. sai/preprocessors/feature_preprocessor.py +236 -0
  16. sai/registries/__init__.py +22 -0
  17. sai/registries/generic_registry.py +89 -0
  18. sai/registries/stat_registry.py +30 -0
  19. sai/sai.py +124 -220
  20. sai/stats/__init__.py +11 -0
  21. sai/stats/danc_statistic.py +83 -0
  22. sai/stats/dd_statistic.py +77 -0
  23. sai/stats/df_statistic.py +84 -0
  24. sai/stats/dplus_statistic.py +86 -0
  25. sai/stats/fd_statistic.py +92 -0
  26. sai/stats/generic_statistic.py +93 -0
  27. sai/stats/q_statistic.py +104 -0
  28. sai/stats/stat_utils.py +259 -0
  29. sai/stats/u_statistic.py +99 -0
  30. sai/utils/utils.py +213 -142
  31. {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
  32. sai_pg-1.1.0.dist-info/RECORD +70 -0
  33. {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
  34. sai_pg-1.1.0.dist-info/top_level.txt +2 -0
  35. tests/configs/test_global_config.py +163 -0
  36. tests/configs/test_ploidy_config.py +93 -0
  37. tests/configs/test_pop_config.py +90 -0
  38. tests/configs/test_stat_config.py +171 -0
  39. tests/generators/test_chunk_generator.py +51 -0
  40. tests/generators/test_window_generator.py +164 -0
  41. tests/multiprocessing/test_mp_manager.py +92 -0
  42. tests/multiprocessing/test_mp_pool.py +79 -0
  43. tests/parsers/test_argument_validation.py +133 -0
  44. tests/parsers/test_outlier_parser.py +53 -0
  45. tests/parsers/test_score_parser.py +63 -0
  46. tests/preprocessors/test_chunk_preprocessor.py +79 -0
  47. tests/preprocessors/test_feature_preprocessor.py +223 -0
  48. tests/registries/test_registries.py +74 -0
  49. tests/stats/test_danc_statistic.py +51 -0
  50. tests/stats/test_dd_statistic.py +45 -0
  51. tests/stats/test_df_statistic.py +73 -0
  52. tests/stats/test_dplus_statistic.py +79 -0
  53. tests/stats/test_fd_statistic.py +68 -0
  54. tests/stats/test_q_statistic.py +268 -0
  55. tests/stats/test_stat_utils.py +354 -0
  56. tests/stats/test_u_statistic.py +233 -0
  57. tests/test___main__.py +51 -0
  58. tests/test_sai.py +102 -0
  59. tests/utils/test_utils.py +511 -0
  60. sai/parsers/plot_parser.py +0 -152
  61. sai/stats/features.py +0 -302
  62. sai/utils/preprocessors/feature_preprocessor.py +0 -211
  63. sai_pg-1.0.1.dist-info/RECORD +0 -30
  64. sai_pg-1.0.1.dist-info/top_level.txt +0 -1
  65. /sai/{utils/generators → generators}/__init__.py +0 -0
  66. /sai/{utils/generators → generators}/data_generator.py +0 -0
  67. /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
  68. /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
  69. /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
  70. {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
  71. {sai_pg-1.0.1.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,233 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ import pytest
22
+ import numpy as np
23
+ from sai.stats import UStatistic
24
+
25
+
26
def test_UStatistic_compute_basic():
    """Exercise ``UStatistic.compute`` on a three-site, single-source example.

    One ``UStatistic`` instance (all ploidies set to 1) is queried twice:
    first with the source condition ``("=", 0)``, then with ``("=", 1)``.
    """
    site_positions = np.array([0, 1, 2])
    u_stat = UStatistic(
        ref_gts=np.array([[0, 0, 1], [0, 0, 0], [1, 1, 1]]),
        tgt_gts=np.array([[1, 1, 1], [1, 0, 0], [0, 1, 0]]),
        src_gts_list=[np.array([[0, 0, 0], [1, 1, 1], [1, 0, 1]])],
        ref_ploidy=1,
        tgt_ploidy=1,
        src_ploidy_list=[1],
    )

    # Scenario 1: source condition ("=", 0); only the first site is expected.
    expected_result = 1
    expected_positions = np.array([0])
    results = u_stat.compute(
        pos=site_positions,
        w=0.5,
        x=0.5,
        y_list=[("=", 0)],
        anc_allele_available=False,
    )

    assert results["name"] == "U"
    assert (
        results["value"] == expected_result
    ), f"Expected {expected_result}, got {results['value']}"
    assert np.array_equal(results["cdd_pos"], expected_positions)

    # Scenario 2: source condition ("=", 1); no site is expected.
    expected_result = 0
    expected_positions = np.array([])
    results = u_stat.compute(
        pos=site_positions,
        w=0.5,
        x=0.5,
        y_list=[("=", 1)],
        anc_allele_available=True,
    )

    assert (
        results["value"] == expected_result
    ), f"Expected {expected_result}, got {results['value']}"
    assert np.array_equal(results["cdd_pos"], expected_positions)
74
+
75
+
76
def test_UStatistic_compute_no_match():
    """``compute`` should report 0 and no positions when no site qualifies."""
    # Expectation: nothing passes with w=0.3, x=0.5 and source condition ("=", 0).
    expected_result = 0
    expected_positions = np.array([])

    u_stat = UStatistic(
        ref_gts=np.array([[0, 1, 1], [1, 1, 1]]),
        tgt_gts=np.array([[0, 0, 0], [1, 0, 1]]),
        src_gts_list=[np.array([[1, 1, 1], [1, 1, 1]])],
        ref_ploidy=1,
        tgt_ploidy=1,
        src_ploidy_list=[1],
    )
    results = u_stat.compute(
        pos=np.array([0, 1]),
        w=0.3,
        x=0.5,
        y_list=[("=", 0)],
        anc_allele_available=False,
    )

    assert (
        results["value"] == expected_result
    ), f"Expected {expected_result}, got {results['value']}"
    assert np.array_equal(results["cdd_pos"], expected_positions)
109
+
110
+
111
def test_UStatistic_compute_all_match():
    """``compute`` should count both sites when every site qualifies."""
    # Expectation: both sites pass with w=0.5, x=0.5 and source condition ("=", 0).
    expected_result = 2
    expected_positions = np.array([0, 1])

    all_zero = [[0, 0, 0], [0, 0, 0]]
    all_one = [[1, 1, 1], [1, 1, 1]]
    u_stat = UStatistic(
        ref_gts=np.array(all_zero),
        tgt_gts=np.array(all_one),
        src_gts_list=[np.array(all_zero)],
        ref_ploidy=1,
        tgt_ploidy=1,
        src_ploidy_list=[1],
    )
    results = u_stat.compute(
        pos=np.array([0, 1]),
        w=0.5,
        x=0.5,
        y_list=[("=", 0)],
        anc_allele_available=False,
    )

    assert (
        results["value"] == expected_result
    ), f"Expected {expected_result}, got {results['value']}"
    assert np.array_equal(results["cdd_pos"], expected_positions)
144
+
145
+
146
def test_UStatistic_compute_with_two_sources():
    """``compute`` with two source populations, each with condition ``("=", 1.0)``."""
    # Both sources use the same genotype values but are passed as separate arrays.
    source_rows = [[1, 1, 1], [0, 1, 1], [1, 1, 1]]
    first_source = np.array(source_rows)
    second_source = np.array(source_rows)

    # Expectation: exactly one locus (position 0) satisfies the reference,
    # target and both source conditions.
    expected_result = 1
    expected_positions = np.array([0])

    u_stat = UStatistic(
        ref_gts=np.array([[0, 0, 1], [0, 0, 0], [1, 1, 1]]),
        tgt_gts=np.array([[0, 1, 1], [0, 0, 1], [1, 1, 1]]),
        src_gts_list=[first_source, second_source],
        ref_ploidy=1,
        tgt_ploidy=1,
        src_ploidy_list=[1, 1],
    )
    results = u_stat.compute(
        pos=np.array([0, 1, 2]),
        w=0.5,
        x=0.5,
        y_list=[("=", 1.0), ("=", 1.0)],
        anc_allele_available=False,
    )

    assert (
        results["value"] == expected_result
    ), f"Expected {expected_result}, got {results['value']}"
    assert np.array_equal(results["cdd_pos"], expected_positions)
179
+
180
+
181
def test_UStatistic_compute_with_mixed_ploidy():
    """``compute`` when reference, target and source use ploidies 3, 1 and 2."""
    expected_result = 2
    expected_positions = np.array([0, 2])

    u_stat = UStatistic(
        ref_gts=np.array([[0, 1, 0], [0, 1, 0], [2, 1, 0]]),
        tgt_gts=np.array([[1, 1, 0], [1, 1, 1], [1, 1, 1]]),
        src_gts_list=[np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]])],
        ref_ploidy=3,
        tgt_ploidy=1,
        src_ploidy_list=[2],
    )
    results = u_stat.compute(
        pos=np.array([0, 1, 2]),
        w=0.5,
        x=0.5,
        y_list=[("=", 0)],
        anc_allele_available=False,
    )

    assert (
        results["value"] == expected_result
    ), f"Expected {expected_result}, got {results['value']}"
    assert np.array_equal(results["cdd_pos"], expected_positions)
210
+
211
+
212
def test_UStatistic_compute_with_missing_keys():
    """``compute`` must raise ``ValueError`` when called without
    ``anc_allele_available``.

    Raises:
        Fails (via ``pytest.raises``) if ``compute`` does not raise
        ``ValueError`` for the call below.
    """
    ref_gts = np.array([[0, 1, 0], [0, 1, 0], [2, 1, 0]])
    tgt_gts = np.array([[1, 1, 0], [1, 1, 1], [1, 1, 1]])
    src_gts = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]])
    pos = np.array([0, 1, 2])
    w, x, y = 0.5, 0.5, ("=", 0)

    # Construct outside the ``raises`` block: a ValueError coming from the
    # constructor must fail this test instead of silently satisfying it.
    u_stat = UStatistic(
        ref_gts=ref_gts,
        tgt_gts=tgt_gts,
        src_gts_list=[src_gts],
        ref_ploidy=3,
        tgt_ploidy=1,
        src_ploidy_list=[2],
    )

    # Only the call under test sits inside the context manager.
    with pytest.raises(ValueError):
        u_stat.compute(
            pos=pos,
            w=w,
            x=x,
            y_list=[y],
        )
tests/test___main__.py ADDED
@@ -0,0 +1,51 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ from unittest.mock import patch, MagicMock
22
+ from sai.__main__ import main
23
+
24
+
25
@patch("sai.__main__._sai_cli_parser")  # replaced so the parser output is controlled
@patch("sai.__main__._set_sigpipe_handler")  # replaced; not the subject of this test
def test_main(mock_set_sigpipe_handler, mock_sai_cli_parser):
    """``main`` should install the SIGPIPE handler, parse argv, and call the runner."""
    # Parsed-arguments stand-in whose ``runner`` attribute records its calls.
    parsed_args = MagicMock()
    parsed_args.runner = MagicMock()

    # Parser stand-in: ``parse_args`` hands back the object prepared above,
    # and the patched factory hands back this parser.
    fake_parser = MagicMock()
    fake_parser.parse_args.return_value = parsed_args
    mock_sai_cli_parser.return_value = fake_parser

    cli_args = ["score", "--vcf", "tests/data/example.vcf", "--chr-name", "chr1"]
    main(cli_args)

    # The handler was installed exactly once.
    mock_set_sigpipe_handler.assert_called_once()
    # The argument list reached the parser unchanged.
    fake_parser.parse_args.assert_called_once_with(cli_args)
    # The runner received the parsed arguments.
    parsed_args.runner.assert_called_once_with(parsed_args)
tests/test_sai.py ADDED
@@ -0,0 +1,102 @@
1
+ # Copyright 2025 Xin Huang
2
+ #
3
+ # GNU General Public License v3.0
4
+ #
5
+ # This program is free software: you can redistribute it and/or modify
6
+ # it under the terms of the GNU General Public License as published by
7
+ # the Free Software Foundation, either version 3 of the License, or
8
+ # (at your option) any later version.
9
+ #
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ # GNU General Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License
16
+ # along with this program. If not, please see
17
+ #
18
+ # https://www.gnu.org/licenses/gpl-3.0.en.html
19
+
20
+
21
+ import pytest
22
+ import pandas as pd
23
+ import sai.stats
24
+ from sai.sai import score, outlier
25
+
26
+
27
@pytest.fixture
def example_data(tmp_path):
    """Provide example input paths plus a per-test temporary output location.

    Returns a dict with the keys ``vcf_file``, ``output_file``,
    ``output_dir`` and ``config``.
    """
    # NOTE(review): the paths are also stored as attributes on the ``pytest``
    # module; nothing visible here reads them back -- confirm whether any
    # other test relies on this before removing it.
    pytest.example_vcf = "tests/data/example.vcf"
    pytest.example_config = "tests/data/test_sai.config.yaml"

    return dict(
        vcf_file=pytest.example_vcf,
        # Output file for ``score`` lives inside pytest's temporary directory.
        output_file=str(tmp_path / "output.tsv"),
        output_dir=tmp_path,
        config=pytest.example_config,
    )
42
+
43
+
44
def test_score(example_data):
    """Run ``score`` on the example VCF and check the first value of the Q column."""
    output_path = example_data["output_file"]

    score(
        vcf_file=example_data["vcf_file"],
        chr_name="21",
        win_len=6666,
        win_step=6666,
        anc_allele_file=None,
        output_file=output_path,
        config=example_data["config"],
        num_workers=1,
    )

    # Load the tab-separated result and pick the first column starting with "Q".
    scores = pd.read_csv(output_path, sep="\t")
    q_columns = [name for name in scores.columns if name.startswith("Q")]
    first_q_value = scores[q_columns[0]].iloc[0]

    assert first_q_value == 0.9, "Unexpected value in 'Q' column"
63
+
64
+
65
def test_score_mixed_ploidy(example_data):
    """Run ``score`` on the mixed-ploidy VCF and check the first two U values."""
    output_path = example_data["output_file"]

    score(
        vcf_file="tests/data/test.mixed.ploidy.data.vcf",
        chr_name="21",
        win_len=50000,
        win_step=50000,
        anc_allele_file=None,
        output_file=output_path,
        config="tests/data/test_mixed_ploidy.config.yaml",
        num_workers=1,
    )

    u_values = pd.read_csv(output_path, sep="\t")["U"]

    assert u_values.iloc[0] == 0, "Unexpected value in 'U' column"
    assert u_values.iloc[1] == 1, "Unexpected value in 'U' column"
81
+
82
+
83
def test_outlier(example_data):
    """Run ``outlier`` at quantiles 0.25 and 0.75 and check the leading Q values."""
    score_path = "tests/data/test.q.scores"
    output_prefix = f"{example_data['output_dir']}/outliers"

    # Quantile 0.25: the first reported Q value is expected to be 0.7.
    outlier(score_file=score_path, output_prefix=output_prefix, quantile=0.25)
    lower = pd.read_csv(f"{output_prefix}.Q.0.25.outliers.tsv", sep="\t")
    assert lower["Q"].iloc[0] == 0.7

    # Quantile 0.75: the first two reported Q values are expected to be 1.0.
    outlier(score_file=score_path, output_prefix=output_prefix, quantile=0.75)
    upper = pd.read_csv(f"{output_prefix}.Q.0.75.outliers.tsv", sep="\t")
    assert upper["Q"].iloc[0] == 1.0
    assert upper["Q"].iloc[1] == 1.0