sai-pg 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. sai/__init__.py +2 -0
  2. sai/__main__.py +6 -3
  3. sai/configs/__init__.py +24 -0
  4. sai/configs/global_config.py +83 -0
  5. sai/configs/ploidy_config.py +94 -0
  6. sai/configs/pop_config.py +82 -0
  7. sai/configs/stat_config.py +220 -0
  8. sai/{utils/generators → generators}/chunk_generator.py +2 -8
  9. sai/{utils/generators → generators}/window_generator.py +82 -37
  10. sai/{utils/multiprocessing → multiprocessing}/mp_manager.py +2 -2
  11. sai/{utils/multiprocessing → multiprocessing}/mp_pool.py +2 -2
  12. sai/parsers/outlier_parser.py +4 -3
  13. sai/parsers/score_parser.py +8 -119
  14. sai/{utils/preprocessors → preprocessors}/chunk_preprocessor.py +21 -15
  15. sai/preprocessors/feature_preprocessor.py +236 -0
  16. sai/registries/__init__.py +22 -0
  17. sai/registries/generic_registry.py +89 -0
  18. sai/registries/stat_registry.py +30 -0
  19. sai/sai.py +124 -220
  20. sai/stats/__init__.py +11 -0
  21. sai/stats/danc_statistic.py +83 -0
  22. sai/stats/dd_statistic.py +77 -0
  23. sai/stats/df_statistic.py +84 -0
  24. sai/stats/dplus_statistic.py +86 -0
  25. sai/stats/fd_statistic.py +92 -0
  26. sai/stats/generic_statistic.py +93 -0
  27. sai/stats/q_statistic.py +104 -0
  28. sai/stats/stat_utils.py +259 -0
  29. sai/stats/u_statistic.py +99 -0
  30. sai/utils/utils.py +220 -143
  31. {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/METADATA +3 -14
  32. sai_pg-1.1.0.dist-info/RECORD +70 -0
  33. {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/WHEEL +1 -1
  34. sai_pg-1.1.0.dist-info/top_level.txt +2 -0
  35. tests/configs/test_global_config.py +163 -0
  36. tests/configs/test_ploidy_config.py +93 -0
  37. tests/configs/test_pop_config.py +90 -0
  38. tests/configs/test_stat_config.py +171 -0
  39. tests/generators/test_chunk_generator.py +51 -0
  40. tests/generators/test_window_generator.py +164 -0
  41. tests/multiprocessing/test_mp_manager.py +92 -0
  42. tests/multiprocessing/test_mp_pool.py +79 -0
  43. tests/parsers/test_argument_validation.py +133 -0
  44. tests/parsers/test_outlier_parser.py +53 -0
  45. tests/parsers/test_score_parser.py +63 -0
  46. tests/preprocessors/test_chunk_preprocessor.py +79 -0
  47. tests/preprocessors/test_feature_preprocessor.py +223 -0
  48. tests/registries/test_registries.py +74 -0
  49. tests/stats/test_danc_statistic.py +51 -0
  50. tests/stats/test_dd_statistic.py +45 -0
  51. tests/stats/test_df_statistic.py +73 -0
  52. tests/stats/test_dplus_statistic.py +79 -0
  53. tests/stats/test_fd_statistic.py +68 -0
  54. tests/stats/test_q_statistic.py +268 -0
  55. tests/stats/test_stat_utils.py +354 -0
  56. tests/stats/test_u_statistic.py +233 -0
  57. tests/test___main__.py +51 -0
  58. tests/test_sai.py +102 -0
  59. tests/utils/test_utils.py +511 -0
  60. sai/parsers/plot_parser.py +0 -152
  61. sai/stats/features.py +0 -302
  62. sai/utils/preprocessors/feature_preprocessor.py +0 -211
  63. sai_pg-1.0.0.dist-info/RECORD +0 -30
  64. sai_pg-1.0.0.dist-info/top_level.txt +0 -1
  65. /sai/{utils/generators → generators}/__init__.py +0 -0
  66. /sai/{utils/generators → generators}/data_generator.py +0 -0
  67. /sai/{utils/multiprocessing → multiprocessing}/__init__.py +0 -0
  68. /sai/{utils/preprocessors → preprocessors}/__init__.py +0 -0
  69. /sai/{utils/preprocessors → preprocessors}/data_preprocessor.py +0 -0
  70. {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/entry_points.txt +0 -0
  71. {sai_pg-1.0.0.dist-info → sai_pg-1.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -18,10 +18,12 @@
18
18
  # https://www.gnu.org/licenses/gpl-3.0.en.html
19
19
 
20
20
 
21
+ import numpy as np
21
22
  from itertools import combinations, product
22
23
  from typing import Iterator, Any
23
24
  from sai.utils import read_data, split_genome
24
- from sai.utils.generators import DataGenerator
25
+ from sai.generators import DataGenerator
26
+ from sai.configs import PloidyConfig
25
27
 
26
28
 
27
29
  class WindowGenerator(DataGenerator):
@@ -37,8 +39,10 @@ class WindowGenerator(DataGenerator):
37
39
  ref_ind_file: str,
38
40
  tgt_ind_file: str,
39
41
  src_ind_file: str,
42
+ out_ind_file: str,
40
43
  win_len: int,
41
44
  win_step: int,
45
+ ploidy_config: PloidyConfig,
42
46
  start: int = None,
43
47
  end: int = None,
44
48
  anc_allele_file: str = None,
@@ -59,10 +63,14 @@ class WindowGenerator(DataGenerator):
59
63
  The path to the file containing identifiers for target populations.
60
64
  src_ind_file : str
61
65
  The path to the file containing identifiers for source populations.
66
+ out_ind_file : str
67
+ The path to the file containing identifiers for outgroup populations.
62
68
  win_len : int
63
69
  The length of each window in base pairs.
64
70
  win_step : int
65
71
  The step size between windows in base pairs.
72
+ ploidy_config : PloidyConfig
73
+ Configuration specifying ploidy levels for each population involved in the analysis.
66
74
  start: int, optional
67
75
  The starting position (1-based, inclusive) on the chromosome. Default: None.
68
76
  end: int, optional
@@ -88,17 +96,10 @@ class WindowGenerator(DataGenerator):
88
96
  self.win_step = win_step
89
97
  self.num_src = num_src
90
98
  self.chr_name = chr_name
99
+ self.ploidy_config = ploidy_config
91
100
 
92
101
  # Load data
93
- (
94
- self.ref_data,
95
- self.ref_samples,
96
- self.tgt_data,
97
- self.tgt_samples,
98
- self.src_data,
99
- self.src_samples,
100
- self.ploidy,
101
- ) = read_data(
102
+ results = read_data(
102
103
  vcf_file=vcf_file,
103
104
  chr_name=self.chr_name,
104
105
  start=start,
@@ -106,13 +107,25 @@ class WindowGenerator(DataGenerator):
106
107
  ref_ind_file=ref_ind_file,
107
108
  tgt_ind_file=tgt_ind_file,
108
109
  src_ind_file=src_ind_file,
110
+ out_ind_file=out_ind_file,
111
+ ploidy_config=ploidy_config,
109
112
  anc_allele_file=anc_allele_file,
110
113
  is_phased=False,
111
114
  filter_ref=False,
112
115
  filter_tgt=False,
113
116
  filter_src=False,
117
+ filter_missing=True,
114
118
  )
115
119
 
120
+ self.ref_data = results["ref"][0]
121
+ self.tgt_data = results["tgt"][0]
122
+ self.src_data = results["src"][0]
123
+ self.out_data = results["outgroup"][0]
124
+ self.ref_samples = results["ref"][1]
125
+ self.tgt_samples = results["tgt"][1]
126
+ self.src_samples = results["src"][1]
127
+ self.out_samples = results["outgroup"][1]
128
+
116
129
  self.src_combinations = list(
117
130
  combinations(self.src_samples.keys(), self.num_src)
118
131
  )
@@ -125,6 +138,7 @@ class WindowGenerator(DataGenerator):
125
138
  ),
126
139
  window_size=self.win_len,
127
140
  step_size=self.win_step,
141
+ start=start,
128
142
  )
129
143
  for tgt_pop in self.tgt_samples
130
144
  }
@@ -148,39 +162,70 @@ class WindowGenerator(DataGenerator):
148
162
  for ref_pop, tgt_pop, src_comb in product(
149
163
  self.ref_samples, self.tgt_samples, self.src_combinations
150
164
  ):
151
- tgt_pos = self.tgt_data[tgt_pop].POS
152
165
  for start, end in self.tgt_windows[tgt_pop]:
153
- ref_gts = self.ref_data[ref_pop].GT[
154
- (self.ref_data[ref_pop].POS >= start)
155
- & (self.ref_data[ref_pop].POS < end)
166
+ ref_data = self.ref_data[ref_pop]
167
+ tgt_data = self.tgt_data[tgt_pop]
168
+ src_data_list = [self.src_data[src_pop] for src_pop in src_comb]
169
+
170
+ ref_mask = (ref_data.POS >= start) & (ref_data.POS <= end)
171
+ tgt_mask = (tgt_data.POS >= start) & (tgt_data.POS <= end)
172
+ src_masks = [
173
+ (src_data.POS >= start) & (src_data.POS <= end)
174
+ for src_data in src_data_list
156
175
  ]
157
- tgt_gts = self.tgt_data[tgt_pop].GT[
158
- (self.tgt_data[tgt_pop].POS >= start)
159
- & (self.tgt_data[tgt_pop].POS < end)
176
+
177
+ ref_pos = ref_data.POS[ref_mask]
178
+ tgt_pos = tgt_data.POS[tgt_mask]
179
+ src_pos_list = [
180
+ src_data.POS[mask]
181
+ for src_data, mask in zip(src_data_list, src_masks)
160
182
  ]
183
+
184
+ common_pos = np.intersect1d(ref_pos, tgt_pos)
185
+ for src_pos in src_pos_list:
186
+ common_pos = np.intersect1d(common_pos, src_pos)
187
+
188
+ ref_gts = ref_data.GT.compress(
189
+ np.isin(ref_data.POS, common_pos), axis=0
190
+ )
191
+ tgt_gts = tgt_data.GT.compress(
192
+ np.isin(tgt_data.POS, common_pos), axis=0
193
+ )
161
194
  src_gts_list = [
162
- self.src_data[src_pop].GT[
163
- (self.src_data[src_pop].POS >= start)
164
- & (self.src_data[src_pop].POS < end)
165
- ]
166
- for src_pop in src_comb
195
+ src_data.GT.compress(np.isin(src_data.POS, common_pos), axis=0)
196
+ for src_data in src_data_list
167
197
  ]
168
198
 
169
- sub_pos = tgt_pos[(tgt_pos >= start) & (tgt_pos < end)]
199
+ sub_pos = common_pos
170
200
 
171
- yield {
172
- "chr_name": self.chr_name,
173
- "ref_pop": ref_pop,
174
- "tgt_pop": tgt_pop,
175
- "src_pop_list": src_comb, # List of source populations in this combination
176
- "start": start,
177
- "end": end,
178
- "pos": sub_pos,
179
- "ref_gts": ref_gts,
180
- "tgt_gts": tgt_gts,
181
- "src_gts_list": src_gts_list, # List of genotypes for each source population in src_comb
182
- "ploidy": self.ploidy,
183
- }
201
+ if len(sub_pos) == 0:
202
+ yield {
203
+ "chr_name": self.chr_name,
204
+ "ref_pop": ref_pop,
205
+ "tgt_pop": tgt_pop,
206
+ "src_pop_list": src_comb,
207
+ "start": start,
208
+ "end": end,
209
+ "pos": [],
210
+ "ref_gts": None,
211
+ "tgt_gts": None,
212
+ "src_gts_list": None,
213
+ "ploidy_config": self.ploidy_config,
214
+ }
215
+ else:
216
+ yield {
217
+ "chr_name": self.chr_name,
218
+ "ref_pop": ref_pop,
219
+ "tgt_pop": tgt_pop,
220
+ "src_pop_list": src_comb, # List of source populations in this combination
221
+ "start": start,
222
+ "end": end,
223
+ "pos": sub_pos,
224
+ "ref_gts": ref_gts,
225
+ "tgt_gts": tgt_gts,
226
+ "src_gts_list": src_gts_list, # List of genotypes for each source population in src_comb
227
+ "ploidy_config": self.ploidy_config,
228
+ }
184
229
 
185
230
  def _none_window_generator(self) -> Iterator[dict[str, Any]]:
186
231
  """
@@ -217,7 +262,7 @@ class WindowGenerator(DataGenerator):
217
262
  "ref_gts": None,
218
263
  "tgt_gts": None,
219
264
  "src_gts_list": None,
220
- "ploidy": None,
265
+ "ploidy_config": self.ploidy_config,
221
266
  }
222
267
 
223
268
  def get(self) -> Iterator[dict[str, Any]]:
@@ -25,8 +25,8 @@ from multiprocessing import current_process
25
25
  from multiprocessing import Manager
26
26
  from multiprocessing import Process
27
27
  from threading import Thread
28
- from sai.utils.generators import DataGenerator
29
- from sai.utils.preprocessors import DataPreprocessor
28
+ from sai.generators import DataGenerator
29
+ from sai.preprocessors import DataPreprocessor
30
30
 
31
31
 
32
32
  def monitor(shared_dict: dict, workers: list[multiprocessing.Process]) -> None:
@@ -20,8 +20,8 @@
20
20
 
21
21
  from multiprocessing import Pool
22
22
  from typing import Any
23
- from sai.utils.generators import DataGenerator
24
- from sai.utils.preprocessors import DataPreprocessor
23
+ from sai.generators import DataGenerator
24
+ from sai.preprocessors import DataPreprocessor
25
25
 
26
26
 
27
27
  def mp_worker(params: tuple[DataPreprocessor, dict]) -> Any:
@@ -37,7 +37,7 @@ def _run_outlier(args: argparse.Namespace) -> None:
37
37
  # Call the outlier function with parsed arguments
38
38
  outlier(
39
39
  score_file=args.score,
40
- output=args.output,
40
+ output_prefix=args.output_prefix,
41
41
  quantile=args.quantile,
42
42
  )
43
43
 
@@ -62,10 +62,11 @@ def add_outlier_parser(subparsers: argparse.ArgumentParser) -> None:
62
62
  help="Path to the input score file.",
63
63
  )
64
64
  parser.add_argument(
65
- "--output",
65
+ "--output-prefix",
66
+ dest="output_prefix",
66
67
  type=str,
67
68
  required=True,
68
- help="Path to save the output file.",
69
+ help="Prefix of the output files.",
69
70
  )
70
71
  parser.add_argument(
71
72
  "--quantile",
@@ -19,13 +19,9 @@
19
19
 
20
20
 
21
21
  import argparse
22
- import re
23
22
  from sai.parsers.argument_validation import positive_int
24
23
  from sai.parsers.argument_validation import existed_file
25
- from sai.parsers.argument_validation import between_zero_and_one
26
- from sai.parsers.argument_validation import validate_stat_type
27
24
  from sai.sai import score
28
- from sai.utils.utils import parse_ind_file
29
25
 
30
26
 
31
27
  def _run_score(args: argparse.Namespace) -> None:
@@ -42,108 +38,36 @@ def _run_score(args: argparse.Namespace) -> None:
42
38
  Path to the VCF file containing variant data.
43
39
  - chr_name : str
44
40
  Name of the chromosome to be analyzed.
45
- - ref : str
46
- Path to the reference group individual file.
47
- - tgt : str
48
- Path to the target group individual file.
49
- - src : str
50
- Path to the source population individual file.
51
41
  - win_len : int
52
42
  Length of each analysis window.
53
43
  - win_step : int
54
44
  Step size for moving the window along the sequence.
55
- - num_src : int
56
- Number of source populations. The length of `args.y` should match `num_src`.
57
45
  - anc_alleles : str
58
46
  Path to the ancestral allele file.
59
- - w : float
60
- Allele frequency threshold for the reference group.
61
- - y : list of float
62
- List of allele frequency thresholds for each source population. Its length must match `num_src`.
63
47
  - output : str
64
48
  Path to the output file for storing results.
65
- - stat_type: str
66
- Specifies the type of statistic to compute.
49
+ - stat_config: str
50
+ Path to the YAML configuration file specifying the statistics, ploidy levels, and populations to compute.
67
51
 
68
52
  Raises
69
53
  ------
70
54
  ValueError
71
- If the length of `args.y` does not match the expected number of source populations (`args.num_src`),
55
+ If fewer than three ploidy values are provided,
56
+ or if the number of ploidy values for source populations does not match `num_src`.
72
57
  or if other input parameters do not meet expected conditions.
73
58
  """
74
- src_samples = parse_ind_file(args.src)
75
- num_src = len(src_samples.keys())
76
- if len(args.y) != num_src:
77
- raise ValueError(
78
- f"The length of y ({len(args.y)}) does not match the number of source populations ({num_src}) found in {args.src}."
79
- )
80
-
81
59
  score(
82
60
  vcf_file=args.vcf,
83
61
  chr_name=args.chr_name,
84
- ref_ind_file=args.ref,
85
- tgt_ind_file=args.tgt,
86
- src_ind_file=args.src,
87
62
  win_len=args.win_len,
88
63
  win_step=args.win_step,
89
- num_src=num_src,
90
64
  anc_allele_file=args.anc_alleles,
91
- w=args.w,
92
- y=args.y,
93
65
  output_file=args.output,
94
- stat_type=args.stat,
66
+ config=args.config,
95
67
  num_workers=1,
96
68
  )
97
69
 
98
70
 
99
- def _parse_y_thresholds(value: str) -> tuple[str, float]:
100
- """
101
- Parses the --y parameter value to extract an operator and a numerical threshold.
102
-
103
- This function ensures that the input is correctly formatted as one of the following:
104
- - `=X` (equality condition)
105
- - `>X` (greater than condition)
106
- - `<X` (less than condition)
107
- - `>=X` (greater than or equal to condition)
108
- - `<=X` (less than or equal to condition)
109
-
110
- The numerical value `X` must be within the range [0, 1].
111
-
112
- Parameters
113
- ----------
114
- value : str
115
- A string representing the allele frequency threshold condition, e.g., "=0.7", ">0.8", "<=0.2".
116
-
117
- Returns
118
- -------
119
- tuple[str, float]
120
- A tuple containing:
121
- - A string representing the comparison operator (`=`, `<`, `>`, `<=`, `>=`).
122
- - A float representing the threshold value.
123
-
124
- Raises
125
- ------
126
- argparse.ArgumentTypeError
127
- If the input format is invalid or the numerical threshold is outside the range [0, 1].
128
- """
129
- match = re.match(r"^(=|<|>|<=|>=)(\d*\.?\d+)$", value)
130
- if not match:
131
- raise argparse.ArgumentTypeError(
132
- f"Invalid format for --y: {value}. Must be in the form =X, >X, <X, >=X, or <=X "
133
- f"(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2)."
134
- )
135
-
136
- operator, num_str = match.groups()
137
- num = float(num_str)
138
-
139
- if not (0 <= num <= 1):
140
- raise argparse.ArgumentTypeError(
141
- f"Value for --y must be between 0 and 1, got {num}."
142
- )
143
-
144
- return operator, num
145
-
146
-
147
71
  def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
148
72
  """
149
73
  Initializes and configures the command-line interface parser
@@ -170,24 +94,6 @@ def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
170
94
  required=True,
171
95
  help="Chromosome name to analyze from the VCF file.",
172
96
  )
173
- parser.add_argument(
174
- "--ref",
175
- type=existed_file,
176
- required=True,
177
- help="Path to the file with reference population identifiers.",
178
- )
179
- parser.add_argument(
180
- "--tgt",
181
- type=existed_file,
182
- required=True,
183
- help="Path to the file with target population identifiers.",
184
- )
185
- parser.add_argument(
186
- "--src",
187
- type=existed_file,
188
- required=True,
189
- help="Path to the file with source population identifiers.",
190
- )
191
97
  parser.add_argument(
192
98
  "--win-len",
193
99
  dest="win_len",
@@ -209,23 +115,6 @@ def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
209
115
  default=None,
210
116
  help="Path to the BED file with ancestral allele information. If ancestral allele information is not provided, filtering will be performed for each variant based on whether the allele frequency of any allele (assuming biallelic) meets the specified condition during the calculation of the statistics. Default: None.",
211
117
  )
212
- parser.add_argument(
213
- "--w",
214
- type=between_zero_and_one,
215
- default=0.01,
216
- help="Frequency threshold for variants in the reference population; only variants with frequencies below this threshold are included in the analysis. Default: 0.01.",
217
- )
218
- parser.add_argument(
219
- "--y",
220
- type=_parse_y_thresholds,
221
- nargs="+",
222
- default=[("=", 1.0)],
223
- help="List of allele frequency conditions for the source populations. "
224
- "Each value must be in the form =X, >X, <X, >=X, or <=X "
225
- "(e.g., =0.7, >0.8, <0.1, >=0.5, <=0.2). "
226
- "The number of values must match the number of source populations in the file specified by `--src`; "
227
- "the order of the allele frequency conditions should also correspond to the order of source populations in that file. Default: =1",
228
- )
229
118
  parser.add_argument(
230
119
  "--output",
231
120
  type=str,
@@ -233,9 +122,9 @@ def add_score_parser(subparsers: argparse.ArgumentParser) -> None:
233
122
  help="Output file path for saving results.",
234
123
  )
235
124
  parser.add_argument(
236
- "--stat",
237
- type=validate_stat_type,
125
+ "--config",
126
+ type=existed_file,
238
127
  required=True,
239
- help="Type of statistic to compute: UXX or QXX, where XX is a percentage-like index indicating a threshold in the target population. For example, `U50` means the allele frequency is greater than 0.5, and `Q95` means the allele frequency is greater than or equal to the 95th percentile among sites meeting the specified conditions.",
128
+ help="Path to the YAML configuration file specifying the statistics to compute, ploidy settings, and population group file paths.",
240
129
  )
241
130
  parser.set_defaults(runner=_run_score)
@@ -19,8 +19,10 @@
19
19
 
20
20
 
21
21
  from typing import Any
22
- from sai.utils.generators import WindowGenerator
23
- from sai.utils.preprocessors import DataPreprocessor
22
+ from sai.utils import parse_ind_file
23
+ from sai.generators import WindowGenerator
24
+ from sai.preprocessors import DataPreprocessor
25
+ from sai.configs import PloidyConfig, StatConfig
24
26
  from .feature_preprocessor import FeaturePreprocessor
25
27
 
26
28
 
@@ -39,12 +41,12 @@ class ChunkPreprocessor(DataPreprocessor):
39
41
  ref_ind_file: str,
40
42
  tgt_ind_file: str,
41
43
  src_ind_file: str,
44
+ out_ind_file: str,
42
45
  win_len: int,
43
46
  win_step: int,
44
- w: float,
45
- y: list[float],
46
47
  output_file: str,
47
- stat_type: str,
48
+ ploidy_config: PloidyConfig,
49
+ stat_config: StatConfig,
48
50
  anc_allele_file: str = None,
49
51
  num_src: int = 1,
50
52
  ):
@@ -61,18 +63,18 @@ class ChunkPreprocessor(DataPreprocessor):
61
63
  Path to the file containing target individual IDs.
62
64
  src_ind_file : str
63
65
  Path to the file containing source individual IDs.
66
+ out_ind_file : str
67
+ Path to the file containing outgroup individual IDs.
64
68
  win_len : int
65
69
  Window length for generating genomic windows.
66
70
  win_step : int
67
71
  Step size for sliding windows across the genome.
68
- w : float
69
- Parameter w for feature vector computation.
70
- y : list of float
71
- List of y parameters for feature vector computation.
72
72
  output_file : str
73
73
  Path to the output file for storing feature vectors.
74
- stat_type : str
75
- Type of statistic to compute for feature vectors.
74
+ ploidy_config : PloidyConfig
75
+ Configuration specifying ploidy levels for each population involved in the analysis.
76
+ stat_config : StatConfig
77
+ Configuration of statistics to compute for feature vectors.
76
78
  anc_allele_file : str, optional
77
79
  Path to the ancestral allele file. If None, ancestral allele
78
80
  information is considered unavailable.
@@ -83,18 +85,20 @@ class ChunkPreprocessor(DataPreprocessor):
83
85
  self.ref_ind_file = ref_ind_file
84
86
  self.tgt_ind_file = tgt_ind_file
85
87
  self.src_ind_file = src_ind_file
88
+ self.out_ind_file = out_ind_file
86
89
  self.win_len = win_len
87
90
  self.win_step = win_step
91
+ self.ploidy_config = ploidy_config
88
92
  self.anc_allele_file = anc_allele_file
89
- self.num_src = num_src
93
+
94
+ src_samples = parse_ind_file(src_ind_file)
95
+ self.num_src = len(src_samples.keys())
90
96
 
91
97
  anc_allele_available = anc_allele_file is not None
92
98
 
93
99
  self.feature_preprocessor = FeaturePreprocessor(
94
- w=w,
95
- y=y,
96
100
  output_file=output_file,
97
- stat_type=stat_type,
101
+ stat_config=stat_config,
98
102
  anc_allele_available=anc_allele_available,
99
103
  )
100
104
 
@@ -127,8 +131,10 @@ class ChunkPreprocessor(DataPreprocessor):
127
131
  ref_ind_file=self.ref_ind_file,
128
132
  tgt_ind_file=self.tgt_ind_file,
129
133
  src_ind_file=self.src_ind_file,
134
+ out_ind_file=self.out_ind_file,
130
135
  win_len=self.win_len,
131
136
  win_step=self.win_step,
137
+ ploidy_config=self.ploidy_config,
132
138
  anc_allele_file=self.anc_allele_file,
133
139
  num_src=self.num_src,
134
140
  )