pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pg-sui might be problematic. Click here for more details.

Files changed (112) hide show
  1. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
  2. pg_sui-1.6.8.dist-info/RECORD +78 -0
  3. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
  4. pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
  5. pg_sui-1.6.8.dist-info/top_level.txt +1 -0
  6. pgsui/__init__.py +35 -54
  7. pgsui/_version.py +34 -0
  8. pgsui/cli.py +635 -0
  9. pgsui/data_processing/config.py +576 -0
  10. pgsui/data_processing/containers.py +1782 -0
  11. pgsui/data_processing/transformers.py +121 -1103
  12. pgsui/electron/app/__main__.py +5 -0
  13. pgsui/electron/app/icons/icons/1024x1024.png +0 -0
  14. pgsui/electron/app/icons/icons/128x128.png +0 -0
  15. pgsui/electron/app/icons/icons/16x16.png +0 -0
  16. pgsui/electron/app/icons/icons/24x24.png +0 -0
  17. pgsui/electron/app/icons/icons/256x256.png +0 -0
  18. pgsui/electron/app/icons/icons/32x32.png +0 -0
  19. pgsui/electron/app/icons/icons/48x48.png +0 -0
  20. pgsui/electron/app/icons/icons/512x512.png +0 -0
  21. pgsui/electron/app/icons/icons/64x64.png +0 -0
  22. pgsui/electron/app/icons/icons/icon.icns +0 -0
  23. pgsui/electron/app/icons/icons/icon.ico +0 -0
  24. pgsui/electron/app/main.js +189 -0
  25. pgsui/electron/app/package-lock.json +6893 -0
  26. pgsui/electron/app/package.json +50 -0
  27. pgsui/electron/app/preload.js +15 -0
  28. pgsui/electron/app/server.py +146 -0
  29. pgsui/electron/app/ui/logo.png +0 -0
  30. pgsui/electron/app/ui/renderer.js +130 -0
  31. pgsui/electron/app/ui/styles.css +59 -0
  32. pgsui/electron/app/ui/ui_shim.js +72 -0
  33. pgsui/electron/bootstrap.py +43 -0
  34. pgsui/electron/launch.py +59 -0
  35. pgsui/electron/package.json +14 -0
  36. pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
  37. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
  38. pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
  39. pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
  40. pgsui/impute/deterministic/imputers/mode.py +679 -0
  41. pgsui/impute/deterministic/imputers/nmf.py +221 -0
  42. pgsui/impute/deterministic/imputers/phylo.py +971 -0
  43. pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
  44. pgsui/impute/supervised/base.py +339 -0
  45. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
  46. pgsui/impute/supervised/imputers/random_forest.py +287 -0
  47. pgsui/impute/unsupervised/base.py +924 -0
  48. pgsui/impute/unsupervised/callbacks.py +89 -263
  49. pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
  50. pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
  51. pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
  52. pgsui/impute/unsupervised/imputers/vae.py +957 -0
  53. pgsui/impute/unsupervised/loss_functions.py +158 -0
  54. pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
  55. pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
  56. pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
  57. pgsui/impute/unsupervised/models/vae_model.py +259 -618
  58. pgsui/impute/unsupervised/nn_scorers.py +215 -0
  59. pgsui/utils/classification_viz.py +591 -0
  60. pgsui/utils/misc.py +35 -480
  61. pgsui/utils/plotting.py +514 -824
  62. pgsui/utils/scorers.py +212 -438
  63. pg_sui-1.0.2.1.dist-info/RECORD +0 -75
  64. pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
  65. pgsui/example_data/phylip_files/test_n10.phy +0 -118
  66. pgsui/example_data/phylip_files/test_n100.phy +0 -118
  67. pgsui/example_data/phylip_files/test_n2.phy +0 -118
  68. pgsui/example_data/phylip_files/test_n500.phy +0 -118
  69. pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
  70. pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
  71. pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
  72. pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
  73. pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
  74. pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
  75. pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
  76. pgsui/example_data/trees/test.iqtree +0 -376
  77. pgsui/example_data/trees/test.qmat +0 -5
  78. pgsui/example_data/trees/test.rate +0 -2033
  79. pgsui/example_data/trees/test.tre +0 -1
  80. pgsui/example_data/trees/test_n10.rate +0 -19
  81. pgsui/example_data/trees/test_n100.rate +0 -109
  82. pgsui/example_data/trees/test_n500.rate +0 -509
  83. pgsui/example_data/trees/test_siterates.txt +0 -2024
  84. pgsui/example_data/trees/test_siterates_n10.txt +0 -10
  85. pgsui/example_data/trees/test_siterates_n100.txt +0 -100
  86. pgsui/example_data/trees/test_siterates_n500.txt +0 -500
  87. pgsui/example_data/vcf_files/test.vcf +0 -244
  88. pgsui/example_data/vcf_files/test.vcf.gz +0 -0
  89. pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
  90. pgsui/impute/estimators.py +0 -735
  91. pgsui/impute/impute.py +0 -1486
  92. pgsui/impute/simple_imputers.py +0 -1439
  93. pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
  94. pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
  95. pgsui/impute/unsupervised/keras_classifiers.py +0 -702
  96. pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
  97. pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
  98. pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
  99. pgsui/pg_sui.py +0 -261
  100. pgsui/utils/sequence_tools.py +0 -407
  101. simulation/sim_benchmarks.py +0 -333
  102. simulation/sim_treeparams.py +0 -475
  103. test/__init__.py +0 -0
  104. test/pg_sui_simtest.py +0 -215
  105. test/pg_sui_testing.py +0 -523
  106. test/test.py +0 -297
  107. test/test_pgsui.py +0 -374
  108. test/test_tkc.py +0 -214
  109. {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
  110. /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
  111. /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
  112. {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
pgsui/cli.py ADDED
@@ -0,0 +1,635 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """PG-SUI Imputation CLI
5
+
6
+ Argument-precedence model:
7
+ code defaults < preset (--preset) < YAML (--config) < explicit CLI flags < --set k=v
8
+
9
+ Notes
10
+ -----
11
+ - Preset is a CLI-only choice and will be respected unless overridden by YAML or CLI.
12
+ - YAML entries override preset (a 'preset' key in YAML is ignored with a warning).
13
+ - CLI flags only override when explicitly provided (argparse uses SUPPRESS); --verbose is the exception: it has no SUPPRESS default, so io.verbose is always applied.
14
+ - --set key=value has the highest precedence and applies dot-path overrides.
15
+
16
+ Examples
17
+ --------
18
+ python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix run1
19
+ python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix tuned --tune
20
+ python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix demo \
21
+ --models ImputeUBP ImputeVAE ImputeMostFrequent --seed deterministic --verbose
22
+ python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix subset \
23
+ --include-pops EA GU TT ON --device cpu
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import argparse
29
+ import ast
30
+ import logging
31
+ import sys
32
+ from pathlib import Path
33
+ from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple
34
+
35
+ from snpio import GenePopReader, PhylipReader, StructureReader, VCFReader
36
+
37
+ from pgsui import (
38
+ ImputeAutoencoder,
39
+ ImputeMostFrequent,
40
+ ImputeNLPCA,
41
+ ImputeRefAllele,
42
+ ImputeUBP,
43
+ ImputeVAE,
44
+ )
45
+ from pgsui.data_processing.config import (
46
+ apply_dot_overrides,
47
+ dataclass_to_yaml,
48
+ load_yaml_to_dataclass,
49
+ save_dataclass_yaml,
50
+ )
51
+ from pgsui import (
52
+ AutoencoderConfig,
53
+ MostFrequentConfig,
54
+ NLPCAConfig,
55
+ RefAlleleConfig,
56
+ UBPConfig,
57
+ VAEConfig,
58
+ )
59
+
60
+
61
+ # ----------------------------- CLI Utilities ----------------------------- #
62
def _configure_logging(verbose: bool, log_file: Optional[str] = None) -> None:
    """Set up the root logger for the CLI.

    Records are always sent to stdout. When ``log_file`` is given, they are
    additionally written to that file, which is opened in ``"w"`` mode (any
    previous content is overwritten).

    Args:
        verbose: Log at DEBUG level when True, otherwise at INFO.
        log_file: Optional path of a file that also receives the log output.
    """
    sinks: List[logging.Handler] = [logging.StreamHandler(sys.stdout)]
    if log_file:
        file_sink = logging.FileHandler(log_file, mode="w", encoding="utf-8")
        sinks.append(file_sink)

    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=sinks,
    )
78
+
79
+
80
def _parse_seed(seed_arg: str) -> Optional[int]:
    """Translate the ``--seed`` value into an integer seed or None.

    Args:
        seed_arg: ``"random"``, ``"deterministic"`` (both matched
            case-insensitively, ignoring surrounding whitespace) or an
            integer literal.

    Returns:
        None for ``"random"``, 42 for ``"deterministic"``, otherwise the
        parsed integer.

    Raises:
        argparse.ArgumentTypeError: If the value is neither keyword nor an
            integer.
    """
    keyword = seed_arg.strip().lower()
    named_seeds = {"random": None, "deterministic": 42}
    if keyword in named_seeds:
        return named_seeds[keyword]

    try:
        parsed = int(seed_arg)
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid --seed. Use 'random', 'deterministic', or an integer."
        ) from err
    return parsed
93
+
94
+
95
def _parse_models(models: Iterable[str]) -> Tuple[str, ...]:
    """Validate the requested model names, defaulting to every model.

    Names are compared exactly (case-sensitive); no normalization is done.

    Args:
        models: Model names from ``--models``. A falsy value (empty or None)
            selects all supported models.

    Returns:
        The selected model names as a tuple, in the order given. The default
        selection uses a fixed order so repeated runs process models in the
        same sequence.

    Raises:
        argparse.ArgumentTypeError: If any name is not a supported model.
    """
    # An ordered tuple rather than a set: iterating a set of strings is
    # subject to hash randomization, which made the default run order vary
    # from one process to the next.
    valid: Tuple[str, ...] = (
        "ImputeUBP",
        "ImputeVAE",
        "ImputeAutoencoder",
        "ImputeNLPCA",
        "ImputeMostFrequent",
        "ImputeRefAllele",
    )
    selected = tuple(models) if models else valid
    unknown = [m for m in selected if m not in valid]
    if unknown:
        raise argparse.ArgumentTypeError(
            f"Unknown model(s): {unknown}. Valid options: {sorted(valid)}"
        )
    return selected
112
+
113
+
114
def _parse_overrides(pairs: list[str]) -> dict:
    """Parse ``--set key=value`` items into a dict of typed values.

    Each value is interpreted with ``ast.literal_eval`` (so ``4`` becomes an
    int, ``True`` a bool, ``[1, 2]`` a list); anything that is not a valid
    Python literal is kept as the raw, whitespace-stripped string.

    Args:
        pairs: ``key=value`` strings; None or empty yields an empty dict.

    Returns:
        Mapping of stripped key to parsed value. Later duplicates of a key
        replace earlier ones.

    Raises:
        argparse.ArgumentTypeError: If an item has no ``=`` or an empty key.
    """
    out: dict = {}
    for kv in pairs or []:
        # Split on the first '=' only so values may themselves contain '='.
        key, sep, raw = kv.partition("=")
        key = key.strip()
        if not sep or not key:
            raise argparse.ArgumentTypeError(f"--set expects key=value, got '{kv}'")
        raw = raw.strip()
        try:
            out[key] = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # These are the errors literal_eval documents for malformed
            # input; treat the value as a plain string.
            out[key] = raw
    return out
127
+
128
+
129
def _args_to_cli_overrides(args: argparse.Namespace) -> dict:
    """Convert CLI flags present on ``args`` into config dot-path overrides.

    Only attributes that exist on the namespace are translated; options
    declared with ``argparse.SUPPRESS`` are absent unless the user passed
    them.

    Args:
        args: Parsed CLI namespace.

    Returns:
        Mapping of dot-path config keys (e.g. ``"train.batch_size"``) to
        values.

    Raises:
        argparse.ArgumentTypeError: If ``args.seed`` is present but invalid
            (raised by ``_parse_seed``).
    """
    overrides: dict = {}

    # IO / top-level controls
    prefix = getattr(args, "prefix", None)
    if prefix is not None:
        overrides["io.prefix"] = prefix
    else:
        # Fall back to the input file's stem. --input takes priority over the
        # legacy --vcf; either may be missing from the namespace, so never
        # read args.vcf directly.
        source = getattr(args, "input", None)
        if source is None:
            source = getattr(args, "vcf", None)
        if source is not None:
            overrides["io.prefix"] = str(Path(source).stem)

    if hasattr(args, "verbose"):
        overrides["io.verbose"] = bool(args.verbose)
    if hasattr(args, "n_jobs"):
        overrides["io.n_jobs"] = int(args.n_jobs)
    if hasattr(args, "seed"):
        overrides["io.seed"] = _parse_seed(args.seed)

    # Train
    if hasattr(args, "batch_size"):
        overrides["train.batch_size"] = int(args.batch_size)
    if hasattr(args, "device"):
        # The CLI spells the device "cuda"; the config value written is "gpu".
        overrides["train.device"] = "gpu" if args.device == "cuda" else args.device

    # Plot
    if hasattr(args, "plot_format"):
        overrides["plot.fmt"] = args.plot_format

    # Tuning
    if hasattr(args, "tune"):
        overrides["tune.enabled"] = bool(args.tune)
    if hasattr(args, "tune_n_trials"):
        overrides["tune.n_trials"] = int(args.tune_n_trials)

    return overrides
166
+
167
+
168
+ # ------------------------------ Core Runner ------------------------------ #
169
def build_genotype_data(
    input_path: str,
    fmt: Literal["vcf", "phylip", "structure", "genepop"],
    popmap_path: str | None,
    force_popmap: bool,
    verbose: bool,
    include_pops: List[str] | None,
    plot_format: Literal["pdf", "png", "jpg", "jpeg"],
):
    """Load genotype data with the snpio reader matching ``fmt``.

    Args:
        input_path: Path to the genotype file.
        fmt: Input format name; matched case-insensitively.
        popmap_path: Optional population map file.
        force_popmap: Forwarded to the reader as ``force_popmap``.
        verbose: Forwarded to the reader as ``verbose``.
        include_pops: Population IDs to keep; an empty list is passed on as
            None.
        plot_format: Forwarded to the reader as ``plot_format``.

    Returns:
        The reader instance constructed for the input file.

    Raises:
        ValueError: If ``fmt`` is not one of the supported formats.
    """
    logging.info(f"Loading {fmt.upper()} and popmap data...")
    fmt = fmt.lower()

    # "structure" is offered by the CLI's --format choices and StructureReader
    # is imported by this module, so it is dispatched here as well.
    # NOTE(review): assumes StructureReader accepts the same keyword
    # arguments as the other snpio readers — confirm against snpio.
    readers = {
        "vcf": VCFReader,
        "phylip": PhylipReader,
        "structure": StructureReader,
        "genepop": GenePopReader,
    }
    reader_cls = readers.get(fmt)
    if reader_cls is None:
        raise ValueError(f"Unsupported genotype data format: {fmt}")

    gd = reader_cls(
        filename=input_path,
        popmapfile=popmap_path,
        force_popmap=force_popmap,
        verbose=verbose,
        include_pops=include_pops if include_pops else None,
        # The reader gets its own prefix derived from the input file stem.
        prefix=f"snpio_{Path(input_path).stem}",
        plot_format=plot_format,
    )

    logging.info("Loaded genotype data.")
    return gd
203
+
204
+
205
def run_model_safely(
    model_name: str, builder, *, warn_only: bool = True
) -> Optional[Any]:
    """Build a model, fit it and return its imputed output, isolating errors.

    Args:
        model_name: Name used in log messages.
        builder: Zero-argument callable returning an object with ``fit()``
            and ``transform()`` methods.
        warn_only: When True, any exception is logged as a warning (with
            traceback) and None is returned; when False it is re-raised.

    Returns:
        Whatever ``model.transform()`` returns, or None if the run failed
        and ``warn_only`` is True. Callers must check for None before using
        the result.

    Raises:
        Exception: Any error from building, fitting or transforming, but
            only when ``warn_only`` is False.
    """
    logging.info(f"▶ Running {model_name} ...")
    try:
        model = builder()
        model.fit()
        X_imputed = model.transform()
    except Exception as e:
        if not warn_only:
            raise
        logging.warning(f"⚠ {model_name} failed: {e}", exc_info=True)
        return None

    logging.info(f"✓ {model_name} completed.")
    return X_imputed
219
+
220
+
221
+ # -------------------------- Model Registry ------------------------------- #
222
+ # Add config-driven models here by listing the class and its config dataclass.
223
# Maps each CLI model name to its imputer class ("cls") and the config
# dataclass ("config_cls") used to build that imputer's configuration.
MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {
    model_name: {"cls": imputer_cls, "config_cls": config_cls}
    for model_name, imputer_cls, config_cls in (
        ("ImputeUBP", ImputeUBP, UBPConfig),
        ("ImputeNLPCA", ImputeNLPCA, NLPCAConfig),
        ("ImputeAutoencoder", ImputeAutoencoder, AutoencoderConfig),
        ("ImputeVAE", ImputeVAE, VAEConfig),
        ("ImputeMostFrequent", ImputeMostFrequent, MostFrequentConfig),
        ("ImputeRefAllele", ImputeRefAllele, RefAlleleConfig),
    )
}
231
+
232
+
233
def _build_effective_config_for_model(
    model_name: str, args: argparse.Namespace
) -> Any | None:
    """Build the effective config object for a specific model (if it has one).

    Precedence (lowest → highest):
        defaults < preset (--preset) < YAML (--config) < explicit CLI flags < --set

    Args:
        model_name: Key into ``MODEL_REGISTRY``.
        args: Parsed CLI namespace.

    Returns:
        Config dataclass instance, or None when the registry entry has no
        ``config_cls``.

    Raises:
        KeyError: If ``model_name`` is not in ``MODEL_REGISTRY``.
        Exception: Whatever ``apply_dot_overrides`` raises for a bad ``--set``
            override on one of the neural-network models.
    """
    # Models for which a failing --set override is a hard error.
    strict_set_models = {
        "ImputeUBP",
        "ImputeNLPCA",
        "ImputeAutoencoder",
        "ImputeVAE",
    }

    cfg_cls = MODEL_REGISTRY[model_name].get("config_cls")
    if cfg_cls is None:
        return None

    # 0/1) Start from the preset if one was given, else from pure dataclass
    # defaults. (No throwaway default instance is built when a preset is used.)
    if hasattr(args, "preset"):
        preset_name = args.preset
        cfg = cfg_cls.from_preset(preset_name)
        logging.info(f"Initialized {model_name} from '{preset_name}' preset.")
    else:
        cfg = cfg_cls()
        logging.info(f"Initialized {model_name} from dataclass defaults (no preset).")

    # 2) YAML overlays preset/defaults. The loader is asked to ignore any
    # 'preset' key found in the YAML.
    yaml_path = getattr(args, "config", None)
    if yaml_path:
        cfg = load_yaml_to_dataclass(
            yaml_path,
            cfg_cls,
            base=cfg,
            yaml_preset_behavior="ignore",
        )
        logging.info(
            f"Loaded YAML config for {model_name} from {yaml_path} (ignored 'preset' in YAML if present)."
        )

    # 3) Explicit CLI flags overlay YAML.
    cli_overrides = _args_to_cli_overrides(args)
    if cli_overrides:
        cfg = apply_dot_overrides(cfg, cli_overrides)

    # 4) --set has highest precedence.
    user_overrides = _parse_overrides(getattr(args, "set", []))
    if user_overrides:
        try:
            cfg = apply_dot_overrides(cfg, user_overrides)
        except Exception as e:
            if model_name in strict_set_models:
                logging.error(
                    f"Error applying --set overrides to {model_name} config: {e}"
                )
                raise
            # Other models keep their pre---set config, but the user is told
            # rather than the failure being dropped silently.
            logging.warning(
                f"Ignoring --set overrides for {model_name} (could not be applied): {e}"
            )

    return cfg
301
+
302
+
303
def _config_dump_path(dump_base: str, model_name: str) -> str:
    """Return the per-model dump path: model name inserted before the extension."""
    base = Path(dump_base)
    if not base.name:
        # Nothing to splice into (e.g. "."); fall back to plain suffixing.
        return f"{dump_base}.{model_name}.yaml"
    # Only the final path component is inspected, so dots in directory names
    # (e.g. "out.d/config" or "./config") are never mistaken for an extension.
    if base.suffix:
        file_name = f"{base.stem}.{model_name}{base.suffix}"
    else:
        file_name = f"{base.name}.{model_name}.yaml"
    return str(base.with_name(file_name))


def _maybe_print_or_dump_configs(
    cfgs_by_model: Dict[str, Any], args: argparse.Namespace
) -> bool:
    """Handle --print-config / --dump-config for all selected models.

    Models whose config is None are skipped. Every dumped file gets the model
    name inserted before its extension (``cfg.yaml`` → ``cfg.ImputeUBP.yaml``),
    or ``.<model>.yaml`` appended when the path has no extension.

    Args:
        cfgs_by_model: Mapping of model name to its effective config (or None).
        args: Parsed CLI namespace; ``print_config`` and ``dump_config`` are
            read if present.

    Returns:
        True if printing or dumping was requested and the caller should exit;
        else False.
    """
    did_io = False

    if getattr(args, "print_config", False):
        for m, cfg in cfgs_by_model.items():
            if cfg is None:
                continue
            print(f"# --- {m} effective config ---")
            print(dataclass_to_yaml(cfg))
            print()
        did_io = True

    dump_base = getattr(args, "dump_config", None)
    if dump_base:
        for m, cfg in cfgs_by_model.items():
            if cfg is None:
                continue
            path = _config_dump_path(dump_base, m)
            save_dataclass_yaml(cfg, path)
            logging.info(f"Saved {m} config to {path}")
        did_io = True

    return did_io
337
+
338
+
339
def main(argv: Optional[List[str]] = None) -> int:
    """Run the PG-SUI imputation CLI.

    Parses arguments, loads the genotype data, builds one effective config
    per selected model, optionally prints/dumps those configs, then runs each
    model and writes its imputed data as a VCF under
    ``<prefix>_output/<family>/imputed/<model>/``.

    Args:
        argv: Argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        0 on success (including --dry-run and --print-config/--dump-config),
        1 if at least one model failed to produce imputed data.

    Raises:
        SystemExit: Raised by argparse (status 2) on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        prog="pgsui-cli",
        description="Run PG-SUI imputation models on a VCF with minimal fuss.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # ----------------------------- Required I/O ----------------------------- #
    parser.add_argument(
        "--input",
        default=argparse.SUPPRESS,
        help="Path to input file (VCF/PHYLIP/STRUCTURE/GENEPOP).",
    )
    parser.add_argument(
        "--format",
        choices=("vcf", "phylip", "structure", "genepop"),
        default=argparse.SUPPRESS,
        help="Input format; defaults to 'vcf' when --vcf is used.",
    )
    # Back-compat: --vcf retained; if both provided, --input wins.
    parser.add_argument(
        "--vcf", default=argparse.SUPPRESS, help="Path to input VCF(.gz) file."
    )
    parser.add_argument(
        "--popmap", default=argparse.SUPPRESS, help="Path to population map file."
    )
    parser.add_argument(
        "--prefix",
        default=argparse.SUPPRESS,
        help="Run/output prefix; overrides config if provided.",
    )

    # ---------------------- Generic Config Inputs -------------------------- #
    parser.add_argument(
        "--config",
        default=argparse.SUPPRESS,
        help="YAML config for config-driven models (NLPCA/UBP/Autoencoder/VAE).",
    )
    parser.add_argument(
        "--preset",
        choices=("fast", "balanced", "thorough"),
        default=argparse.SUPPRESS,  # no default; optional
        help="If provided, initialize config(s) from this preset; otherwise start from dataclass defaults.",
    )
    parser.add_argument(
        "--set",
        action="append",
        default=argparse.SUPPRESS,
        help="Dot-key overrides, e.g. --set model.latent_dim=4",
    )
    parser.add_argument(
        "--print-config",
        action="store_true",
        help="Print effective config(s) and exit.",
    )
    parser.add_argument(
        "--dump-config",
        default=argparse.SUPPRESS,
        help="Write effective config(s) YAML to this path (multi-model gets suffixed).",
    )

    # ------------------------------ Toggles -------------------------------- #
    parser.add_argument(
        "--tune",
        action="store_true",
        default=argparse.SUPPRESS,
        help="Enable hyperparameter tuning (if supported).",
    )
    parser.add_argument(
        "--tune-n-trials",
        type=int,
        default=argparse.SUPPRESS,
        help="Optuna trials when --tune is set.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=argparse.SUPPRESS,
        help="Batch size for NN-based models.",
    )
    parser.add_argument(
        "--device",
        choices=("cpu", "cuda", "mps"),
        default=argparse.SUPPRESS,
        help="Compute device for NN-based models.",
    )
    parser.add_argument(
        "--n-jobs",
        type=int,
        default=argparse.SUPPRESS,
        help="Parallel workers for various steps.",
    )
    parser.add_argument(
        "--plot-format",
        choices=("png", "pdf", "svg"),
        default=argparse.SUPPRESS,
        help="Figure format for model plots.",
    )

    # --------------------------- Seed & logging ---------------------------- #
    parser.add_argument(
        "--seed",
        default=argparse.SUPPRESS,
        help="Random seed: 'random', 'deterministic', or an integer.",
    )
    parser.add_argument("--verbose", action="store_true", help="Debug-level logging.")
    parser.add_argument(
        "--log-file", default=argparse.SUPPRESS, help="Also write logs to a file."
    )

    # ---------------------------- Data filtering --------------------------- #
    parser.add_argument(
        "--include-pops",
        nargs="+",
        default=argparse.SUPPRESS,
        help="Optional list of population IDs to include.",
    )
    parser.add_argument(
        "--force-popmap",
        action="store_true",
        default=False,
        help="Require popmap (error if absent).",
    )

    # ---------------------------- Model selection -------------------------- #
    parser.add_argument(
        "--models",
        nargs="+",
        default=argparse.SUPPRESS,
        help=(
            "Which models to run. Choices: "
            "ImputeUBP ImputeVAE ImputeAutoencoder ImputeNLPCA "
            "ImputeMostFrequent ImputeRefAllele. Default is all."
        ),
    )

    # ------------------------------ Safety/UX ------------------------------ #
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse args and load data, but skip model training.",
    )

    args = parser.parse_args(argv)

    # Logging (verbose default is False unless passed)
    _configure_logging(
        verbose=getattr(args, "verbose", False),
        log_file=getattr(args, "log_file", None),
    )

    # Models selection (default to all if not explicitly provided)
    try:
        selected_models = _parse_models(getattr(args, "models", ()))
    except argparse.ArgumentTypeError as e:
        parser.error(str(e))  # exits with status 2
        return 2

    # Input resolution: --input wins over the legacy --vcf.
    input_path = getattr(args, "input", None)
    if input_path is None and hasattr(args, "vcf"):
        input_path = args.vcf
        if not hasattr(args, "format"):
            setattr(args, "format", "vcf")

    if input_path is None:
        parser.error("You must provide --input (or legacy --vcf).")
        return 2

    fmt = getattr(args, "format", "vcf").lower()
    popmap_path = getattr(args, "popmap", None)
    include_pops = getattr(args, "include_pops", None)
    verbose_flag = getattr(args, "verbose", False)
    force_popmap = bool(getattr(args, "force_popmap", False))

    # Load genotype data
    gd = build_genotype_data(
        input_path=input_path,
        fmt=fmt,
        popmap_path=popmap_path,
        force_popmap=force_popmap,
        verbose=verbose_flag,
        include_pops=include_pops,
        plot_format=getattr(args, "plot_format", "pdf"),
    )

    if getattr(args, "dry_run", False):
        logging.info("Dry run complete. Exiting without training models.")
        return 0

    # ---------------- Build config(s) per selected model ------------------- #
    # An invalid --seed or malformed --set raises ArgumentTypeError from the
    # helpers; report it as a normal usage error instead of a traceback.
    try:
        cfgs_by_model: Dict[str, Any] = {
            m: _build_effective_config_for_model(m, args) for m in selected_models
        }
    except argparse.ArgumentTypeError as e:
        parser.error(str(e))  # exits with status 2
        return 2

    # Maybe print/dump configs and exit
    if _maybe_print_or_dump_configs(cfgs_by_model, args):
        return 0

    # ------------------------- Model Builders ------------------------------ #
    def _config_for(name: str, cfg_cls):
        """Return the prepared config for ``name``, or a fresh one if missing."""
        cfg = cfgs_by_model.get(name)
        if cfg is None:
            cfg = (
                cfg_cls.from_preset(args.preset)
                if hasattr(args, "preset")
                else cfg_cls()
            )
        return cfg

    # Builders are zero-argument callables so construction errors are caught
    # by run_model_safely. Argument passing style matches each constructor's
    # original call (keyword for NN models, positional for deterministic ones).
    model_builders = {
        "ImputeUBP": lambda: ImputeUBP(
            genotype_data=gd, config=_config_for("ImputeUBP", UBPConfig)
        ),
        "ImputeVAE": lambda: ImputeVAE(
            genotype_data=gd, config=_config_for("ImputeVAE", VAEConfig)
        ),
        "ImputeAutoencoder": lambda: ImputeAutoencoder(
            genotype_data=gd,
            config=_config_for("ImputeAutoencoder", AutoencoderConfig),
        ),
        "ImputeNLPCA": lambda: ImputeNLPCA(
            genotype_data=gd, config=_config_for("ImputeNLPCA", NLPCAConfig)
        ),
        "ImputeMostFrequent": lambda: ImputeMostFrequent(
            gd, config=_config_for("ImputeMostFrequent", MostFrequentConfig)
        ),
        "ImputeRefAllele": lambda: ImputeRefAllele(
            gd, config=_config_for("ImputeRefAllele", RefAlleleConfig)
        ),
    }

    # The output prefix does not depend on the model; compute it once.
    prefix = getattr(args, "prefix", str(Path(input_path).stem))
    failed_models: List[str] = []

    logging.info(f"Selected models: {', '.join(selected_models)}")
    for name in selected_models:
        X_imputed = run_model_safely(name, model_builders[name], warn_only=True)
        if X_imputed is None:
            # The model failed (already logged); do not write a VCF built
            # from missing data, and keep going with the remaining models.
            logging.warning(f"Skipping output for {name}: no imputed data produced.")
            failed_models.append(name)
            continue

        gd_imp = gd.copy()
        gd_imp.snp_data = X_imputed

        if name in {"ImputeUBP", "ImputeVAE", "ImputeAutoencoder", "ImputeNLPCA"}:
            family = "Unsupervised"
        elif name in {"ImputeMostFrequent", "ImputeRefAllele"}:
            family = "Deterministic"
        elif name in {"ImputeHistGradientBoosting", "ImputeRandomForest"}:
            family = "Supervised"
        else:
            raise ValueError(f"Unknown model family for {name}")

        pth = Path(f"{prefix}_output/{family}/imputed/{name}")
        pth.mkdir(parents=True, exist_ok=True)

        logging.info(f"Writing imputed VCF for {name} to {pth} ...")
        gd_imp.write_vcf(pth / f"{name.lower()}_imputed.vcf.gz")

    logging.info("All requested models processed.")
    if failed_models:
        logging.error(f"Models that failed: {', '.join(failed_models)}")
        return 1
    return 0
632
+
633
+
634
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())