pg-sui 1.0.2.1__py3-none-any.whl → 1.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pg-sui might be problematic. Click here for more details.
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/METADATA +51 -70
- pg_sui-1.6.8.dist-info/RECORD +78 -0
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info}/WHEEL +1 -1
- pg_sui-1.6.8.dist-info/entry_points.txt +4 -0
- pg_sui-1.6.8.dist-info/top_level.txt +1 -0
- pgsui/__init__.py +35 -54
- pgsui/_version.py +34 -0
- pgsui/cli.py +635 -0
- pgsui/data_processing/config.py +576 -0
- pgsui/data_processing/containers.py +1782 -0
- pgsui/data_processing/transformers.py +121 -1103
- pgsui/electron/app/__main__.py +5 -0
- pgsui/electron/app/icons/icons/1024x1024.png +0 -0
- pgsui/electron/app/icons/icons/128x128.png +0 -0
- pgsui/electron/app/icons/icons/16x16.png +0 -0
- pgsui/electron/app/icons/icons/24x24.png +0 -0
- pgsui/electron/app/icons/icons/256x256.png +0 -0
- pgsui/electron/app/icons/icons/32x32.png +0 -0
- pgsui/electron/app/icons/icons/48x48.png +0 -0
- pgsui/electron/app/icons/icons/512x512.png +0 -0
- pgsui/electron/app/icons/icons/64x64.png +0 -0
- pgsui/electron/app/icons/icons/icon.icns +0 -0
- pgsui/electron/app/icons/icons/icon.ico +0 -0
- pgsui/electron/app/main.js +189 -0
- pgsui/electron/app/package-lock.json +6893 -0
- pgsui/electron/app/package.json +50 -0
- pgsui/electron/app/preload.js +15 -0
- pgsui/electron/app/server.py +146 -0
- pgsui/electron/app/ui/logo.png +0 -0
- pgsui/electron/app/ui/renderer.js +130 -0
- pgsui/electron/app/ui/styles.css +59 -0
- pgsui/electron/app/ui/ui_shim.js +72 -0
- pgsui/electron/bootstrap.py +43 -0
- pgsui/electron/launch.py +59 -0
- pgsui/electron/package.json +14 -0
- pgsui/example_data/popmaps/{test.popmap → phylogen_nomx.popmap} +185 -99
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz +0 -0
- pgsui/example_data/vcf_files/phylogen_subset14K.vcf.gz.tbi +0 -0
- pgsui/impute/deterministic/imputers/allele_freq.py +691 -0
- pgsui/impute/deterministic/imputers/mode.py +679 -0
- pgsui/impute/deterministic/imputers/nmf.py +221 -0
- pgsui/impute/deterministic/imputers/phylo.py +971 -0
- pgsui/impute/deterministic/imputers/ref_allele.py +530 -0
- pgsui/impute/supervised/base.py +339 -0
- pgsui/impute/supervised/imputers/hist_gradient_boosting.py +293 -0
- pgsui/impute/supervised/imputers/random_forest.py +287 -0
- pgsui/impute/unsupervised/base.py +924 -0
- pgsui/impute/unsupervised/callbacks.py +89 -263
- pgsui/impute/unsupervised/imputers/autoencoder.py +972 -0
- pgsui/impute/unsupervised/imputers/nlpca.py +1264 -0
- pgsui/impute/unsupervised/imputers/ubp.py +1288 -0
- pgsui/impute/unsupervised/imputers/vae.py +957 -0
- pgsui/impute/unsupervised/loss_functions.py +158 -0
- pgsui/impute/unsupervised/models/autoencoder_model.py +208 -558
- pgsui/impute/unsupervised/models/nlpca_model.py +149 -468
- pgsui/impute/unsupervised/models/ubp_model.py +198 -1317
- pgsui/impute/unsupervised/models/vae_model.py +259 -618
- pgsui/impute/unsupervised/nn_scorers.py +215 -0
- pgsui/utils/classification_viz.py +591 -0
- pgsui/utils/misc.py +35 -480
- pgsui/utils/plotting.py +514 -824
- pgsui/utils/scorers.py +212 -438
- pg_sui-1.0.2.1.dist-info/RECORD +0 -75
- pg_sui-1.0.2.1.dist-info/top_level.txt +0 -3
- pgsui/example_data/phylip_files/test_n10.phy +0 -118
- pgsui/example_data/phylip_files/test_n100.phy +0 -118
- pgsui/example_data/phylip_files/test_n2.phy +0 -118
- pgsui/example_data/phylip_files/test_n500.phy +0 -118
- pgsui/example_data/structure_files/test.nopops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.nopops.2row.100sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.10sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.30sites.str +0 -234
- pgsui/example_data/structure_files/test.nopops.2row.allsites.str +0 -234
- pgsui/example_data/structure_files/test.pops.1row.10sites.str +0 -117
- pgsui/example_data/structure_files/test.pops.2row.10sites.str +0 -234
- pgsui/example_data/trees/test.iqtree +0 -376
- pgsui/example_data/trees/test.qmat +0 -5
- pgsui/example_data/trees/test.rate +0 -2033
- pgsui/example_data/trees/test.tre +0 -1
- pgsui/example_data/trees/test_n10.rate +0 -19
- pgsui/example_data/trees/test_n100.rate +0 -109
- pgsui/example_data/trees/test_n500.rate +0 -509
- pgsui/example_data/trees/test_siterates.txt +0 -2024
- pgsui/example_data/trees/test_siterates_n10.txt +0 -10
- pgsui/example_data/trees/test_siterates_n100.txt +0 -100
- pgsui/example_data/trees/test_siterates_n500.txt +0 -500
- pgsui/example_data/vcf_files/test.vcf +0 -244
- pgsui/example_data/vcf_files/test.vcf.gz +0 -0
- pgsui/example_data/vcf_files/test.vcf.gz.tbi +0 -0
- pgsui/impute/estimators.py +0 -735
- pgsui/impute/impute.py +0 -1486
- pgsui/impute/simple_imputers.py +0 -1439
- pgsui/impute/supervised/iterative_imputer_fixedparams.py +0 -785
- pgsui/impute/supervised/iterative_imputer_gridsearch.py +0 -1027
- pgsui/impute/unsupervised/keras_classifiers.py +0 -702
- pgsui/impute/unsupervised/models/in_development/cnn_model.py +0 -486
- pgsui/impute/unsupervised/neural_network_imputers.py +0 -1424
- pgsui/impute/unsupervised/neural_network_methods.py +0 -1549
- pgsui/pg_sui.py +0 -261
- pgsui/utils/sequence_tools.py +0 -407
- simulation/sim_benchmarks.py +0 -333
- simulation/sim_treeparams.py +0 -475
- test/__init__.py +0 -0
- test/pg_sui_simtest.py +0 -215
- test/pg_sui_testing.py +0 -523
- test/test.py +0 -297
- test/test_pgsui.py +0 -374
- test/test_tkc.py +0 -214
- {pg_sui-1.0.2.1.dist-info → pg_sui-1.6.8.dist-info/licenses}/LICENSE +0 -0
- /pgsui/{example_data/trees → electron/app}/__init__.py +0 -0
- /pgsui/impute/{unsupervised/models/in_development → supervised/imputers}/__init__.py +0 -0
- {simulation → pgsui/impute/unsupervised/imputers}/__init__.py +0 -0
pgsui/cli.py
ADDED
|
@@ -0,0 +1,635 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
|
|
4
|
+
"""PG-SUI Imputation CLI
|
|
5
|
+
|
|
6
|
+
Argument-precedence model:
|
|
7
|
+
code defaults < preset (--preset) < YAML (--config) < explicit CLI flags < --set k=v
|
|
8
|
+
|
|
9
|
+
Notes
|
|
10
|
+
-----
|
|
11
|
+
- Preset is a CLI-only choice and will be respected unless overridden by YAML or CLI.
|
|
12
|
+
- YAML entries override preset (a 'preset' key in YAML is ignored with a warning).
|
|
13
|
+
- CLI flags only override when explicitly provided (argparse uses SUPPRESS).
|
|
14
|
+
- --set key=value has the highest precedence and applies dot-path overrides.
|
|
15
|
+
|
|
16
|
+
Examples
|
|
17
|
+
--------
|
|
18
|
+
python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix run1
|
|
19
|
+
python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix tuned --tune
|
|
20
|
+
python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix demo \
|
|
21
|
+
--models ImputeUBP ImputeVAE ImputeMostFrequent --seed deterministic --verbose
|
|
22
|
+
python cli.py --vcf data.vcf.gz --popmap pops.popmap --prefix subset \
|
|
23
|
+
--include-pops EA GU TT ON --device cpu
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import argparse
|
|
29
|
+
import ast
|
|
30
|
+
import logging
|
|
31
|
+
import sys
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple
|
|
34
|
+
|
|
35
|
+
from snpio import GenePopReader, PhylipReader, StructureReader, VCFReader
|
|
36
|
+
|
|
37
|
+
from pgsui import (
|
|
38
|
+
ImputeAutoencoder,
|
|
39
|
+
ImputeMostFrequent,
|
|
40
|
+
ImputeNLPCA,
|
|
41
|
+
ImputeRefAllele,
|
|
42
|
+
ImputeUBP,
|
|
43
|
+
ImputeVAE,
|
|
44
|
+
)
|
|
45
|
+
from pgsui.data_processing.config import (
|
|
46
|
+
apply_dot_overrides,
|
|
47
|
+
dataclass_to_yaml,
|
|
48
|
+
load_yaml_to_dataclass,
|
|
49
|
+
save_dataclass_yaml,
|
|
50
|
+
)
|
|
51
|
+
from pgsui import (
|
|
52
|
+
AutoencoderConfig,
|
|
53
|
+
MostFrequentConfig,
|
|
54
|
+
NLPCAConfig,
|
|
55
|
+
RefAlleleConfig,
|
|
56
|
+
UBPConfig,
|
|
57
|
+
VAEConfig,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ----------------------------- CLI Utilities ----------------------------- #
|
|
62
|
+
def _configure_logging(verbose: bool, log_file: Optional[str] = None) -> None:
    """Set up the root logger for a CLI run.

    Args:
        verbose: Log at DEBUG level when True, otherwise at INFO level.
        log_file: If given, records are additionally written to this file,
            which is opened in write mode with UTF-8 encoding.
    """
    # Records always go to stdout; the file sink is optional.
    sinks: List[logging.Handler] = [logging.StreamHandler(sys.stdout)]
    if log_file:
        sinks.append(logging.FileHandler(log_file, mode="w", encoding="utf-8"))

    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=sinks,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _parse_seed(seed_arg: str) -> Optional[int]:
    """Translate the ``--seed`` argument into a seed value.

    Args:
        seed_arg: ``'random'``, ``'deterministic'`` (matched case-insensitively,
            ignoring surrounding whitespace) or the text of an integer.

    Returns:
        None for ``'random'``, 42 for ``'deterministic'``, otherwise the
        integer parsed from ``seed_arg``.

    Raises:
        argparse.ArgumentTypeError: If ``seed_arg`` is neither a keyword nor
            an integer.
    """
    named_seeds = {"random": None, "deterministic": 42}
    keyword = seed_arg.strip().lower()
    if keyword in named_seeds:
        return named_seeds[keyword]

    try:
        return int(seed_arg)
    except ValueError as e:
        raise argparse.ArgumentTypeError(
            "Invalid --seed. Use 'random', 'deterministic', or an integer."
        ) from e
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _parse_models(models: Iterable[str]) -> Tuple[str, ...]:
    """Validate the requested model names, defaulting to every known model.

    Args:
        models: Model names requested by the user. ``None`` or any empty
            iterable selects all models.

    Returns:
        The requested names in the order given, or all valid names in a
        fixed order when nothing was requested.

    Raises:
        argparse.ArgumentTypeError: If any requested name is not a known model.
    """
    # A tuple (not a set) keeps the default run order stable across runs;
    # iterating a set of strings varies with hash randomisation.
    valid = (
        "ImputeUBP",
        "ImputeVAE",
        "ImputeAutoencoder",
        "ImputeNLPCA",
        "ImputeMostFrequent",
        "ImputeRefAllele",
    )

    # Materialise first so that empty iterators/generators are detected too.
    selected = tuple(models) if models is not None else ()
    if not selected:
        return valid

    unknown = [m for m in selected if m not in valid]
    if unknown:
        raise argparse.ArgumentTypeError(
            f"Unknown model(s): {unknown}. Valid options: {sorted(valid)}"
        )
    return selected
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _parse_overrides(pairs: list[str]) -> dict:
    """Parse ``--set key=value`` items into a dict of typed overrides.

    Each value is evaluated with ``ast.literal_eval``; text that is not a
    Python literal (for example a bare word) is kept as a raw string.

    Args:
        pairs: ``key=value`` strings; ``None`` or an empty list yields ``{}``.

    Returns:
        Mapping from key (whitespace-trimmed) to the parsed value. Later
        duplicates of a key overwrite earlier ones.

    Raises:
        argparse.ArgumentTypeError: If an item has no ``=`` or an empty key.
    """
    out: dict = {}
    for kv in pairs or []:
        # Split on the first '=' only, so values may themselves contain '='.
        key, sep, raw = kv.partition("=")
        key = key.strip()
        if not sep or not key:
            raise argparse.ArgumentTypeError(f"--set expects key=value, got '{kv}'")

        raw = raw.strip()
        try:
            out[key] = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: fall back to the raw string.
            out[key] = raw
    return out
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _args_to_cli_overrides(args: argparse.Namespace) -> dict:
    """Convert CLI flags present on ``args`` into config dot-path overrides.

    Only attributes that exist on the namespace are translated, so options
    declared with ``argparse.SUPPRESS`` contribute nothing unless the user
    passed them.

    Args:
        args: Parsed CLI namespace.

    Returns:
        Mapping of dot-path config keys (e.g. ``"train.batch_size"``) to values.
    """
    overrides: dict = {}

    # IO / top-level controls.
    # Without an explicit --prefix, fall back to the stem of the input file.
    # --input takes priority over the legacy --vcf flag; either may be absent
    # from the namespace, so neither is accessed unconditionally.
    prefix = getattr(args, "prefix", None)
    if prefix is None:
        source = getattr(args, "input", None)
        if source is None:
            source = getattr(args, "vcf", None)
        if source is not None:
            prefix = str(Path(source).stem)
    if prefix is not None:
        overrides["io.prefix"] = prefix

    if hasattr(args, "verbose"):
        overrides["io.verbose"] = bool(args.verbose)
    if hasattr(args, "n_jobs"):
        overrides["io.n_jobs"] = int(args.n_jobs)
    if hasattr(args, "seed"):
        overrides["io.seed"] = _parse_seed(args.seed)

    # Train
    if hasattr(args, "batch_size"):
        overrides["train.batch_size"] = int(args.batch_size)
    if hasattr(args, "device"):
        # The CLI spells the accelerator "cuda"; the config value is "gpu".
        overrides["train.device"] = "gpu" if args.device == "cuda" else args.device

    # Plot
    if hasattr(args, "plot_format"):
        overrides["plot.fmt"] = args.plot_format

    # Tuning
    if hasattr(args, "tune"):
        overrides["tune.enabled"] = bool(args.tune)
    if hasattr(args, "tune_n_trials"):
        overrides["tune.n_trials"] = int(args.tune_n_trials)

    return overrides
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# ------------------------------ Core Runner ------------------------------ #
|
|
169
|
+
def build_genotype_data(
    input_path: str,
    fmt: Literal["vcf", "phylip", "structure", "genepop"],
    popmap_path: str | None,
    force_popmap: bool,
    verbose: bool,
    include_pops: List[str] | None,
    plot_format: Literal["pdf", "png", "jpg", "jpeg"],
):
    """Load genotype data with the SNPio reader matching ``fmt``.

    Args:
        input_path: Path to the genotype file.
        fmt: Input format name (case-insensitive).
        popmap_path: Path to the population map, or None.
        force_popmap: Forwarded to the reader as ``force_popmap``.
        verbose: Forwarded to the reader as ``verbose``.
        include_pops: Population IDs to keep; an empty list is passed on
            as None.
        plot_format: Forwarded to the reader as ``plot_format``.

    Returns:
        The reader instance created for the input file.

    Raises:
        ValueError: If ``fmt`` is not a supported format.
    """
    logging.info(f"Loading {fmt.upper()} and popmap data...")
    fmt = fmt.lower()

    # "structure" is offered by the CLI's --format choices, so it needs a
    # reader here as well.
    # NOTE(review): assumes StructureReader accepts the same keyword
    # arguments as the other readers - confirm against the SNPio API.
    readers = {
        "vcf": VCFReader,
        "phylip": PhylipReader,
        "structure": StructureReader,
        "genepop": GenePopReader,
    }
    reader_cls = readers.get(fmt)
    if reader_cls is None:
        raise ValueError(f"Unsupported genotype data format: {fmt}")

    gd = reader_cls(
        filename=input_path,
        popmapfile=popmap_path,
        force_popmap=force_popmap,
        verbose=verbose,
        include_pops=include_pops if include_pops else None,
        prefix=f"snpio_{Path(input_path).stem}",
        plot_format=plot_format,
    )

    logging.info("Loaded genotype data.")
    return gd
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def run_model_safely(
    model_name: str, builder, *, warn_only: bool = True
) -> Optional[Any]:
    """Build a model, then run ``fit()`` and ``transform()`` with error isolation.

    Args:
        model_name: Name used in log messages.
        builder: Zero-argument callable returning an object that exposes
            ``fit()`` and ``transform()``.
        warn_only: If True, a failure is logged as a warning (with traceback)
            and None is returned; if False, the exception propagates.

    Returns:
        Whatever ``transform()`` returns, or None when the model failed and
        ``warn_only`` is True. Callers must check for None before using it.

    Raises:
        Exception: Any error from building, fitting or transforming, when
            ``warn_only`` is False.
    """
    logging.info(f"▶ Running {model_name} ...")
    try:
        model = builder()
        model.fit()
        X_imputed = model.transform()
    except Exception as e:
        if not warn_only:
            raise
        logging.warning(f"⚠ {model_name} failed: {e}", exc_info=True)
        return None

    logging.info(f"✓ {model_name} completed.")
    return X_imputed
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# -------------------------- Model Registry ------------------------------- #
|
|
222
|
+
# Add config-driven models here by listing the class and its config dataclass.
|
|
223
|
+
# Maps each CLI-selectable model name to its imputer class ("cls") and the
# config dataclass ("config_cls") used to build its settings.
MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {
    model_name: {"cls": model_cls, "config_cls": config_cls}
    for model_name, model_cls, config_cls in (
        ("ImputeUBP", ImputeUBP, UBPConfig),
        ("ImputeNLPCA", ImputeNLPCA, NLPCAConfig),
        ("ImputeAutoencoder", ImputeAutoencoder, AutoencoderConfig),
        ("ImputeVAE", ImputeVAE, VAEConfig),
        ("ImputeMostFrequent", ImputeMostFrequent, MostFrequentConfig),
        ("ImputeRefAllele", ImputeRefAllele, RefAlleleConfig),
    )
}
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _build_effective_config_for_model(
    model_name: str, args: argparse.Namespace
) -> Any | None:
    """Build the effective config object for one model.

    Layers are applied from lowest to highest precedence:
        defaults < preset (--preset) < YAML (--config) < explicit CLI flags < --set

    Args:
        model_name: Key into ``MODEL_REGISTRY``.
        args: Parsed CLI namespace.

    Returns:
        The config dataclass instance, or None if the registry entry has no
        ``config_cls``.

    Raises:
        Exception: Re-raised from ``apply_dot_overrides`` when ``--set``
            overrides cannot be applied to one of the neural-network models.
    """
    cfg_cls = MODEL_REGISTRY[model_name].get("config_cls")
    if cfg_cls is None:
        return None

    # 0/1) Start from the preset when one was given, else dataclass defaults.
    if hasattr(args, "preset"):
        preset_name = args.preset
        cfg = cfg_cls.from_preset(preset_name)
        logging.info(f"Initialized {model_name} from '{preset_name}' preset.")
    else:
        cfg = cfg_cls()
        logging.info(f"Initialized {model_name} from dataclass defaults (no preset).")

    # 2) YAML overlays preset/defaults. A 'preset' key in the YAML is ignored.
    yaml_path = getattr(args, "config", None)
    if yaml_path:
        cfg = load_yaml_to_dataclass(
            yaml_path,
            cfg_cls,
            base=cfg,
            yaml_preset_behavior="ignore",
        )
        logging.info(
            f"Loaded YAML config for {model_name} from {yaml_path} (ignored 'preset' in YAML if present)."
        )

    # 3) Explicit CLI flags overlay YAML.
    cli_overrides = _args_to_cli_overrides(args)
    if cli_overrides:
        cfg = apply_dot_overrides(cfg, cli_overrides)

    # 4) --set has the highest precedence.
    user_overrides = _parse_overrides(getattr(args, "set", []))
    if user_overrides:
        strict_models = {
            "ImputeUBP",
            "ImputeNLPCA",
            "ImputeAutoencoder",
            "ImputeVAE",
        }
        try:
            cfg = apply_dot_overrides(cfg, user_overrides)
        except Exception as e:
            if model_name in strict_models:
                logging.error(
                    f"Error applying --set overrides to {model_name} config: {e}"
                )
                raise
            # Other models keep running without the --set overrides, but the
            # user is told about it instead of the error vanishing silently.
            logging.warning(
                f"Ignoring --set overrides for {model_name}; they could not be applied: {e}"
            )

    return cfg
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _maybe_print_or_dump_configs(
    cfgs_by_model: Dict[str, Any], args: argparse.Namespace
) -> bool:
    """Handle ``--print-config`` / ``--dump-config`` for the selected models.

    Args:
        cfgs_by_model: Effective config per model name; None entries are skipped.
        args: Parsed CLI namespace.

    Returns:
        True if either option was requested (the caller should then exit),
        otherwise False.
    """
    did_io = False

    if getattr(args, "print_config", False):
        for m, cfg in cfgs_by_model.items():
            if cfg is None:
                continue
            print(f"# --- {m} effective config ---")
            print(dataclass_to_yaml(cfg))
            print()
        did_io = True

    dump_base = getattr(args, "dump_config", None)
    if dump_base:
        # Insert the model name before the extension of the *file name* only.
        # Splitting the whole string on '.' would break paths whose directory
        # part contains a dot (e.g. "./cfg/run" or "v1.2/run").
        base = Path(dump_base)
        for m, cfg in cfgs_by_model.items():
            if cfg is None:
                continue
            if base.suffix:
                filename = f"{base.stem}.{m}{base.suffix}"
            else:
                filename = f"{base.name}.{m}.yaml"
            path = str(base.parent / filename)
            save_dataclass_yaml(cfg, path)
            logging.info(f"Saved {m} config to {path}")
        did_io = True

    return did_io
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the ``pgsui-cli`` argument parser.

    Config-affecting options default to ``argparse.SUPPRESS`` so that they are
    absent from the namespace unless the user passed them; this is what lets
    YAML/preset values survive when a flag is not given.
    """
    parser = argparse.ArgumentParser(
        prog="pgsui-cli",
        description="Run PG-SUI imputation models on a VCF with minimal fuss.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # ----------------------------- Required I/O ----------------------------- #
    parser.add_argument(
        "--input",
        default=argparse.SUPPRESS,
        help="Path to input file (VCF/PHYLIP/STRUCTURE/GENEPOP).",
    )
    parser.add_argument(
        "--format",
        choices=("vcf", "phylip", "structure", "genepop"),
        default=argparse.SUPPRESS,
        help="Input format; defaults to 'vcf' when --vcf is used.",
    )
    # Back-compat: --vcf retained; if both provided, --input wins.
    parser.add_argument(
        "--vcf", default=argparse.SUPPRESS, help="Path to input VCF(.gz) file."
    )
    parser.add_argument(
        "--popmap", default=argparse.SUPPRESS, help="Path to population map file."
    )
    parser.add_argument(
        "--prefix",
        default=argparse.SUPPRESS,
        help="Run/output prefix; overrides config if provided.",
    )

    # ---------------------- Generic Config Inputs -------------------------- #
    parser.add_argument(
        "--config",
        default=argparse.SUPPRESS,
        help="YAML config for config-driven models (NLPCA/UBP/Autoencoder/VAE).",
    )
    parser.add_argument(
        "--preset",
        choices=("fast", "balanced", "thorough"),
        default=argparse.SUPPRESS,
        help="If provided, initialize config(s) from this preset; otherwise start from dataclass defaults.",
    )
    parser.add_argument(
        "--set",
        action="append",
        default=argparse.SUPPRESS,
        help="Dot-key overrides, e.g. --set model.latent_dim=4",
    )
    parser.add_argument(
        "--print-config",
        action="store_true",
        help="Print effective config(s) and exit.",
    )
    parser.add_argument(
        "--dump-config",
        default=argparse.SUPPRESS,
        help="Write effective config(s) YAML to this path (multi-model gets suffixed).",
    )

    # ------------------------------ Toggles -------------------------------- #
    parser.add_argument(
        "--tune",
        action="store_true",
        default=argparse.SUPPRESS,
        help="Enable hyperparameter tuning (if supported).",
    )
    parser.add_argument(
        "--tune-n-trials",
        type=int,
        default=argparse.SUPPRESS,
        help="Optuna trials when --tune is set.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=argparse.SUPPRESS,
        help="Batch size for NN-based models.",
    )
    parser.add_argument(
        "--device",
        choices=("cpu", "cuda", "mps"),
        default=argparse.SUPPRESS,
        help="Compute device for NN-based models.",
    )
    parser.add_argument(
        "--n-jobs",
        type=int,
        default=argparse.SUPPRESS,
        help="Parallel workers for various steps.",
    )
    parser.add_argument(
        "--plot-format",
        choices=("png", "pdf", "svg"),
        default=argparse.SUPPRESS,
        help="Figure format for model plots.",
    )

    # --------------------------- Seed & logging ---------------------------- #
    parser.add_argument(
        "--seed",
        default=argparse.SUPPRESS,
        help="Random seed: 'random', 'deterministic', or an integer.",
    )
    # SUPPRESS keeps "verbose" out of the namespace unless the flag is passed,
    # so an unset --verbose no longer forces io.verbose=False over YAML/preset.
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=argparse.SUPPRESS,
        help="Debug-level logging.",
    )
    parser.add_argument(
        "--log-file", default=argparse.SUPPRESS, help="Also write logs to a file."
    )

    # ---------------------------- Data filtering --------------------------- #
    parser.add_argument(
        "--include-pops",
        nargs="+",
        default=argparse.SUPPRESS,
        help="Optional list of population IDs to include.",
    )
    parser.add_argument(
        "--force-popmap",
        action="store_true",
        default=False,
        help="Require popmap (error if absent).",
    )

    # ---------------------------- Model selection -------------------------- #
    parser.add_argument(
        "--models",
        nargs="+",
        default=argparse.SUPPRESS,
        help=(
            "Which models to run. Choices: "
            "ImputeUBP ImputeVAE ImputeAutoencoder ImputeNLPCA "
            "ImputeMostFrequent ImputeRefAllele. Default is all."
        ),
    )

    # ------------------------------ Safety/UX ------------------------------ #
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Parse args and load data, but skip model training.",
    )
    return parser


def main(argv: Optional[List[str]] = None) -> int:
    """Run the PG-SUI imputation CLI.

    Parses arguments, loads the genotype data, builds one effective config per
    selected model, then fits each model and writes its imputed genotypes as a
    VCF under ``{prefix}_output/{family}/imputed/{model}/``.

    Args:
        argv: Argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        0 on completion, including after a dry run or after printing/dumping
        configs. Argument errors exit through ``parser.error``.

    Raises:
        ValueError: If a selected model has no known output family.
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)

    # Logging (verbose is False unless --verbose was passed).
    _configure_logging(
        verbose=getattr(args, "verbose", False),
        log_file=getattr(args, "log_file", None),
    )

    # Model selection (all models when --models is not given).
    try:
        selected_models = _parse_models(getattr(args, "models", ()))
    except argparse.ArgumentTypeError as e:
        parser.error(str(e))  # exits with status 2
        return 2

    # Input resolution: --input wins over the legacy --vcf flag.
    input_path = getattr(args, "input", None)
    if input_path is None and hasattr(args, "vcf"):
        input_path = args.vcf
        if not hasattr(args, "format"):
            setattr(args, "format", "vcf")

    if input_path is None:
        parser.error("You must provide --input (or legacy --vcf).")
        return 2

    # Load genotype data.
    gd = build_genotype_data(
        input_path=input_path,
        fmt=getattr(args, "format", "vcf").lower(),
        popmap_path=getattr(args, "popmap", None),
        force_popmap=bool(getattr(args, "force_popmap", False)),
        verbose=getattr(args, "verbose", False),
        include_pops=getattr(args, "include_pops", None),
        plot_format=getattr(args, "plot_format", "pdf"),
    )

    if getattr(args, "dry_run", False):
        logging.info("Dry run complete. Exiting without training models.")
        return 0

    # ---------------- Build config(s) per selected model ------------------- #
    cfgs_by_model: Dict[str, Any] = {
        m: _build_effective_config_for_model(m, args) for m in selected_models
    }

    # Maybe print/dump configs and exit.
    if _maybe_print_or_dump_configs(cfgs_by_model, args):
        return 0

    # ------------------------- Model Builders ------------------------------ #
    def make_builder(model_name: str, *, by_keyword: bool):
        """Return a zero-argument builder for ``model_name``.

        ``by_keyword`` preserves how each imputer has been invoked so far:
        the neural-network models receive ``genotype_data=`` by keyword, the
        deterministic ones receive it positionally.
        """
        entry = MODEL_REGISTRY[model_name]

        def build():
            cfg = cfgs_by_model.get(model_name)
            if cfg is None:
                # Fallback when no effective config was prepared.
                cfg_cls = entry["config_cls"]
                cfg = (
                    cfg_cls.from_preset(args.preset)
                    if hasattr(args, "preset")
                    else cfg_cls()
                )
            if by_keyword:
                return entry["cls"](genotype_data=gd, config=cfg)
            return entry["cls"](gd, config=cfg)

        return build

    model_builders = {
        "ImputeUBP": make_builder("ImputeUBP", by_keyword=True),
        "ImputeVAE": make_builder("ImputeVAE", by_keyword=True),
        "ImputeAutoencoder": make_builder("ImputeAutoencoder", by_keyword=True),
        "ImputeNLPCA": make_builder("ImputeNLPCA", by_keyword=True),
        "ImputeMostFrequent": make_builder("ImputeMostFrequent", by_keyword=False),
        "ImputeRefAllele": make_builder("ImputeRefAllele", by_keyword=False),
    }

    # Output sub-directory ("family") per model name.
    family_by_model = {
        "ImputeUBP": "Unsupervised",
        "ImputeVAE": "Unsupervised",
        "ImputeAutoencoder": "Unsupervised",
        "ImputeNLPCA": "Unsupervised",
        "ImputeMostFrequent": "Deterministic",
        "ImputeRefAllele": "Deterministic",
        "ImputeHistGradientBoosting": "Supervised",
        "ImputeRandomForest": "Supervised",
    }

    prefix = getattr(args, "prefix", str(Path(input_path).stem))

    logging.info(f"Selected models: {', '.join(selected_models)}")
    for name in selected_models:
        X_imputed = run_model_safely(name, model_builders[name], warn_only=True)
        if X_imputed is None:
            # The model failed (already logged with its traceback) or produced
            # nothing; do not write a VCF with no genotype data.
            logging.warning(
                f"Skipping output for {name}: no imputed genotypes were produced."
            )
            continue

        family = family_by_model.get(name)
        if family is None:
            raise ValueError(f"Unknown model family for {name}")

        gd_imp = gd.copy()
        gd_imp.snp_data = X_imputed

        pth = Path(f"{prefix}_output/{family}/imputed/{name}")
        pth.mkdir(parents=True, exist_ok=True)

        logging.info(f"Writing imputed VCF for {name} to {pth} ...")
        gd_imp.write_vcf(pth / f"{name.lower()}_imputed.vcf.gz")

    logging.info("All requested models processed.")
    return 0
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|