pg-sui 1.6.16a3-py3-none-any.whl → 1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/METADATA +26 -30
  2. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/RECORD +29 -33
  3. pgsui/__init__.py +0 -8
  4. pgsui/_version.py +2 -2
  5. pgsui/cli.py +577 -125
  6. pgsui/data_processing/config.py +1 -2
  7. pgsui/data_processing/containers.py +203 -530
  8. pgsui/data_processing/transformers.py +44 -20
  9. pgsui/impute/deterministic/imputers/mode.py +475 -182
  10. pgsui/impute/deterministic/imputers/ref_allele.py +454 -147
  11. pgsui/impute/supervised/imputers/hist_gradient_boosting.py +4 -3
  12. pgsui/impute/supervised/imputers/random_forest.py +3 -2
  13. pgsui/impute/unsupervised/base.py +1269 -534
  14. pgsui/impute/unsupervised/callbacks.py +28 -33
  15. pgsui/impute/unsupervised/imputers/autoencoder.py +870 -841
  16. pgsui/impute/unsupervised/imputers/vae.py +931 -787
  17. pgsui/impute/unsupervised/loss_functions.py +156 -202
  18. pgsui/impute/unsupervised/models/autoencoder_model.py +7 -49
  19. pgsui/impute/unsupervised/models/vae_model.py +40 -221
  20. pgsui/impute/unsupervised/nn_scorers.py +53 -13
  21. pgsui/utils/classification_viz.py +240 -97
  22. pgsui/utils/misc.py +201 -3
  23. pgsui/utils/plotting.py +73 -58
  24. pgsui/utils/pretty_metrics.py +2 -6
  25. pgsui/utils/scorers.py +39 -0
  26. pgsui/impute/unsupervised/imputers/nlpca.py +0 -1666
  27. pgsui/impute/unsupervised/imputers/ubp.py +0 -1660
  28. pgsui/impute/unsupervised/models/nlpca_model.py +0 -206
  29. pgsui/impute/unsupervised/models/ubp_model.py +0 -200
  30. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/WHEEL +0 -0
  31. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/entry_points.txt +0 -0
  32. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/licenses/LICENSE +0 -0
  33. {pg_sui-1.6.16a3.dist-info → pg_sui-1.7.0.dist-info}/top_level.txt +0 -0
@@ -121,20 +121,18 @@ class ModelConfig:
121
121
  latent_dim (int): Dimensionality of the latent space.
122
122
  dropout_rate (float): Dropout rate for regularization.
123
123
  num_hidden_layers (int): Number of hidden layers in the neural network.
124
- hidden_activation (Literal["relu", "elu", "selu", "leaky_relu"]): Activation function.
124
+ activation (Literal["relu", "elu", "selu", "leaky_relu"]): Activation function.
125
125
  layer_scaling_factor (float): Scaling factor for the number of neurons in hidden layers.
126
- layer_schedule (Literal["pyramid", "constant", "linear"]): Schedule for scaling hidden layer sizes.
127
- gamma (float): Parameter for the focal loss function.
126
+ layer_schedule (Literal["pyramid", "linear"]): Schedule for scaling hidden layer sizes.
128
127
  """
129
128
 
130
129
  latent_init: Literal["random", "pca"] = "random"
131
130
  latent_dim: int = 2
132
131
  dropout_rate: float = 0.2
133
132
  num_hidden_layers: int = 2
134
- hidden_activation: Literal["relu", "elu", "selu", "leaky_relu"] = "relu"
133
+ activation: Literal["relu", "elu", "selu", "leaky_relu"] = "relu"
135
134
  layer_scaling_factor: float = 5.0
136
- layer_schedule: Literal["pyramid", "constant", "linear"] = "pyramid"
137
- gamma: float = 2.0
135
+ layer_schedule: Literal["pyramid", "linear"] = "pyramid"
138
136
 
139
137
 
140
138
  @dataclass
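Editor's note: the hunk above renames `hidden_activation` to `activation` and drops the "constant" option from `layer_schedule`. The sizing logic itself is not part of this diff, so the following is only a hedged sketch of how `layer_schedule` and `layer_scaling_factor` could translate into hidden-layer widths; the function name and formulas are assumptions, not pg-sui's code.

```python
# Hypothetical sketch only: field names (latent_dim, num_hidden_layers,
# layer_scaling_factor, layer_schedule) come from ModelConfig above; the
# formulas are assumptions.
from typing import List, Literal


def hidden_layer_sizes(
    n_features: int,
    latent_dim: int,
    num_hidden_layers: int,
    layer_scaling_factor: float,
    layer_schedule: Literal["pyramid", "linear"] = "pyramid",
) -> List[int]:
    """Return widths for the hidden layers between input and latent space."""
    widest = max(latent_dim, int(round(latent_dim * layer_scaling_factor)))
    sizes = []
    for i in range(1, num_hidden_layers + 1):
        if layer_schedule == "pyramid":
            # Taper from the widest layer down toward the latent bottleneck.
            frac = (num_hidden_layers - i + 1) / num_hidden_layers
            size = latent_dim + (widest - latent_dim) * frac
        else:  # "linear"
            # Evenly spaced widths between the input size and latent_dim.
            size = n_features - (n_features - latent_dim) * i / (num_hidden_layers + 1)
        sizes.append(max(int(round(size)), latent_dim))
    return sizes


# e.g. hidden_layer_sizes(500, latent_dim=8, num_hidden_layers=2,
#                         layer_scaling_factor=5.0) -> [40, 24]
```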
@@ -144,28 +142,39 @@ class TrainConfig:
144
142
  Attributes:
145
143
  batch_size (int): Number of samples per training batch.
146
144
  learning_rate (float): Learning rate for the optimizer.
147
- lr_input_factor (float): Factor to scale the learning rate for input layer.
148
145
  l1_penalty (float): L1 regularization penalty.
149
146
  early_stop_gen (int): Number of generations with no improvement to wait before early stopping.
150
147
  min_epochs (int): Minimum number of epochs to train.
151
148
  max_epochs (int): Maximum number of epochs to train.
152
149
  validation_split (float): Proportion of data to use for validation.
153
- weights_beta (float): Smoothing factor for class weights.
154
- weights_max_ratio (float): Maximum ratio for class weights to prevent extreme values.
150
+ weights_max_ratio (float | None): Maximum ratio for class weights to prevent extreme values.
151
+ gamma (float): Focusing parameter for focal loss.
155
152
  device (Literal["gpu", "cpu", "mps"]): Device to use for computation.
156
153
  """
157
154
 
158
- batch_size: int = 32
155
+ batch_size: int = 64
159
156
  learning_rate: float = 1e-3
160
- lr_input_factor: float = 1.0
161
157
  l1_penalty: float = 0.0
162
- early_stop_gen: int = 20
158
+ early_stop_gen: int = 25
163
159
  min_epochs: int = 100
164
- max_epochs: int = 5000
160
+ max_epochs: int = 2000
165
161
  validation_split: float = 0.2
166
- weights_beta: float = 0.9999
167
- weights_max_ratio: float = 1.0
168
162
  device: Literal["gpu", "cpu", "mps"] = "cpu"
163
+ weights_max_ratio: Optional[float] = None
164
+ weights_power: float = 1.0
165
+ weights_normalize: bool = True
166
+ weights_inverse: bool = False
167
+ gamma: float = 0.0
168
+ gamma_schedule: bool = False
169
+
170
+
171
+ def _default_train_config() -> TrainConfig:
172
+ """Typed default factory for TrainConfig (helps some type checkers).
173
+
174
+ Using the class object directly (default_factory=TrainConfig) is valid at runtime but certain type checkers can fail to match dataclasses.field overloads.
175
+ """
176
+
177
+ return TrainConfig()
169
178
 
170
179
 
171
180
  @dataclass
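Editor's note: TrainConfig replaces the old `weights_beta` smoothing with explicit class-weight knobs (`weights_power`, `weights_inverse`, `weights_normalize`, `weights_max_ratio`) and moves the focal-loss `gamma` here from ModelConfig. Below is a hedged sketch of one plausible way these fields could combine into a class-weighted, focal-modulated cross-entropy; only the field names are taken from the config, the weighting formulas and the loss itself are assumptions, not pg-sui's implementation.

```python
# Hypothetical sketch: field names from TrainConfig above, formulas assumed.
import numpy as np
import torch
import torch.nn.functional as F


def class_weights(
    counts: np.ndarray,               # per-genotype-class counts (e.g. 0/1/2)
    power: float = 1.0,               # weights_power
    inverse: bool = False,            # weights_inverse
    normalize: bool = True,           # weights_normalize
    max_ratio: float | None = None,   # weights_max_ratio
) -> torch.Tensor:
    freqs = counts / counts.sum()
    w = 1.0 / np.clip(freqs, 1e-12, None) if inverse else 1.0 - freqs
    w = w**power
    if max_ratio is not None:
        # Cap how much larger any class weight may be than the smallest weight.
        w = np.minimum(w, w.min() * max_ratio)
    if normalize:
        w = w * len(w) / w.sum()      # rescale so the mean weight is 1
    return torch.as_tensor(w, dtype=torch.float32)


def focal_cross_entropy(
    logits: torch.Tensor, target: torch.Tensor,
    weight: torch.Tensor, gamma: float = 0.0,
) -> torch.Tensor:
    """Class-weighted cross-entropy with focal modulation; gamma=0 is plain CE."""
    ce = F.cross_entropy(logits, target, reduction="none")  # -log p_t
    pt = torch.exp(-ce)                                      # prob. of true class
    return (weight[target] * (1.0 - pt) ** gamma * ce).mean()
```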
@@ -174,19 +183,13 @@ class TuneConfig:
174
183
 
175
184
  Attributes:
176
185
  enabled (bool): If True, enables hyperparameter tuning.
177
- metric (Literal["f1", "accuracy", "pr_macro"]): Metric to optimize during tuning.
186
+ metric (Literal["f1", "accuracy", "pr_macro", "average_precision", "roc_auc", "precision", "recall", "mcc", "jaccard"]): Metric to optimize during tuning.
178
187
  n_trials (int): Number of hyperparameter trials to run.
179
188
  resume (bool): If True, resumes tuning from a previous state.
180
189
  save_db (bool): If True, saves the tuning results to a database.
181
- fast (bool): If True, uses a faster but less thorough tuning approach.
182
- max_samples (int): Maximum number of samples to use for tuning. 0 means all samples.
183
- max_loci (int): Maximum number of loci to use for tuning. 0 means all loci.
184
190
  epochs (int): Number of epochs to train each trial.
185
191
  batch_size (int): Batch size for training during tuning.
186
- eval_interval (int): Interval (in epochs) at which to evaluate the model during tuning.
187
- infer_epochs (int): Number of epochs for inference during tuning.
188
192
  patience (int): Number of evaluations with no improvement before stopping early.
189
- proxy_metric_batch (int): If > 0, uses a subset of data for proxy metric evaluation.
190
193
  """
191
194
 
192
195
  enabled: bool = False
@@ -198,34 +201,15 @@ class TuneConfig:
198
201
  "roc_auc",
199
202
  "precision",
200
203
  "recall",
204
+ "mcc",
205
+ "jaccard",
201
206
  ] = "f1"
202
207
  n_trials: int = 100
203
208
  resume: bool = False
204
209
  save_db: bool = False
205
- fast: bool = True
206
- max_samples: int = 512
207
- max_loci: int = 0 # 0 = all
208
210
  epochs: int = 500
209
211
  batch_size: int = 64
210
- eval_interval: int = 20
211
- infer_epochs: int = 100
212
212
  patience: int = 10
213
- proxy_metric_batch: int = 0
214
-
215
-
216
- @dataclass
217
- class EvalConfig:
218
- """Evaluation configuration.
219
-
220
- Attributes:
221
- eval_latent_steps (int): Number of optimization steps for latent space evaluation.
222
- eval_latent_lr (float): Learning rate for latent space optimization.
223
- eval_latent_weight_decay (float): Weight decay for latent space optimization.
224
- """
225
-
226
- eval_latent_steps: int = 50
227
- eval_latent_lr: float = 1e-2
228
- eval_latent_weight_decay: float = 0.0
229
213
 
230
214
 
231
215
  @dataclass
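Editor's note: TuneConfig's `metric` literal gains "mcc" and "jaccard". pg-sui's actual scorer wiring lives in `pgsui/utils/scorers.py` (also touched in this diff) and is not shown here; the mapping below is only an assumed illustration of how such metric names can resolve to scikit-learn functions.

```python
# Assumed illustration only; see pgsui/utils/scorers.py for the real wiring.
from sklearn import metrics

METRIC_FUNCS = {
    "accuracy": metrics.accuracy_score,
    "f1": lambda y, p: metrics.f1_score(y, p, average="macro"),
    "precision": lambda y, p: metrics.precision_score(y, p, average="macro"),
    "recall": lambda y, p: metrics.recall_score(y, p, average="macro"),
    "mcc": metrics.matthews_corrcoef,
    "jaccard": lambda y, p: metrics.jaccard_score(y, p, average="macro"),
    # "pr_macro", "average_precision", and "roc_auc" need class probabilities
    # rather than hard labels, so they would be handled separately.
}


def tuning_score(metric_name: str, y_true, y_pred) -> float:
    return float(METRIC_FUNCS[metric_name](y_true, y_pred))
```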
@@ -244,15 +228,18 @@ class PlotConfig:
244
228
  dpi: int = 300
245
229
  fontsize: int = 18
246
230
  despine: bool = True
247
- show: bool = False
231
+ show: bool = True
248
232
 
249
233
 
250
234
  @dataclass
251
235
  class IOConfig:
252
236
  """I/O configuration.
253
237
 
238
+ Dataclass that includes configuration settings for file naming, logging verbosity, random seed, and parallelism.
239
+
254
240
  Attributes:
255
241
  prefix (str): Prefix for output files. Default is "pgsui".
242
+ ploidy (int): Ploidy level of the organism. Default is 2.
256
243
  verbose (bool): If True, enables verbose logging. Default is False.
257
244
  debug (bool): If True, enables debug mode. Default is False.
258
245
  seed (int | None): Random seed for reproducibility. Default is None.
@@ -261,6 +248,7 @@ class IOConfig:
261
248
  """
262
249
 
263
250
  prefix: str = "pgsui"
251
+ ploidy: int = 2
264
252
  verbose: bool = False
265
253
  debug: bool = False
266
254
  seed: int | None = None
@@ -287,37 +275,46 @@ class SimConfig:
287
275
  "nonrandom",
288
276
  "nonrandom_weighted",
289
277
  ] = "random"
290
- sim_prop: float = 0.10
278
+ sim_prop: float = 0.20
291
279
  sim_kwargs: dict | None = None
292
280
 
293
281
 
294
282
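Editor's note: SimConfig's default `sim_prop` rises from 0.10 to 0.20. Simulated missingness masks a proportion of known genotypes so the imputer can be scored against held-out truth; the helper below is a hedged sketch of the "random" strategy only (the `nonrandom*` strategies would bias which entries are masked) and is not pg-sui's implementation.

```python
# Hedged sketch of the "random" simulated-missing strategy; not pg-sui's code.
import numpy as np


def simulate_missing(
    X: np.ndarray, sim_prop: float = 0.20, missing: int = -1,
    rng: np.random.Generator | None = None,
):
    """Mask `sim_prop` of the observed entries of a (samples x loci) matrix."""
    rng = rng or np.random.default_rng()
    X_sim = X.copy()
    observed = np.argwhere(X != missing)                    # candidate positions
    n_mask = int(round(sim_prop * len(observed)))
    picked = observed[rng.choice(len(observed), size=n_mask, replace=False)]
    X_sim[picked[:, 0], picked[:, 1]] = missing
    return X_sim, picked   # `picked` holds the positions with known truth
```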
  @dataclass
295
- class NLPCAConfig:
296
- """Top-level configuration for ImputeNLPCA.
283
+ class AutoencoderConfig:
284
+ """Top-level configuration for ImputeAutoencoder.
285
+
286
+ This configuration class encapsulates all settings required for the
287
+ ImputeAutoencoder model, including I/O, model architecture, training,
288
+ hyperparameter tuning, plotting, and simulated-missing configuration.
297
289
 
298
290
  Attributes:
299
291
  io (IOConfig): I/O configuration.
300
292
  model (ModelConfig): Model architecture configuration.
301
293
  train (TrainConfig): Training procedure configuration.
302
294
  tune (TuneConfig): Hyperparameter tuning configuration.
303
- evaluate (EvalConfig): Evaluation configuration.
304
295
  plot (PlotConfig): Plotting configuration.
305
- sim (SimConfig): Simulation configuration.
296
+ sim (SimConfig): Simulated-missing configuration.
306
297
  """
307
298
 
308
299
  io: IOConfig = field(default_factory=IOConfig)
309
300
  model: ModelConfig = field(default_factory=ModelConfig)
310
- train: TrainConfig = field(default_factory=TrainConfig)
301
+ train: TrainConfig = field(default_factory=_default_train_config)
311
302
  tune: TuneConfig = field(default_factory=TuneConfig)
312
- evaluate: EvalConfig = field(default_factory=EvalConfig)
313
303
  plot: PlotConfig = field(default_factory=PlotConfig)
314
304
  sim: SimConfig = field(default_factory=SimConfig)
315
305
 
316
306
  @classmethod
317
307
  def from_preset(
318
308
  cls, preset: Literal["fast", "balanced", "thorough"] = "balanced"
319
- ) -> "NLPCAConfig":
320
- """Build a NLPCAConfig from a named preset."""
309
+ ) -> "AutoencoderConfig":
310
+ """Build a AutoencoderConfig from a named preset.
311
+
312
+ Args:
313
+ preset (Literal["fast", "balanced", "thorough"]): Preset name.
314
+
315
+ Returns:
316
+ AutoencoderConfig: Configuration instance corresponding to the preset.
317
+ """
321
318
  if preset not in {"fast", "balanced", "thorough"}:
322
319
  raise ValueError(f"Unknown preset: {preset}")
323
320
 
@@ -325,414 +322,87 @@ class NLPCAConfig:
325
322
 
326
323
  # Common baselines
327
324
  cfg.io.verbose = False
328
- cfg.train.validation_split = 0.20
329
- cfg.model.hidden_activation = "relu"
325
+ cfg.io.ploidy = 2
326
+ cfg.train.validation_split = 0.2
327
+ cfg.model.activation = "relu"
330
328
  cfg.model.layer_schedule = "pyramid"
331
- cfg.model.latent_init = "random"
332
- cfg.evaluate.eval_latent_lr = 1e-2
333
- cfg.evaluate.eval_latent_weight_decay = 0.0
334
- cfg.sim.simulate_missing = True
329
+ cfg.model.layer_scaling_factor = 2.0
335
330
  cfg.sim.sim_strategy = "random"
336
331
  cfg.sim.sim_prop = 0.2
332
+ cfg.plot.show = True
333
+
334
+ # Train settings
335
+ cfg.train.weights_max_ratio = None
336
+ cfg.train.weights_power = 1.0
337
+ cfg.train.weights_normalize = True
338
+ cfg.train.weights_inverse = False
339
+ cfg.train.gamma = 0.0
340
+ cfg.train.gamma_schedule = False
341
+ cfg.train.min_epochs = 100
342
+
343
+ # Tune
344
+ cfg.tune.enabled = False
345
+ cfg.tune.n_trials = 100
337
346
 
338
347
  if preset == "fast":
339
348
  # Model
340
349
  cfg.model.latent_dim = 4
341
350
  cfg.model.num_hidden_layers = 1
342
- cfg.model.layer_scaling_factor = 2.0
343
351
  cfg.model.dropout_rate = 0.10
344
- cfg.model.gamma = 1.5
345
- # Train
346
- cfg.train.batch_size = 256
347
- cfg.train.learning_rate = 2e-3
348
- cfg.train.early_stop_gen = 5
349
- cfg.train.min_epochs = 10
350
- cfg.train.max_epochs = 150
351
- cfg.train.weights_beta = 0.999
352
- cfg.train.weights_max_ratio = 5.0
353
- # Tuning
354
- cfg.tune.enabled = True
355
- cfg.tune.fast = True
356
- cfg.tune.n_trials = 20
357
- cfg.tune.epochs = 150
358
- cfg.tune.batch_size = 256
359
- cfg.tune.max_samples = 512
360
- cfg.tune.max_loci = 0
361
- cfg.tune.eval_interval = 20
362
- cfg.tune.infer_epochs = 20
363
- cfg.tune.patience = 5
364
- cfg.tune.proxy_metric_batch = 0
365
- # Eval
366
- cfg.evaluate.eval_latent_steps = 20
367
352
 
368
- elif preset == "balanced":
369
- # Model
370
- cfg.model.latent_dim = 8
371
- cfg.model.num_hidden_layers = 2
372
- cfg.model.layer_scaling_factor = 3.0
373
- cfg.model.dropout_rate = 0.20
374
- cfg.model.gamma = 2.0
375
353
  # Train
376
354
  cfg.train.batch_size = 128
377
- cfg.train.learning_rate = 1e-3
355
+ cfg.train.learning_rate = 2e-3
378
356
  cfg.train.early_stop_gen = 15
379
- cfg.train.min_epochs = 50
380
- cfg.train.max_epochs = 600
381
- cfg.train.weights_beta = 0.9999
382
- cfg.train.weights_max_ratio = 5.0
383
- # Tuning
384
- cfg.tune.enabled = True
385
- cfg.tune.fast = False
386
- cfg.tune.n_trials = 60
387
- cfg.tune.epochs = 200
388
- cfg.tune.batch_size = 128
389
- cfg.tune.max_samples = 2048
390
- cfg.tune.max_loci = 0
391
- cfg.tune.eval_interval = 10
392
- cfg.tune.infer_epochs = 50
393
- cfg.tune.patience = 10
394
- cfg.tune.proxy_metric_batch = 0
395
- # Eval
396
- cfg.evaluate.eval_latent_steps = 40
397
-
398
- else: # thorough
399
- # Model
400
- cfg.model.latent_dim = 16
401
- cfg.model.num_hidden_layers = 3
402
- cfg.model.layer_scaling_factor = 5.0
403
- cfg.model.dropout_rate = 0.30
404
- cfg.model.gamma = 2.5
405
- # Train
406
- cfg.train.batch_size = 64
407
- cfg.train.learning_rate = 5e-4
408
- cfg.train.early_stop_gen = 30
409
- cfg.train.min_epochs = 100
410
- cfg.train.max_epochs = 2000
411
- cfg.train.weights_beta = 0.9999
412
- cfg.train.weights_max_ratio = 5.0
413
- # Tuning
414
- cfg.tune.enabled = True
415
- cfg.tune.fast = False # Full search
416
- cfg.tune.n_trials = 100
417
- cfg.tune.epochs = 600
418
- cfg.tune.batch_size = 64
419
- cfg.tune.max_samples = 0 # No limit
420
- cfg.tune.max_loci = 0
421
- cfg.tune.eval_interval = 10
422
- cfg.tune.infer_epochs = 80
423
- cfg.tune.patience = 20
424
- cfg.tune.proxy_metric_batch = 0
425
- # Eval
426
- cfg.evaluate.eval_latent_steps = 100
427
-
428
- return cfg
429
-
430
- def apply_overrides(self, overrides: Dict[str, Any] | None) -> "NLPCAConfig":
431
- """Apply flat dot-key overrides."""
432
- if not overrides:
433
- return self
434
- for k, v in overrides.items():
435
- node = self
436
- parts = k.split(".")
437
- for p in parts[:-1]:
438
- node = getattr(node, p)
439
- last = parts[-1]
440
- if hasattr(node, last):
441
- setattr(node, last, v)
442
- else:
443
- raise KeyError(f"Unknown config key: {k}")
444
- return self
445
-
446
- def to_dict(self) -> Dict[str, Any]:
447
- return asdict(self)
448
-
449
-
450
- @dataclass
451
- class UBPConfig:
452
- """Top-level configuration for ImputeUBP.
453
-
454
- Attributes:
455
- io (IOConfig): I/O configuration.
456
- model (ModelConfig): Model architecture configuration.
457
- train (TrainConfig): Training procedure configuration.
458
- tune (TuneConfig): Hyperparameter tuning configuration.
459
- evaluate (EvalConfig): Evaluation configuration.
460
- plot (PlotConfig): Plotting configuration.
461
- sim (SimConfig): Simulated-missing configuration.
462
- """
357
+ cfg.train.max_epochs = 200
358
+ cfg.train.weights_max_ratio = None
463
359
 
464
- io: IOConfig = field(default_factory=IOConfig)
465
- model: ModelConfig = field(default_factory=ModelConfig)
466
- train: TrainConfig = field(default_factory=TrainConfig)
467
- tune: TuneConfig = field(default_factory=TuneConfig)
468
- evaluate: EvalConfig = field(default_factory=EvalConfig)
469
- plot: PlotConfig = field(default_factory=PlotConfig)
470
- sim: SimConfig = field(default_factory=SimConfig)
471
-
472
- @classmethod
473
- def from_preset(
474
- cls, preset: Literal["fast", "balanced", "thorough"] = "balanced"
475
- ) -> "UBPConfig":
476
- """Build a UBPConfig from a named preset."""
477
- if preset not in {"fast", "balanced", "thorough"}:
478
- raise ValueError(f"Unknown preset: {preset}")
479
-
480
- cfg = cls()
481
-
482
- # Common baselines
483
- cfg.io.verbose = False
484
- cfg.model.hidden_activation = "relu"
485
- cfg.model.layer_schedule = "pyramid"
486
- cfg.model.latent_init = "random"
487
- cfg.sim.simulate_missing = True
488
- cfg.sim.sim_strategy = "random"
489
- cfg.sim.sim_prop = 0.2
490
-
491
- if preset == "fast":
492
- # Model
493
- cfg.model.latent_dim = 4
494
- cfg.model.num_hidden_layers = 1
495
- cfg.model.layer_scaling_factor = 2.0
496
- cfg.model.dropout_rate = 0.10
497
- cfg.model.gamma = 1.5
498
- # Train
499
- cfg.train.batch_size = 256
500
- cfg.train.learning_rate = 2e-3
501
- cfg.train.early_stop_gen = 5
502
- cfg.train.min_epochs = 10
503
- cfg.train.max_epochs = 150
504
- cfg.train.weights_beta = 0.999
505
- cfg.train.weights_max_ratio = 5.0
506
- # Tuning
507
- cfg.tune.enabled = True
508
- cfg.tune.fast = True
509
- cfg.tune.n_trials = 20
510
- cfg.tune.epochs = 150
511
- cfg.tune.batch_size = 256
512
- cfg.tune.max_samples = 512
513
- cfg.tune.max_loci = 0
514
- cfg.tune.eval_interval = 20
515
- cfg.tune.infer_epochs = 20
516
- cfg.tune.patience = 5
517
- cfg.tune.proxy_metric_batch = 0
518
- # Eval
519
- cfg.evaluate.eval_latent_steps = 20
520
- cfg.evaluate.eval_latent_lr = 1e-2
521
- cfg.evaluate.eval_latent_weight_decay = 0.0
360
+ # Tune
361
+ cfg.tune.patience = 15
522
362
 
523
363
  elif preset == "balanced":
524
364
  # Model
525
365
  cfg.model.latent_dim = 8
526
366
  cfg.model.num_hidden_layers = 2
527
- cfg.model.layer_scaling_factor = 3.0
528
367
  cfg.model.dropout_rate = 0.20
529
- cfg.model.gamma = 2.0
368
+
530
369
  # Train
531
- cfg.train.batch_size = 128
370
+ cfg.train.batch_size = 64
532
371
  cfg.train.learning_rate = 1e-3
533
- cfg.train.early_stop_gen = 15
534
- cfg.train.min_epochs = 50
535
- cfg.train.max_epochs = 600
536
- cfg.train.weights_beta = 0.9999
537
- cfg.train.weights_max_ratio = 5.0
538
- # Tuning
539
- cfg.tune.enabled = True
540
- cfg.tune.fast = False
541
- cfg.tune.n_trials = 60
542
- cfg.tune.epochs = 200
543
- cfg.tune.batch_size = 128
544
- cfg.tune.max_samples = 2048
545
- cfg.tune.max_loci = 0
546
- cfg.tune.eval_interval = 10
547
- cfg.tune.infer_epochs = 50
548
- cfg.tune.patience = 10
549
- cfg.tune.proxy_metric_batch = 0
550
- # Eval
551
- cfg.evaluate.eval_latent_steps = 40
552
- cfg.evaluate.eval_latent_lr = 1e-2
553
- cfg.evaluate.eval_latent_weight_decay = 0.0
372
+ cfg.train.early_stop_gen = 25
373
+ cfg.train.max_epochs = 500
374
+ cfg.train.weights_max_ratio = None
375
+
376
+ # Tune
377
+ cfg.tune.patience = 25
554
378
 
555
379
  else: # thorough
556
380
  # Model
557
381
  cfg.model.latent_dim = 16
558
382
  cfg.model.num_hidden_layers = 3
559
- cfg.model.layer_scaling_factor = 5.0
560
383
  cfg.model.dropout_rate = 0.30
561
- cfg.model.gamma = 2.5
384
+
562
385
  # Train
563
386
  cfg.train.batch_size = 64
564
387
  cfg.train.learning_rate = 5e-4
565
- cfg.train.early_stop_gen = 30
566
- cfg.train.min_epochs = 100
567
- cfg.train.max_epochs = 2000
568
- cfg.train.weights_beta = 0.9999
569
- cfg.train.weights_max_ratio = 5.0
570
- # Tuning
571
- cfg.tune.enabled = True
572
- cfg.tune.fast = False
573
- cfg.tune.n_trials = 100
574
- cfg.tune.epochs = 600
575
- cfg.tune.batch_size = 64
576
- cfg.tune.max_samples = 0
577
- cfg.tune.max_loci = 0
578
- cfg.tune.eval_interval = 10
579
- cfg.tune.infer_epochs = 80
580
- cfg.tune.patience = 20
581
- cfg.tune.proxy_metric_batch = 0
582
- # Eval
583
- cfg.evaluate.eval_latent_steps = 100
584
- cfg.evaluate.eval_latent_lr = 1e-2
585
- cfg.evaluate.eval_latent_weight_decay = 0.0
586
-
587
- return cfg
588
-
589
- def apply_overrides(self, overrides: Dict[str, Any] | None) -> "UBPConfig":
590
- """Apply flat dot-key overrides."""
591
- if not overrides:
592
- return self
593
-
594
- for k, v in overrides.items():
595
- node = self
596
- parts = k.split(".")
597
- for p in parts[:-1]:
598
- node = getattr(node, p)
599
- last = parts[-1]
600
- if hasattr(node, last):
601
- setattr(node, last, v)
602
- else:
603
- raise KeyError(f"Unknown config key: {k}")
604
- return self
605
-
606
- def to_dict(self) -> Dict[str, Any]:
607
- return asdict(self)
608
-
609
-
610
- @dataclass
611
- class AutoencoderConfig:
612
- """Top-level configuration for ImputeAutoencoder.
613
-
614
- Attributes:
615
- io (IOConfig): I/O configuration.
616
- model (ModelConfig): Model architecture configuration.
617
- train (TrainConfig): Training procedure configuration.
618
- tune (TuneConfig): Hyperparameter tuning configuration.
619
- evaluate (EvalConfig): Evaluation configuration.
620
- plot (PlotConfig): Plotting configuration.
621
- sim (SimConfig): Simulated-missing configuration.
622
- """
623
-
624
- io: IOConfig = field(default_factory=IOConfig)
625
- model: ModelConfig = field(default_factory=ModelConfig)
626
- train: TrainConfig = field(default_factory=TrainConfig)
627
- tune: TuneConfig = field(default_factory=TuneConfig)
628
- evaluate: EvalConfig = field(default_factory=EvalConfig)
629
- plot: PlotConfig = field(default_factory=PlotConfig)
630
- sim: SimConfig = field(default_factory=SimConfig)
631
-
632
- @classmethod
633
- def from_preset(
634
- cls, preset: Literal["fast", "balanced", "thorough"] = "balanced"
635
- ) -> "AutoencoderConfig":
636
- """Build a AutoencoderConfig from a named preset."""
637
- if preset not in {"fast", "balanced", "thorough"}:
638
- raise ValueError(f"Unknown preset: {preset}")
639
-
640
- cfg = cls()
641
-
642
- # Common baselines (no latent refinement at eval)
643
- cfg.io.verbose = False
644
- cfg.train.validation_split = 0.20
645
- cfg.model.hidden_activation = "relu"
646
- cfg.model.layer_schedule = "pyramid"
647
- cfg.evaluate.eval_latent_steps = 0
648
- cfg.evaluate.eval_latent_lr = 0.0
649
- cfg.evaluate.eval_latent_weight_decay = 0.0
650
- cfg.sim.simulate_missing = True
651
- cfg.sim.sim_strategy = "random"
652
- cfg.sim.sim_prop = 0.2
653
-
654
- if preset == "fast":
655
- cfg.model.latent_dim = 4
656
- cfg.model.num_hidden_layers = 1
657
- cfg.model.layer_scaling_factor = 2.0
658
- cfg.model.dropout_rate = 0.10
659
- cfg.model.gamma = 1.5
660
- cfg.train.batch_size = 256
661
- cfg.train.learning_rate = 2e-3
662
- cfg.train.early_stop_gen = 5
663
- cfg.train.min_epochs = 10
664
- cfg.train.max_epochs = 150
665
- cfg.train.weights_beta = 0.999
666
- cfg.train.weights_max_ratio = 5.0
667
- cfg.tune.enabled = True
668
- cfg.tune.fast = True
669
- cfg.tune.n_trials = 20
670
- cfg.tune.epochs = 150
671
- cfg.tune.batch_size = 256
672
- cfg.tune.max_samples = 512
673
- cfg.tune.max_loci = 0
674
- cfg.tune.eval_interval = 20
675
- cfg.tune.patience = 5
676
- cfg.tune.proxy_metric_batch = 0
677
- if hasattr(cfg.tune, "infer_epochs"):
678
- cfg.tune.infer_epochs = 0
388
+ cfg.train.early_stop_gen = 50
389
+ cfg.train.max_epochs = 1000
390
+ cfg.train.weights_max_ratio = None
679
391
 
680
- elif preset == "balanced":
681
- cfg.model.latent_dim = 8
682
- cfg.model.num_hidden_layers = 2
683
- cfg.model.layer_scaling_factor = 3.0
684
- cfg.model.dropout_rate = 0.20
685
- cfg.model.gamma = 2.0
686
- cfg.train.batch_size = 128
687
- cfg.train.learning_rate = 1e-3
688
- cfg.train.early_stop_gen = 15
689
- cfg.train.min_epochs = 50
690
- cfg.train.max_epochs = 600
691
- cfg.train.weights_beta = 0.9999
692
- cfg.train.weights_max_ratio = 5.0
693
- cfg.tune.enabled = True
694
- cfg.tune.fast = False
695
- cfg.tune.n_trials = 60
696
- cfg.tune.epochs = 200
697
- cfg.tune.batch_size = 128
698
- cfg.tune.max_samples = 2048
699
- cfg.tune.max_loci = 0
700
- cfg.tune.eval_interval = 10
701
- cfg.tune.patience = 10
702
- cfg.tune.proxy_metric_batch = 0
703
- if hasattr(cfg.tune, "infer_epochs"):
704
- cfg.tune.infer_epochs = 0
705
-
706
- else: # thorough
707
- cfg.model.latent_dim = 16
708
- cfg.model.num_hidden_layers = 3
709
- cfg.model.layer_scaling_factor = 5.0
710
- cfg.model.dropout_rate = 0.30
711
- cfg.model.gamma = 2.5
712
- cfg.train.batch_size = 64
713
- cfg.train.learning_rate = 5e-4
714
- cfg.train.early_stop_gen = 30
715
- cfg.train.min_epochs = 100
716
- cfg.train.max_epochs = 2000
717
- cfg.train.weights_beta = 0.9999
718
- cfg.train.weights_max_ratio = 5.0
719
- cfg.tune.enabled = True
720
- cfg.tune.fast = False
721
- cfg.tune.n_trials = 100
722
- cfg.tune.epochs = 600
723
- cfg.tune.batch_size = 64
724
- cfg.tune.max_samples = 0
725
- cfg.tune.max_loci = 0
726
- cfg.tune.eval_interval = 10
727
- cfg.tune.patience = 20
728
- cfg.tune.proxy_metric_batch = 0
729
- if hasattr(cfg.tune, "infer_epochs"):
730
- cfg.tune.infer_epochs = 0
392
+ # Tune
393
+ cfg.tune.patience = 50
731
394
 
732
395
  return cfg
733
396
 
734
397
  def apply_overrides(self, overrides: Dict[str, Any] | None) -> "AutoencoderConfig":
735
- """Apply flat dot-key overrides."""
398
+ """Apply flat dot-key overrides.
399
+
400
+ Args:
401
+ overrides (Dict[str, Any] | None): Dictionary of overrides with dot-separated keys.
402
+
403
+ Returns:
404
+ AutoencoderConfig: The configuration instance with overrides applied in place.
405
+ """
736
406
  if not overrides:
737
407
  return self
738
408
  for k, v in overrides.items():
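Editor's note: taken together, `from_preset` and the dot-key `apply_overrides` shown above give the typical construction pattern. A short usage sketch follows; the import path is an assumption inferred from `pgsui/data_processing/config.py` in the file list, and `to_dict()` is assumed to be retained from the earlier config classes.

```python
# Usage sketch; the import path and retained to_dict() helper are assumptions.
from pgsui.data_processing.config import AutoencoderConfig

cfg = AutoencoderConfig.from_preset("balanced")
cfg.apply_overrides({
    "train.learning_rate": 5e-4,   # dot-keys address nested dataclass fields
    "model.latent_dim": 16,
    "tune.enabled": True,
})
# Unknown keys raise KeyError, so typos in override names fail loudly.
print(cfg.to_dict()["train"]["learning_rate"])  # 0.0005
```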
@@ -753,29 +423,22 @@ class AutoencoderConfig:
753
423
 
754
424
  @dataclass
755
425
  class VAEExtraConfig:
756
- """VAE-specific knobs.
757
-
758
- Attributes:
759
- kl_beta (float): Final β for KL divergence term.
760
- kl_warmup (int): Number of epochs with β=0 (warm-up period).
761
- kl_ramp (int): Number of epochs for linear ramp to final β.
762
- """
763
-
764
426
  kl_beta: float = 1.0
765
- kl_warmup: int = 50
766
- kl_ramp: int = 200
427
+ kl_beta_schedule: bool = False
767
428
 
768
429
 
769
430
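Editor's note: VAEExtraConfig drops the epoch-based `kl_warmup`/`kl_ramp` pair in favor of a single `kl_beta_schedule` flag. The schedule's shape is not visible in this hunk, so the sketch below only illustrates the general idea of a beta-weighted KL term with optional annealing; the ramp shape and function name are assumptions.

```python
# Illustration only: field names kl_beta / kl_beta_schedule come from
# VAEExtraConfig above; the annealing shape is assumed, not pg-sui's code.
import torch


def vae_loss(
    recon_nll: torch.Tensor, mu: torch.Tensor, logvar: torch.Tensor,
    epoch: int, max_epochs: int,
    kl_beta: float = 1.0, kl_beta_schedule: bool = False,
) -> torch.Tensor:
    # Analytic KL between N(mu, sigma^2) and the standard normal prior.
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=1).mean()
    beta = kl_beta
    if kl_beta_schedule:
        # Assumed: anneal beta linearly from 0 to kl_beta over half of training.
        beta = kl_beta * min(1.0, epoch / max(1, max_epochs // 2))
    return recon_nll + beta * kl
```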
  @dataclass
770
431
  class VAEConfig:
771
432
  """Top-level configuration for ImputeVAE (AE-parity + VAE extras).
772
433
 
434
+ Mirrors AutoencoderConfig sections and adds a ``vae`` block with KL-beta
435
+ controls for the VAE loss.
436
+
773
437
  Attributes:
774
438
  io (IOConfig): I/O configuration.
775
439
  model (ModelConfig): Model architecture configuration.
776
440
  train (TrainConfig): Training procedure configuration.
777
441
  tune (TuneConfig): Hyperparameter tuning configuration.
778
- evaluate (EvalConfig): Evaluation configuration.
779
442
  plot (PlotConfig): Plotting configuration.
780
443
  vae (VAEExtraConfig): VAE-specific configuration.
781
444
  sim (SimConfig): Simulated-missing configuration.
@@ -783,9 +446,8 @@ class VAEConfig:
783
446
 
784
447
  io: IOConfig = field(default_factory=IOConfig)
785
448
  model: ModelConfig = field(default_factory=ModelConfig)
786
- train: TrainConfig = field(default_factory=TrainConfig)
449
+ train: TrainConfig = field(default_factory=_default_train_config)
787
450
  tune: TuneConfig = field(default_factory=TuneConfig)
788
- evaluate: EvalConfig = field(default_factory=EvalConfig)
789
451
  plot: PlotConfig = field(default_factory=PlotConfig)
790
452
  vae: VAEExtraConfig = field(default_factory=VAEExtraConfig)
791
453
  sim: SimConfig = field(default_factory=SimConfig)
@@ -794,119 +456,92 @@ class VAEConfig:
794
456
  def from_preset(
795
457
  cls, preset: Literal["fast", "balanced", "thorough"] = "balanced"
796
458
  ) -> "VAEConfig":
797
- """Build a VAEConfig from a named preset."""
459
+ """Build a VAEConfig from a named preset.
460
+
461
+ Args:
462
+ preset (Literal["fast", "balanced", "thorough"]): Preset name.
463
+
464
+ Returns:
465
+ VAEConfig: Configuration instance corresponding to the preset.
466
+ """
798
467
  if preset not in {"fast", "balanced", "thorough"}:
799
468
  raise ValueError(f"Unknown preset: {preset}")
800
469
 
801
470
  cfg = cls()
802
471
 
803
- # Common baselines (match AE; no latent refinement at eval)
472
+ # General settings
804
473
  cfg.io.verbose = False
805
- cfg.train.validation_split = 0.20
806
- cfg.model.hidden_activation = "relu"
474
+ cfg.io.ploidy = 2
475
+ cfg.train.validation_split = 0.2
476
+ cfg.model.activation = "relu"
807
477
  cfg.model.layer_schedule = "pyramid"
808
- cfg.evaluate.eval_latent_steps = 0
809
- cfg.evaluate.eval_latent_lr = 0.0
810
- cfg.evaluate.eval_latent_weight_decay = 0.0
478
+ cfg.model.layer_scaling_factor = 2.0
811
479
  cfg.sim.simulate_missing = True
812
480
  cfg.sim.sim_strategy = "random"
813
481
  cfg.sim.sim_prop = 0.2
482
+ cfg.plot.show = True
483
+
484
+ # Train settings
485
+ cfg.train.weights_max_ratio = None
486
+ cfg.train.weights_power = 1.0
487
+ cfg.train.weights_normalize = True
488
+ cfg.train.weights_inverse = False
489
+ cfg.train.gamma = 0.0
490
+ cfg.train.gamma_schedule = False
491
+ cfg.train.min_epochs = 100
492
+
493
+ # VAE-specific
494
+ cfg.vae.kl_beta = 1.0
495
+ cfg.vae.kl_beta_schedule = False
496
+
497
+ # Tune
498
+ cfg.tune.enabled = False
499
+ cfg.tune.n_trials = 100
814
500
 
815
501
  if preset == "fast":
502
+ # Model
816
503
  cfg.model.latent_dim = 4
817
- cfg.model.num_hidden_layers = 1
818
- cfg.model.layer_scaling_factor = 2.0
504
+ cfg.model.num_hidden_layers = 2
819
505
  cfg.model.dropout_rate = 0.10
820
- cfg.model.gamma = 1.5
821
- # VAE specifics
822
- cfg.vae.kl_beta = 0.5
823
- cfg.vae.kl_warmup = 10
824
- cfg.vae.kl_ramp = 40
506
+
825
507
  # Train
826
- cfg.train.batch_size = 256
508
+ cfg.train.batch_size = 128
827
509
  cfg.train.learning_rate = 2e-3
828
- cfg.train.early_stop_gen = 5
829
- cfg.train.min_epochs = 10
830
- cfg.train.max_epochs = 150
831
- cfg.train.weights_beta = 0.999
832
- cfg.train.weights_max_ratio = 5.0
510
+ cfg.train.early_stop_gen = 15
511
+ cfg.train.max_epochs = 200
512
+
833
513
  # Tune
834
- cfg.tune.enabled = True
835
- cfg.tune.fast = True
836
- cfg.tune.n_trials = 20
837
- cfg.tune.epochs = 150
838
- cfg.tune.batch_size = 256
839
- cfg.tune.max_samples = 512
840
- cfg.tune.max_loci = 0
841
- cfg.tune.eval_interval = 20
842
- cfg.tune.patience = 5
843
- cfg.tune.proxy_metric_batch = 0
844
- if hasattr(cfg.tune, "infer_epochs"):
845
- cfg.tune.infer_epochs = 0
514
+ cfg.tune.patience = 15
846
515
 
847
516
  elif preset == "balanced":
517
+ # Model
848
518
  cfg.model.latent_dim = 8
849
- cfg.model.num_hidden_layers = 2
850
- cfg.model.layer_scaling_factor = 3.0
519
+ cfg.model.num_hidden_layers = 4
851
520
  cfg.model.dropout_rate = 0.20
852
- cfg.model.gamma = 2.0
853
- # VAE specifics
854
- cfg.vae.kl_beta = 1.0
855
- cfg.vae.kl_warmup = 50
856
- cfg.vae.kl_ramp = 150
521
+
857
522
  # Train
858
- cfg.train.batch_size = 128
523
+ cfg.train.batch_size = 64
859
524
  cfg.train.learning_rate = 1e-3
860
- cfg.train.early_stop_gen = 15
861
- cfg.train.min_epochs = 50
862
- cfg.train.max_epochs = 600
863
- cfg.train.weights_beta = 0.9999
864
- cfg.train.weights_max_ratio = 5.0
525
+ cfg.train.early_stop_gen = 25
526
+ cfg.train.max_epochs = 500
527
+
865
528
  # Tune
866
- cfg.tune.enabled = True
867
- cfg.tune.fast = False
868
- cfg.tune.n_trials = 60
869
- cfg.tune.epochs = 200
870
- cfg.tune.batch_size = 128
871
- cfg.tune.max_samples = 2048
872
- cfg.tune.max_loci = 0
873
- cfg.tune.eval_interval = 10
874
- cfg.tune.patience = 10
875
- cfg.tune.proxy_metric_batch = 0
876
- if hasattr(cfg.tune, "infer_epochs"):
877
- cfg.tune.infer_epochs = 0
529
+ cfg.tune.patience = 25
878
530
 
879
531
  else: # thorough
532
+ # Model
880
533
  cfg.model.latent_dim = 16
881
- cfg.model.num_hidden_layers = 3
882
- cfg.model.layer_scaling_factor = 5.0
534
+ cfg.model.num_hidden_layers = 8
883
535
  cfg.model.dropout_rate = 0.30
884
- cfg.model.gamma = 2.5
885
- # VAE specifics
886
- cfg.vae.kl_beta = 1.0
887
- cfg.vae.kl_warmup = 100
888
- cfg.vae.kl_ramp = 400
536
+
889
537
  # Train
890
538
  cfg.train.batch_size = 64
891
539
  cfg.train.learning_rate = 5e-4
892
- cfg.train.early_stop_gen = 30
893
- cfg.train.min_epochs = 100
894
- cfg.train.max_epochs = 2000
895
- cfg.train.weights_beta = 0.9999
896
- cfg.train.weights_max_ratio = 5.0
540
+ cfg.train.early_stop_gen = 50
541
+ cfg.train.max_epochs = 1000
542
+
897
543
  # Tune
898
- cfg.tune.enabled = True
899
- cfg.tune.fast = False
900
- cfg.tune.n_trials = 100
901
- cfg.tune.epochs = 600
902
- cfg.tune.batch_size = 64
903
- cfg.tune.max_samples = 0
904
- cfg.tune.max_loci = 0
905
- cfg.tune.eval_interval = 10
906
- cfg.tune.patience = 20
907
- cfg.tune.proxy_metric_batch = 0
908
- if hasattr(cfg.tune, "infer_epochs"):
909
- cfg.tune.infer_epochs = 0
544
+ cfg.tune.patience = 50
910
545
 
911
546
  return cfg
912
547
 
@@ -935,9 +570,9 @@ class MostFrequentAlgoConfig:
935
570
  """Algorithmic knobs for ImputeMostFrequent.
936
571
 
937
572
  Attributes:
938
- by_populations (bool): Whether to compute per-population modes.
939
- default (int): Fallback mode if no valid entries in a locus.
940
- missing (int): Code for missing genotypes in 0/1/2.
573
+ by_populations (bool): Whether to compute per-population modes. Default is False.
574
+ default (int): Fallback mode if no valid entries in a locus. Default is 0.
575
+ missing (int): Code for missing genotypes in 0/1/2. Default is -1.
941
576
  """
942
577
 
943
578
  by_populations: bool = False
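Editor's note: the docstring above describes 0/1/2 genotype codes with -1 for missing, an optional per-population mode (grouping samples by population before taking the mode), and a fallback `default` when a locus has no observed calls. The snippet below is a minimal illustration of that idea using global modes only; it is not pg-sui's implementation.

```python
# Minimal illustration of per-locus mode imputation; not pg-sui's code.
import numpy as np


def impute_mode(X: np.ndarray, missing: int = -1, default: int = 0) -> np.ndarray:
    """Fill missing entries of a (samples x loci) 0/1/2 matrix with each locus's mode."""
    X = X.copy()
    for j in range(X.shape[1]):
        col = X[:, j]
        observed = col[col != missing]
        # Fall back to `default` when the whole locus is missing.
        mode = default if observed.size == 0 else int(np.bincount(observed).argmax())
        col[col == missing] = mode
    return X


X = np.array([[0, 2, -1],
              [0, -1, 1],
              [-1, 2, 1]])
print(impute_mode(X))  # [[0 2 1] [0 2 1] [0 2 1]]
```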
@@ -950,8 +585,8 @@ class DeterministicSplitConfig:
950
585
  """Evaluation split configuration shared by deterministic imputers.
951
586
 
952
587
  Attributes:
953
- test_size (float): Proportion of data to use as the test set.
954
- test_indices (Optional[Sequence[int]]): Specific indices to use as the test set.
588
+ test_size (float): Proportion of data to use as the test set. Default is 0.2.
589
+ test_indices (Optional[Sequence[int]]): Specific indices to use as the test set. Default is None.
955
590
  """
956
591
 
957
592
  test_size: float = 0.2
@@ -962,6 +597,10 @@ class DeterministicSplitConfig:
962
597
  class MostFrequentConfig:
963
598
  """Top-level configuration for ImputeMostFrequent.
964
599
 
600
+ Deterministic imputers primarily use ``io``, ``plot``, ``split``, ``algo``,
601
+ and ``sim``. The ``train`` and ``tune`` sections are retained for schema
602
+ parity with NN models but are not currently used by ImputeMostFrequent.
603
+
965
604
  Attributes:
966
605
  io (IOConfig): I/O configuration.
967
606
  plot (PlotConfig): Plotting configuration.
@@ -978,19 +617,27 @@ class MostFrequentConfig:
978
617
  algo: MostFrequentAlgoConfig = field(default_factory=MostFrequentAlgoConfig)
979
618
  sim: SimConfig = field(default_factory=SimConfig)
980
619
  tune: TuneConfig = field(default_factory=TuneConfig)
981
- train: TrainConfig = field(default_factory=TrainConfig)
620
+ train: TrainConfig = field(default_factory=_default_train_config)
982
621
 
983
622
  @classmethod
984
623
  def from_preset(
985
624
  cls,
986
625
  preset: Literal["fast", "balanced", "thorough"] = "balanced",
987
626
  ) -> "MostFrequentConfig":
988
- """Construct a preset configuration."""
627
+ """Construct a preset configuration.
628
+
629
+ Args:
630
+ preset (Literal["fast", "balanced", "thorough"]): Preset name.
631
+
632
+ Returns:
633
+ MostFrequentConfig: Configuration instance corresponding to the preset.
634
+ """
989
635
  if preset not in {"fast", "balanced", "thorough"}:
990
636
  raise ValueError(f"Unknown preset: {preset}")
991
637
 
992
638
  cfg = cls()
993
639
  cfg.io.verbose = False
640
+ cfg.io.ploidy = 2
994
641
  cfg.split.test_size = 0.2
995
642
  cfg.sim.simulate_missing = True
996
643
  cfg.sim.sim_strategy = "random"
@@ -1033,6 +680,10 @@ class RefAlleleAlgoConfig:
1033
680
  class RefAlleleConfig:
1034
681
  """Top-level configuration for ImputeRefAllele.
1035
682
 
683
+ Deterministic imputers primarily use ``io``, ``plot``, ``split``, ``algo``,
684
+ and ``sim``. The ``train`` and ``tune`` sections are retained for schema
685
+ parity with NN models but are not currently used by ImputeRefAllele.
686
+
1036
687
  Attributes:
1037
688
  io (IOConfig): I/O configuration.
1038
689
  plot (PlotConfig): Plotting configuration.
@@ -1049,18 +700,26 @@ class RefAlleleConfig:
1049
700
  algo: RefAlleleAlgoConfig = field(default_factory=RefAlleleAlgoConfig)
1050
701
  sim: SimConfig = field(default_factory=SimConfig)
1051
702
  tune: TuneConfig = field(default_factory=TuneConfig)
1052
- train: TrainConfig = field(default_factory=TrainConfig)
703
+ train: TrainConfig = field(default_factory=_default_train_config)
1053
704
 
1054
705
  @classmethod
1055
706
  def from_preset(
1056
707
  cls, preset: Literal["fast", "balanced", "thorough"] = "balanced"
1057
708
  ) -> "RefAlleleConfig":
1058
- """Presets mainly keep parity with logging/IO and split test_size."""
709
+ """Presets mainly keep parity with logging/IO and split test_size.
710
+
711
+ Args:
712
+ preset (Literal["fast", "balanced", "thorough"]): Preset name.
713
+
714
+ Returns:
715
+ RefAlleleConfig: Configuration instance corresponding to the preset.
716
+ """
1059
717
  if preset not in {"fast", "balanced", "thorough"}:
1060
718
  raise ValueError(f"Unknown preset: {preset}")
1061
719
 
1062
720
  cfg = cls()
1063
721
  cfg.io.verbose = False
722
+ cfg.io.ploidy = 2
1064
723
  cfg.split.test_size = 0.2
1065
724
  cfg.sim.simulate_missing = True
1066
725
  cfg.sim.sim_strategy = "random"
@@ -1273,7 +932,14 @@ class RFConfig:
1273
932
 
1274
933
  @classmethod
1275
934
  def from_preset(cls, preset: str = "balanced") -> "RFConfig":
1276
- """Build a config from a named preset."""
935
+ """Build a config from a named preset.
936
+
937
+ Args:
938
+ preset (str): Preset name.
939
+
940
+ Returns:
941
+ RFConfig: Configuration instance corresponding to the preset.
942
+ """
1277
943
  cfg = cls()
1278
944
  if preset == "fast":
1279
945
  cfg.model.n_estimators = 50
@@ -1365,7 +1031,14 @@ class HGBConfig:
1365
1031
 
1366
1032
  @classmethod
1367
1033
  def from_preset(cls, preset: str = "balanced") -> "HGBConfig":
1368
- """Build a config from a named preset."""
1034
+ """Build a config from a named preset.
1035
+
1036
+ Args:
1037
+ preset (str): Preset name.
1038
+
1039
+ Returns:
1040
+ HGBConfig: Configuration instance corresponding to the preset.
1041
+ """
1369
1042
  cfg = cls()
1370
1043
  if preset == "fast":
1371
1044
  cfg.model.n_estimators = 50