PyPI - tlmtc - Versions diffs - 0.1.0__py3-none-any.whl - Mend

tlmtc 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

tlmtc/__init__.py +54 -0
tlmtc/__main__.py +6 -0
tlmtc/api.py +455 -0
tlmtc/cli.py +345 -0
tlmtc/data_contracts.py +160 -0
tlmtc/data_pipeline.py +257 -0
tlmtc/data_preparation.py +221 -0
tlmtc/evaluation.py +291 -0
tlmtc/evaluation_pipeline.py +309 -0
tlmtc/finetune_pipeline.py +355 -0
tlmtc/hpo.py +157 -0
tlmtc/meta.py +86 -0
tlmtc/paths.py +371 -0
tlmtc/prediction.py +154 -0
tlmtc/reporting.py +605 -0
tlmtc/runtime_output.py +100 -0
tlmtc/settings.py +456 -0
tlmtc/training.py +339 -0
tlmtc-0.1.0.dist-info/METADATA +223 -0
tlmtc-0.1.0.dist-info/RECORD +23 -0
tlmtc-0.1.0.dist-info/WHEEL +4 -0
tlmtc-0.1.0.dist-info/entry_points.txt +2 -0
tlmtc-0.1.0.dist-info/licenses/LICENSE.md +21 -0

tlmtc/__init__.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""Public package interface and lazy-loaded API exports."""
+import importlib
+import logging
+from typing import TYPE_CHECKING, Any
+# Package version string; it is also listed in __all__ below.
+__version__ = "0.1.0"
+# Names advertised as the public interface of the package.
+__all__ = [
+    "predict_tlmtc",
+    "train_tlmtc",
+    "__version__",
+]
+# Attach a no-op handler to the "tlmtc" logger at import time.
+logging.getLogger("tlmtc").addHandler(logging.NullHandler())
+# Maps each lazily exported name to the (module path, attribute name) pair
+# that the module-level __getattr__ imports on first access.
+_LAZY: dict[str, tuple[str, str]] = {
+    "predict_tlmtc": ("tlmtc.api", "predict_tlmtc"),
+    "train_tlmtc": ("tlmtc.api", "train_tlmtc"),
+}
+def __getattr__(
+    name: str,
+) -> Any:
+    try:
+        module_path, attr = _LAZY[name]
+    except KeyError as exc:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from exc
+    try:
+        value = getattr(importlib.import_module(module_path), attr)
+    except ModuleNotFoundError as exc:
+        missing = getattr(exc, "name", None)
+        if missing in {"torch", "peft", "accelerate"}:
+            raise ImportError(
+                f"`torch`, `peft`, and `accelerate` are required for `tlmtc.{name}`. "
+                "Install them with: `pip install 'tlmtc[full]'`."
+            ) from exc
+        raise
+    globals()[name] = value
+    return value
+def __dir__() -> list[str]:
+    return sorted(__all__)
+if TYPE_CHECKING:
+    from tlmtc.api import predict_tlmtc as predict_tlmtc
+    from tlmtc.api import train_tlmtc as train_tlmtc

tlmtc/__main__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Module execution entrypoint for the tlmtc CLI."""
+from tlmtc.cli import app
+# Start the CLI app only when this module is executed as the main program,
+# not when it is merely imported.
+if __name__ == "__main__":
+    app()

tlmtc/api.py ADDED Viewed

@@ -0,0 +1,455 @@
+"""Public Python API for running tlmtc training and prediction workflows."""
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+from tlmtc.data_pipeline import DataPipeline
+from tlmtc.data_preparation import create_prediction_dataset, read_prediction_csv, tokenize_prediction_dataset
+from tlmtc.evaluation_pipeline import EvaluationPipeline
+from tlmtc.finetune_pipeline import FinetunePipeline
+from tlmtc.meta import TrainRunMeta, read_run_meta, write_run_meta
+from tlmtc.paths import PredictionPaths, RunPaths, resolve_paths, resolve_prediction_paths
+from tlmtc.prediction import (
+    apply_thresholds,
+    load_prediction_model,
+    make_prediction_frame,
+    predict_probabilities,
+)
+from tlmtc.runtime_output import configure_runtime_output, emit_progress
+from tlmtc.settings import UNSET, PredictionSettings, RunSettings, Unset, load_config_file
+@dataclass(frozen=True, slots=True)
+class TrainResult:
+    """Result metadata for a completed tlmtc training run.
+    Attributes:
+        paths: Resolved filesystem layout containing input paths and generated run artifacts.
+    """
+    paths: RunPaths
+@dataclass(frozen=True, slots=True)
+class PredictResult:
+    """Result metadata for a completed tlmtc prediction run.
+    Attributes:
+        paths: Resolved filesystem layout containing prediction inputs and generated artifacts.
+    """
+    paths: PredictionPaths
+def train_tlmtc(
+    raw_csv: str | Path,
+    *,
+    raw_test_csv: str | Path | Unset = UNSET,
+    work_dir: str | Path | Unset = UNSET,
+    config_path: str | Path | Unset = UNSET,
+    run_id: str | None | Unset = UNSET,
+    target_name: str | Unset = UNSET,
+    validation_size: float | Unset = UNSET,
+    test_size: float | Unset = UNSET,
+    random_seed: int | Unset = UNSET,
+    transfer_learning: bool | Unset = UNSET,
+    hyperparameter_tuning: bool | Unset = UNSET,
+    threshold_optimization: bool | Unset = UNSET,
+    threshold_type: str | Unset = UNSET,
+    scale_learning_rate: bool | Unset = UNSET,
+    wrap_peft: bool | Unset = UNSET,
+    proxy_checkpoint: str | Unset = UNSET,
+    checkpoint: str | Unset = UNSET,
+    sequence_length: int | Unset = UNSET,
+    best_model_metric: str | Unset = UNSET,
+    batch_size: int | Unset = UNSET,
+    train_epochs: int | Unset = UNSET,
+    learning_rate: float | Unset = UNSET,
+    weight_decay: float | Unset = UNSET,
+    lr_scheduler: str | Unset = UNSET,
+    best_threshold_metric: str | Unset = UNSET,
+    tuning_trials: int | Unset = UNSET,
+    optuna_space: dict[str, Any] | Unset = UNSET,
+    lora_r: int | Unset = UNSET,
+    lora_alpha: int | Unset = UNSET,
+    lora_dropout: float | Unset = UNSET,
+    lora_bias: str | Unset = UNSET,
+    early_stopping_patience: int | Unset = UNSET,
+    use_cpu: bool | Unset = UNSET,
+    verbosity: str | Unset = UNSET,
+) -> TrainResult:
+    """Run the full multi-label text classification training workflow.
+
+    The workflow can perform data preparation, hyperparameter tuning, model fine-tuning,
+    threshold optimization, evaluation, and reporting end-to-end according to the selected
+    workflow flags.
+
+    Args:
+        raw_csv: Path to the raw multi-label training CSV. The file must contain a `text` column,
+            at least two binary `label_*` columns, and optionally a `text_pair` column.
+        raw_test_csv: Path to a separate raw test CSV. If omitted, a test split is created
+            from `raw_csv` using `test_size`. Defaults to no separate test CSV.
+        work_dir: Base directory for resolving inputs and writing run artifacts. Defaults to the
+            current working directory.
+        config_path: Path to a YAML configuration file. Defaults to no configuration file.
+        run_id: Run identifier used to name the run directory. If omitted, a random
+            identifier is generated.
+        target_name: Display name for the classification target in logs and reports. Defaults to
+            `"Target"`.
+        validation_size: Fraction reserved for validation splitting. Defaults to `0.15`.
+        test_size: Fraction reserved for test splitting when `raw_test_csv` is omitted. Defaults to
+            `0.15`.
+        random_seed: Random seed used for reproducible splitting and shuffling. Defaults to `2469`.
+        transfer_learning: Whether to fine-tune the target checkpoint and produce model/evaluation
+            artifacts. If `False`, data preparation still runs; with `hyperparameter_tuning=True`,
+            tlmtc runs proxy-checkpoint hyperparameter tuning only. Defaults to `True`.
+        hyperparameter_tuning: Whether to evaluate candidate hyperparameter configurations with
+            Optuna before final fine-tuning. If `True` and `transfer_learning=False`, only the
+            proxy-checkpoint tuning stage is run after data preparation. If both are `False`,
+            the workflow stops after data preparation. Defaults to `True`.
+        threshold_optimization: Whether to tune decision thresholds on validation-set predictions
+            after fine-tuning. If `False`, evaluation uses the default threshold `0.5`. Ignored
+            when `transfer_learning=False`. Defaults to `True`.
+        threshold_type: Thresholding mode. Supported values are `"global"` and `"label"`. Defaults to
+            `"label"`.
+        scale_learning_rate: Whether to scale a proxy-tuned learning rate for the target checkpoint.
+            Defaults to `False`.
+        wrap_peft: Whether to use parameter-efficient fine-tuning with LoRA adapters. Defaults to `True`.
+        proxy_checkpoint: Compatible encoder-only Hugging Face checkpoint identifier used during
+            hyperparameter tuning. Defaults to `"EuroBERT/EuroBERT-210m"`. If `checkpoint`
+            is supplied and `proxy_checkpoint` is omitted, the proxy checkpoint defaults to the
+            selected `checkpoint`. Loaded with `trust_remote_code=False`; checkpoints that require
+            custom remote code are not supported. Only use checkpoints you trust.
+        checkpoint: Compatible encoder-only Hugging Face checkpoint identifier or local path used for
+            final fine-tuning. Defaults to `"EuroBERT/EuroBERT-610m"`. Loaded with `trust_remote_code=False`;
+            checkpoints that require custom remote code are not supported. Only use checkpoints and local model
+            directories you trust.
+        sequence_length: Maximum tokenized sequence length. Defaults to `128`.
+        best_model_metric: Metric used to select the best model checkpoint. Supported values are
+            `"f1_micro"`, `"f1_macro"`, `"roc_auc_micro"`, and `"roc_auc_macro"`. Defaults to
+            `"roc_auc_macro"`.
+        batch_size: Initial training and evaluation batch size. Used directly when hyperparameter tuning is
+            disabled, otherwise replaced by the tuned value. Defaults to `16`.
+        train_epochs: Initial number of training epochs. Used directly when hyperparameter tuning is
+            disabled, otherwise replaced by the tuned value. Defaults to `20`.
+        learning_rate: Initial optimizer learning rate. Used directly when hyperparameter tuning is
+            disabled, otherwise replaced by the tuned value. Defaults to `2e-5`.
+        weight_decay: Initial weight decay for training. Used directly when hyperparameter tuning is
+            disabled, otherwise replaced by the tuned value. Defaults to `0.01`.
+        lr_scheduler: Initial learning-rate scheduler name. Used directly when hyperparameter tuning is
+            disabled, otherwise replaced by the tuned value. Defaults to `"linear"`.
+        best_threshold_metric: Metric used to select decision thresholds. Supported values are
+            `"f1_micro"` and `"f1_macro"`. Defaults to `"f1_macro"`.
+        tuning_trials: Number of hyperparameter configurations to evaluate during Optuna tuning. Higher
+            values may improve the selected configuration but increase runtime. Defaults to `10`.
+        optuna_space: Optional partial override for the hyperparameter tuning ranges and candidate
+            values. Supported keys are `lr_low`, `lr_high`, `batch_sizes`, `wd_low`, `wd_high`,
+            `schedulers`, `epoch_low`, `epoch_high`. Missing keys are filled from the default tuning space
+            selected by `wrap_peft`.
+
+            Defaults to the PEFT search space when `wrap_peft=True`:
+
+            {
+                "lr_low": 5e-5,
+                "lr_high": 4e-4,
+                "batch_sizes": [8, 16, 32],
+                "wd_low": 0.0,
+                "wd_high": 0.01,
+                "schedulers": ["linear", "cosine"],
+                "epoch_low": 5,
+                "epoch_high": 20,
+                "lr_reference_batch_size": 32,
+            }
+
+            Defaults to the full fine-tuning search space when `wrap_peft=False`:
+
+            {
+                "lr_low": 1e-5,
+                "lr_high": 8e-5,
+                "batch_sizes": [8, 16, 32],
+                "wd_low": 0.0,
+                "wd_high": 0.1,
+                "schedulers": ["linear", "cosine", "polynomial"],
+                "epoch_low": 5,
+                "epoch_high": 30,
+                "lr_reference_batch_size": 32,
+            }
+        lora_r: LoRA rank. Defaults to `8`.
+        lora_alpha: LoRA scaling factor. Defaults to `32`.
+        lora_dropout: LoRA dropout probability. Defaults to `0.1`.
+        lora_bias: LoRA bias handling mode. Supported values are `"none"`, `"all"`, and `"lora_only"`.
+            Defaults to `"none"`.
+        early_stopping_patience: Early stopping patience in epochs without improvement. Defaults to
+            `10`.
+        use_cpu: Whether to force CPU execution. Defaults to `False`.
+        verbosity: Runtime output mode. Supported values are `"progress"` and `"quiet"`. Defaults to
+            `"progress"`.
+
+    Returns:
+        Result metadata containing the resolved input and artifact paths.
+    """
+    # Build the run settings from the optional config file plus the keyword
+    # arguments, grouped into the nested sections RunSettings.resolve receives.
+    # The config file is only loaded when config_path is an actual str/Path;
+    # any other value (such as the UNSET default) results in config=None.
+    settings = RunSettings.resolve(
+        config=load_config_file(config_path) if isinstance(config_path, (str, Path)) else None,
+        env=None,
+        overrides={
+            "raw_csv": raw_csv,
+            "raw_test_csv": raw_test_csv,
+            "work_dir": work_dir,
+            "run_id": run_id,
+            "model": {
+                "target_name": target_name,
+                "proxy_checkpoint": proxy_checkpoint,
+                "checkpoint": checkpoint,
+                "sequence_length": sequence_length,
+            },
+            "split": {
+                "validation_size": validation_size,
+                "test_size": test_size,
+                "random_seed": random_seed,
+            },
+            "workflow": {
+                "hyperparameter_tuning": hyperparameter_tuning,
+                "threshold_optimization": threshold_optimization,
+                "transfer_learning": transfer_learning,
+                "scale_learning_rate": scale_learning_rate,
+                "wrap_peft": wrap_peft,
+            },
+            "training": {
+                "batch_size": batch_size,
+                "train_epochs": train_epochs,
+                "weight_decay": weight_decay,
+                "learning_rate": learning_rate,
+                "lr_scheduler": lr_scheduler,
+                "best_model_metric": best_model_metric,
+                "early_stopping_patience": early_stopping_patience,
+            },
+            "threshold": {
+                "threshold_type": threshold_type,
+                "best_threshold_metric": best_threshold_metric,
+            },
+            "hpo": {
+                "tuning_trials": tuning_trials,
+                "optuna_space": optuna_space,
+            },
+            "peft": {
+                "lora_r": lora_r,
+                "lora_alpha": lora_alpha,
+                "lora_dropout": lora_dropout,
+                "lora_bias": lora_bias,
+            },
+            "hardware": {
+                "use_cpu": use_cpu,
+            },
+            "runtime": {
+                "verbosity": verbosity,
+            },
+        },
+    )
+    # Configure runtime output from the resolved verbosity before the first
+    # progress message is emitted.
+    configure_runtime_output(settings.runtime.verbosity)
+    emit_progress("Starting training run")
+    # Resolve the run's filesystem layout from the resolved settings.
+    # NOTE(review): ensure_dirs() presumably creates the run directories and
+    # returns the paths object itself - confirm in tlmtc.paths.
+    paths = resolve_paths(
+        raw_csv=settings.raw_csv,
+        raw_test_csv=settings.raw_test_csv,
+        work_dir=settings.work_dir,
+        run_id=settings.run_id,
+    ).ensure_dirs()
+    # Stage 1: data preparation. The four steps are invoked in this fixed order.
+    data_pipeline = DataPipeline(
+        paths=paths,
+        split=settings.split,
+        model=settings.model,
+    )
+    data_pipeline.split_data()
+    data_pipeline.get_multi_hot_vectors()
+    data_pipeline.create_hf_dataset()
+    data_pipeline.tokenize_data()
+    # Stage 2: hyperparameter tuning, fine-tuning, threshold tuning, model saving.
+    # NOTE(review): every stage method below is called unconditionally; the
+    # workflow flags described in the docstring are presumably honored inside
+    # the pipeline methods themselves - confirm in tlmtc.finetune_pipeline.
+    finetune_pipeline = FinetunePipeline(
+        tokenized_dataset=data_pipeline.tokenized_dataset,
+        paths=paths,
+        model=settings.model,
+        workflow=settings.workflow,
+        peft=settings.peft,
+        training=settings.training,
+        hpo=settings.hpo,
+        threshold=settings.threshold,
+        hardware=settings.hardware,
+    )
+    finetune_pipeline.load_pretrained()
+    finetune_pipeline.tune_hyperparameters()
+    finetune_pipeline.fine_tune_pretrained()
+    finetune_pipeline.tune_thresholds()
+    finetune_pipeline.save_pretrained()
+    # Stage 3: evaluation and reporting, fed with the trainer and thresholds
+    # exposed by the fine-tuning pipeline.
+    evaluation_pipeline = EvaluationPipeline(
+        tokenized_dataset=data_pipeline.tokenized_dataset,
+        updated_trainer=finetune_pipeline.updated_trainer,
+        paths=paths,
+        model=settings.model,
+        workflow=settings.workflow,
+        training=settings.training,
+        tuned_threshold=finetune_pipeline.tuned_threshold,
+        input_mode=data_pipeline.input_mode,
+    )
+    evaluation_pipeline.run_evaluation()
+    evaluation_pipeline.save_metrics()
+    evaluation_pipeline.render_tables()
+    evaluation_pipeline.render_figures()
+    # Persist the run metadata at paths.train_run_meta_path.
+    # NOTE(review): tuned_threshold.tolist() is called unconditionally, so
+    # tuned_threshold must be an object with .tolist() on every workflow
+    # path - confirm for runs with transfer_learning=False.
+    write_run_meta(
+        meta=TrainRunMeta(
+            run_id=settings.run_id,
+            target_name=settings.model.target_name,
+            checkpoint=settings.model.checkpoint,
+            proxy_checkpoint=settings.model.proxy_checkpoint,
+            sequence_length=settings.model.sequence_length,
+            input_mode=data_pipeline.input_mode,
+            label_names=evaluation_pipeline.label_names,
+            threshold_type=settings.threshold.threshold_type,
+            thresholds=finetune_pipeline.tuned_threshold.tolist(),
+            transfer_learning=settings.workflow.transfer_learning,
+            hyperparameter_tuning=settings.workflow.hyperparameter_tuning,
+            threshold_optimization=settings.workflow.threshold_optimization,
+            scale_learning_rate=settings.workflow.scale_learning_rate,
+            wrap_peft=settings.workflow.wrap_peft,
+        ),
+        path=paths.train_run_meta_path,
+    )
+    emit_progress("Training run complete")
+    return TrainResult(paths=paths)
+def predict_tlmtc(
+    prediction_csv: str | Path,
+    *,
+    work_dir: str | Path | Unset = UNSET,
+    config_path: str | Path | Unset = UNSET,
+    run_id: str | None | Unset = UNSET,
+    batch_size: int | Unset = UNSET,
+    use_cpu: bool | Unset = UNSET,
+    verbosity: str | Unset = UNSET,
+) -> PredictResult:
+    """Run the multi-label text classification prediction workflow.
+    Prediction consumes persisted metadata and model artifacts from a completed
+    training run, applies the persisted decision thresholds, and writes probability
+    and binary prediction artifacts.
+    Args:
+        prediction_csv: Path to the unlabeled prediction CSV. The file must contain a `text`
+            column and, for models trained with paired-text inputs, a `text_pair` column.
+            Prediction artifacts preserve input text columns unchanged.
+        work_dir: Base directory for resolving inputs, reading training artifacts, and writing
+            prediction artifacts. Defaults to the current working directory.
+        config_path: Path to a YAML configuration file. Defaults to no configuration file.
+        run_id: Run identifier used to select the completed training run. If omitted, the latest
+            completed training run is selected from persisted training metadata. Prediction reloads
+            the trained model or adapter artifacts for this run with `trust_remote_code=False`;
+            artifacts that require custom remote code are not supported. Only use saved model
+            artifacts and adapters you trust.
+        batch_size: Prediction batch size used for batched inference. Defaults to `32`.
+        use_cpu: Whether to force CPU execution. Defaults to `False`.
+        verbosity: Runtime output mode. Supported values are `"progress"` and `"quiet"`. Defaults to
+            `"progress"`.
+    Returns:
+        Result metadata containing the resolved input and artifact paths.
+    """
+    settings = PredictionSettings.resolve(
+        config=load_config_file(config_path) if isinstance(config_path, (str, Path)) else None,
+        env=None,
+        overrides={
+            "prediction_csv": prediction_csv,
+            "work_dir": work_dir,
+            "run_id": run_id,
+            "batch_size": batch_size,
+            "hardware": {
+                "use_cpu": use_cpu,
+            },
+            "runtime": {
+                "verbosity": verbosity,
+            },
+        },
+    )
+    configure_runtime_output(settings.runtime.verbosity)
+    emit_progress("Starting prediction run")
+    paths = resolve_prediction_paths(
+        input_csv=settings.prediction_csv,
+        work_dir=settings.work_dir,
+        run_id=settings.run_id,
+    ).ensure_dirs()
+    emit_progress("Reading training metadata")
+    meta = read_run_meta(paths.train_run_meta_path)
+    if not meta.transfer_learning:
+        raise RuntimeError(
+            "Prediction requires a training run with transfer_learning=True. "
+            f"Run '{meta.run_id}' did not persist a fine-tuned prediction model."
+        )
+    input_mode = meta.input_mode
+    label_names = meta.label_names
+    assert input_mode is not None
+    assert label_names is not None
+    emit_progress("Reading prediction inputs")
+    input_df = read_prediction_csv(
+        df_path=paths.input_data_path,
+        expected_input_mode=input_mode,
+    )
+    prediction_dataset = create_prediction_dataset(
+        df=input_df,
+        input_mode=input_mode,
+    )
+    emit_progress("Tokenizing prediction inputs")
+    tokenized_dataset = tokenize_prediction_dataset(
+        dataset=prediction_dataset,
+        checkpoint=meta.checkpoint,
+        input_mode=input_mode,
+        sequence_length=meta.sequence_length,
+    )
+    emit_progress("Loading fine-tuned prediction model")
+    model = load_prediction_model(
+        model_dir=paths.train_run_model_dir,
+        checkpoint=meta.checkpoint,
+        num_labels=len(label_names),
+        wrap_peft=meta.wrap_peft,
+    )
+    emit_progress("Running prediction")
+    probabilities = predict_probabilities(
+        model=model,
+        dataset=tokenized_dataset,
+        batch_size=settings.batch_size,
+        use_cpu=settings.hardware.use_cpu,
+    )
+    probability_df = make_prediction_frame(
+        input_df=input_df,
+        values=probabilities,
+        label_names=label_names,
+    )
+    predictions = apply_thresholds(
+        probabilities=probabilities,
+        thresholds=meta.thresholds,
+    )
+    prediction_df = make_prediction_frame(
+        input_df=input_df,
+        values=predictions,
+        label_names=label_names,
+    )
+    emit_progress("Writing prediction artifacts")
+    probability_df.to_csv(paths.probabilities_path, index=False)
+    prediction_df.to_csv(paths.predictions_path, index=False)
+    emit_progress("Prediction run complete")
+    return PredictResult(paths=paths)