PyPI - mudra-ml - Versions diffs - 0.1.0__py3-none-any.whl - Mend

mudra-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

mudra_ml/__init__.py +34 -0
mudra_ml/cli.py +112 -0
mudra_ml/constants.py +41 -0
mudra_ml/core.py +320 -0
mudra_ml/decisions.py +89 -0
mudra_ml/evaluate.py +406 -0
mudra_ml/goal.py +220 -0
mudra_ml/ingest.py +146 -0
mudra_ml/preprocess.py +401 -0
mudra_ml/profile.py +278 -0
mudra_ml/recommend.py +310 -0
mudra_ml/report.py +213 -0
mudra_ml-0.1.0.dist-info/METADATA +213 -0
mudra_ml-0.1.0.dist-info/RECORD +17 -0
mudra_ml-0.1.0.dist-info/WHEEL +4 -0
mudra_ml-0.1.0.dist-info/entry_points.txt +2 -0
mudra_ml-0.1.0.dist-info/licenses/LICENSE +21 -0

mudra_ml/__init__.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""MudraML: glass-box autonomous data science.
+The decision engine that drives the pipeline is rule-based and statistical.
+It is deterministic, logged, and explainable. The machine learning models are
+the output it produces, not the mechanism by which it chooses what to do.
+"""
+from __future__ import annotations
+from .core import Mudra, RunResult
+from .evaluate import evaluate
+from .goal import Goal, infer_goal
+from .ingest import load
+from .preprocess import build_pipeline
+from .profile import DataProfile, DataProfiler
+from .recommend import recommend_models
+from .report import write_report
+# Package version string; it matches the 0.1.0 in the dist-info directory name.
+__version__ = "0.1.0"
+# Public API of the package: the names imported from the submodules at the top
+# of this module, plus __version__. This list is what a star-import exposes.
+__all__ = [
+    "Mudra",
+    "RunResult",
+    "Goal",
+    "infer_goal",
+    "load",
+    "DataProfiler",
+    "DataProfile",
+    "build_pipeline",
+    "recommend_models",
+    "evaluate",
+    "write_report",
+    "__version__",
+]

mudra_ml/cli.py ADDED Viewed

@@ -0,0 +1,112 @@
+"""Command line interface for MudraML."""
+from __future__ import annotations
+import json
+from pathlib import Path
+import typer
+from .core import Mudra
+from .ingest import load
+from .profile import DataProfiler
+# Typer application object. The `run` and `profile` functions in this module
+# register themselves on it through the `@app.command()` decorator, and
+# `main()` invokes it.
+app = typer.Typer(
+    add_completion=False,
+    help="Glass-box autonomous data science from the command line.",
+    no_args_is_help=True,
+)
+@app.command()
+def run(
+    data: str = typer.Argument(..., help="Path to the data file."),
+    target: str | None = typer.Option(None, help="Target column to predict."),
+    task: str | None = typer.Option(
+        None, help="classification, regression, or clustering."
+    ),
+    metric: str | None = typer.Option(None, help="Metric to optimize."),
+    interpretable: bool = typer.Option(
+        False, help="Restrict the shortlist to interpretable models."
+    ),
+    max_train_seconds: int | None = typer.Option(
+        None, help="Soft time budget that caps model complexity."
+    ),
+    output: str = typer.Option("mudra_ml_report", help="Report path without suffix."),
+    save: str | None = typer.Option(None, help="Save the artifact to this path."),
+    no_html: bool = typer.Option(False, help="Skip the HTML report."),
+) -> None:
+    """Run the full pipeline and write a report."""
+    # The docstring above is shown by Typer as the command help, so
+    # implementation notes are kept in comments instead.
+    # Only constraints the operator actually set are forwarded; when none are
+    # set, None is passed rather than an empty dict.
+    constraints: dict[str, object] = {}
+    if interpretable:
+        constraints["interpretable"] = True
+    if max_train_seconds is not None:
+        constraints["max_train_seconds"] = max_train_seconds
+    mudra = Mudra(verbose=True)
+    result = mudra.run(
+        data,
+        target=target,
+        task=task,
+        metric=metric,
+        constraints=constraints or None,
+        report_path=output,
+        html=not no_html,
+    )
+    typer.echo("")
+    typer.echo(f"Task: {result.task}")
+    best_name = result.evaluation["best_name"]
+    typer.echo(f"Selected model: {best_name}")
+    # The metric printout is best-effort. A bare next() raises StopIteration
+    # when no candidate matches, and a missing "test_metrics" entry raises
+    # KeyError; either would abort the command after training has finished,
+    # before the report path is printed and before --save is honored.
+    best = next(
+        (c for c in result.evaluation["candidates"] if c["name"] == best_name),
+        None,
+    )
+    test_metrics = (best or {}).get("test_metrics") or {}
+    for name, value in test_metrics.items():
+        if name == "confusion_matrix":
+            # A matrix does not fit the one-line "name: value" layout.
+            continue
+        typer.echo(f"  {name}: {value:.4f}" if isinstance(value, float) else f"  {name}: {value}")
+    typer.echo(f"Report: {result.report_path}")
+    if save:
+        saved = result.save(save)
+        typer.echo(f"Artifact: {saved}")
+@app.command()
+def profile(
+    data: str = typer.Argument(..., help="Path to the data file."),
+    as_json: bool = typer.Option(False, "--json", help="Print the profile as JSON."),
+) -> None:
+    """Profile a dataset and print column types and statistics."""
+    # The docstring above is shown by Typer as the command help, so
+    # implementation notes are kept in comments instead.
+    table = load(data)
+    summary = DataProfiler().profile(table)
+    if as_json:
+        # default=str stringifies any value json cannot serialize natively.
+        typer.echo(json.dumps(summary.as_dict(), indent=2, default=str))
+        return
+    typer.echo(f"Dataset: {Path(data).name}")
+    typer.echo(
+        f"Rows: {summary.n_rows}  Columns: {summary.n_columns}  "
+        f"Duplicates: {summary.duplicate_rows}"
+    )
+    typer.echo("")
+    # Fixed-width table: the rule under the heading is as wide as the heading.
+    heading = f"{'column':<24}{'type':<14}{'missing':<10}{'unique':<10}"
+    typer.echo(heading)
+    typer.echo("-" * len(heading))
+    for column in summary.columns.values():
+        # Names are cut to 23 characters so the 24-wide cell keeps a gap.
+        name_cell = f"{column.name[:23]:<24}"
+        type_cell = f"{column.inferred_type:<14}"
+        missing_cell = f"{column.missing_fraction:<10.2%}"
+        unique_cell = f"{column.n_unique:<10}"
+        typer.echo(name_cell + type_cell + missing_cell + unique_cell)
+    typer.echo("")
+    targets = summary.candidate_targets
+    if not targets:
+        return
+    # Only the first three candidates are listed.
+    typer.echo(f"Candidate targets: {', '.join(targets[:3])}")
+def main() -> None:
+    """Invoke the Typer application.
+    NOTE(review): presumably the console-script target declared in
+    entry_points.txt; that file's content is not shown here, so confirm.
+    """
+    app()
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    main()

mudra_ml/constants.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Shared defaults and thresholds.
+These values drive the rule-based engine. They are named here so the rules that
+use them stay readable and so a single edit changes behavior everywhere.
+"""
+from __future__ import annotations
+# Default seed; core.py uses it as the default `random_state` of Mudra.
+DEFAULT_RANDOM_STATE = 42
+# Profiling thresholds.
+# NOTE(review): the rules that consume the thresholds below live in other
+# modules; their exact comparisons (strict or inclusive) are not visible from
+# this file, so check the consuming rule before changing a value.
+ID_UNIQUE_RATIO = 0.95
+CATEGORICAL_MAX_UNIQUE = 20
+CATEGORICAL_MAX_RATIO = 0.5
+TEXT_MIN_AVG_LENGTH = 25
+TEXT_MIN_WORD_COUNT = 3
+HIGH_CARDINALITY_THRESHOLD = 30
+# Cleaning thresholds.
+DEFAULT_MISSING_DROP_THRESHOLD = 0.6
+IQR_MULTIPLIER = 1.5
+ZSCORE_THRESHOLD = 3.0
+# Goal inference thresholds.
+CLASSIFICATION_MAX_CLASSES = 20
+REGRESSION_MIN_UNIQUE = 20
+# Training.
+DEFAULT_CV_FOLDS = 5
+DEFAULT_SEARCH_ITER = 10
+SMALL_DATASET_ROWS = 2000
+LARGE_DATASET_ROWS = 50000
+# Default metrics per task. The keys are exactly the entries of VALID_TASKS.
+DEFAULT_METRICS = {
+    "classification": "f1",
+    "regression": "rmse",
+    "clustering": "silhouette",
+}
+# Task names accepted by the engine; keep in step with DEFAULT_METRICS.
+VALID_TASKS = ("classification", "regression", "clustering")

mudra_ml/core.py ADDED Viewed

@@ -0,0 +1,320 @@
+"""The Mudra orchestrator and the RunResult artifact."""
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from .constants import DEFAULT_RANDOM_STATE
+from .decisions import DecisionLog, configure_logging
+from .evaluate import evaluate
+from .goal import Goal, infer_goal
+from .ingest import load
+from .preprocess import build_pipeline
+from .profile import DataProfile, DataProfiler
+from .recommend import recommend_models
+from .report import build_context, write_report
+_ARTIFACT_VERSION = 1
+@dataclass
+class RunResult:
+    """Fitted artifacts and metadata produced by one run.
+    The preprocessing pipeline is stored next to the model instead of being
+    fused into it, so predict() can push new rows through the same fitted
+    transformation before the model sees them.
+    """
+    best_model: Any
+    pipeline: Any
+    goal: Goal
+    task: str
+    metric: str
+    report_path: Path
+    evaluation: dict[str, Any]
+    profile: dict[str, Any]
+    feature_names: list[str]
+    def predict(self, data: pd.DataFrame) -> np.ndarray:
+        """Predict for new rows after applying the fitted pipeline.
+        Args:
+            data: New rows with the same feature columns as training.
+        Returns:
+            Whatever the model's predict() yields for the transformed rows
+            (labels, values, or cluster ids).
+        """
+        prepared = self.pipeline.transform(data)
+        return self.best_model.predict(prepared)
+    def save(self, path: str | Path) -> Path:
+        """Write the pipeline, model, and metadata into a single joblib file.
+        Args:
+            path: Destination path. Unless it already ends in .joblib, its
+                suffix is set with Path.with_suffix, which replaces an
+                existing different suffix instead of appending to it.
+        Returns:
+            The path actually written.
+        """
+        requested = Path(path)
+        destination = (
+            requested if requested.suffix == ".joblib" else requested.with_suffix(".joblib")
+        )
+        # The goal is stored in its dict form, not as a Goal instance.
+        payload = dict(
+            version=_ARTIFACT_VERSION,
+            best_model=self.best_model,
+            pipeline=self.pipeline,
+            goal=self.goal.as_dict(),
+            task=self.task,
+            metric=self.metric,
+            evaluation=self.evaluation,
+            profile=self.profile,
+            feature_names=self.feature_names,
+        )
+        joblib.dump(payload, destination)
+        return destination
+class Mudra:
+    """Run the full data science workflow and explain every decision.
+    Every call to run() starts a fresh DecisionLog, so `log` always describes
+    the most recent run.
+    Example:
+        >>> m = Mudra()
+        >>> result = m.run("data.csv")
+        >>> preds = result.predict(new_frame)
+    """
+    def __init__(
+        self,
+        random_state: int = DEFAULT_RANDOM_STATE,
+        verbose: bool = False,
+        test_size: float = 0.2,
+    ) -> None:
+        """Store the run settings.
+        Args:
+            random_state: Seed forwarded to the train/test split, the model
+                recommender, and evaluation.
+            verbose: When True, configure_logging() is called so recorded
+                decisions are emitted through the package logger.
+            test_size: Forwarded as test_size to train_test_split in
+                supervised runs.
+        """
+        self.random_state = random_state
+        self.test_size = test_size
+        self.log = DecisionLog()
+        self._loaded_payload: dict[str, Any] | None = None
+        if verbose:
+            configure_logging()
+    def run(
+        self,
+        data: str | Path | pd.DataFrame,
+        target: str | None = None,
+        task: str | None = None,
+        metric: str | None = None,
+        constraints: dict[str, Any] | None = None,
+        report_path: str | Path = "mudra_ml_report",
+        html: bool = True,
+        use_boost: bool = True,
+    ) -> RunResult:
+        """Ingest, profile, plan, train, evaluate, and report.
+        Args:
+            data: Path to a data file or an in-memory DataFrame.
+            target: Target column, or None to infer.
+            task: classification, regression, clustering, or None to infer.
+            metric: Metric to optimize, or None for the task default.
+            constraints: Optional constraints, for example
+                {"interpretable": True, "max_train_seconds": 120}.
+            report_path: Where to write the report (without suffix).
+            html: Whether to also write an HTML report.
+            use_boost: Whether to include xgboost and lightgbm if installed.
+                Applies to supervised runs; boosted models are also left out
+                when the "interpretable" constraint is True.
+        Returns:
+            A RunResult with the fitted model and the report path.
+        """
+        frame, dataset_name = self._as_frame(data)
+        self.log = DecisionLog()
+        profiler = DataProfiler(self.log)
+        profile = profiler.profile(frame)
+        operator_goal = Goal(
+            target=target,
+            task=task,
+            metric=metric,
+            constraints=constraints or {},
+            random_state=self.random_state,
+        )
+        # Captured before inference so the report can tell operator choices
+        # apart from inferred ones.
+        operator_fields = operator_goal.operator_set_fields()
+        goal = infer_goal(profile, operator_goal, self.log)
+        # infer_goal always resolves task and metric.
+        assert goal.task is not None and goal.metric is not None
+        if goal.task == "clustering":
+            evaluation = self._run_clustering(frame, profile, goal)
+        else:
+            # use_boost used to be dropped here, so run(use_boost=False) had
+            # no effect on the shortlist.
+            evaluation = self._run_supervised(frame, profile, goal, use_boost=use_boost)
+        ctx = build_context(
+            dataset_name=dataset_name,
+            n_rows=profile.n_rows,
+            n_columns=profile.n_columns,
+            goal=goal.as_dict(),
+            operator_set_fields=operator_fields,
+            log=self.log,
+            evaluation=evaluation["evaluation_dict"],
+        )
+        written = write_report(ctx, report_path, html=html)
+        return RunResult(
+            best_model=evaluation["result"].best_estimator,
+            pipeline=evaluation["pipeline"],
+            goal=goal,
+            task=goal.task,
+            metric=goal.metric,
+            report_path=written,
+            evaluation=evaluation["evaluation_dict"],
+            profile=profile.as_dict(),
+            feature_names=evaluation["feature_names"],
+        )
+    def _run_supervised(
+        self,
+        frame: pd.DataFrame,
+        profile: DataProfile,
+        goal: Goal,
+        use_boost: bool = True,
+    ) -> dict[str, Any]:
+        """Split, preprocess, train, and evaluate for a task with a target.
+        Args:
+            frame: The full dataset, target column included.
+            profile: Profile of `frame`, passed to build_pipeline.
+            goal: Resolved goal; target, task, and metric must be set.
+            use_boost: Operator switch for boosted models; it is combined
+                with the "interpretable" constraint before being forwarded.
+        Returns:
+            A dict with the keys "result", "pipeline", "evaluation_dict",
+            and "feature_names".
+        """
+        target = goal.target
+        assert target is not None and goal.task is not None and goal.metric is not None
+        # Rows without a target value cannot be used for supervised learning.
+        clean = frame.dropna(subset=[target])
+        X = clean.drop(columns=[target])
+        y = clean[target]
+        # Stratify only when scikit-learn can do it: a class with a single row
+        # makes a stratified split raise ValueError, so such data falls back
+        # to a random split instead of aborting the run.
+        stratify = y if goal.task == "classification" and self._can_stratify(y) else None
+        X_train, X_test, y_train, y_test = train_test_split(
+            X,
+            y,
+            test_size=self.test_size,
+            random_state=self.random_state,
+            stratify=stratify,
+        )
+        self.log.record(
+            "preprocess",
+            f"Split into {len(X_train)} train and {len(X_test)} test rows "
+            f"({'stratified' if stratify is not None else 'random'}).",
+            "train-test-split",
+            {"test_size": self.test_size},
+        )
+        pipeline, _ = build_pipeline(profile, target, goal.constraints, self.log)
+        # The pipeline is fitted on the training rows only and then applied to
+        # the test rows.
+        X_train_t = pipeline.fit_transform(X_train, y_train)
+        X_test_t = pipeline.transform(X_test)
+        feature_names = self._feature_names(pipeline, X_train_t.shape[1])
+        candidates = recommend_models(
+            task=goal.task,
+            n_rows=len(X_train),
+            n_features=X_train_t.shape[1],
+            constraints=goal.constraints,
+            random_state=self.random_state,
+            log=self.log,
+            use_boost=use_boost and goal.constraints.get("interpretable") is not True,
+        )
+        result = evaluate(
+            candidates=candidates,
+            task=goal.task,
+            metric=goal.metric,
+            feature_names=feature_names,
+            X_train=X_train_t,
+            y_train=y_train.to_numpy(),
+            X_test=X_test_t,
+            y_test=y_test.to_numpy(),
+            random_state=self.random_state,
+            log=self.log,
+        )
+        eval_dict = result.as_dict()
+        eval_dict["feature_importance"] = self._named_importance(
+            result.feature_importance, feature_names
+        )
+        return {
+            "result": result,
+            "pipeline": pipeline,
+            "evaluation_dict": eval_dict,
+            "feature_names": feature_names,
+        }
+    def _run_clustering(
+        self, frame: pd.DataFrame, profile: DataProfile, goal: Goal
+    ) -> dict[str, Any]:
+        """Preprocess the whole frame and evaluate clustering candidates.
+        No target and no train/test split are involved: the pipeline is
+        fitted on every row and evaluate() receives only X_train.
+        Returns:
+            A dict with the keys "result", "pipeline", "evaluation_dict",
+            and "feature_names".
+        """
+        assert goal.metric is not None
+        pipeline, _ = build_pipeline(profile, None, goal.constraints, self.log)
+        X_t = pipeline.fit_transform(frame)
+        feature_names = self._feature_names(pipeline, X_t.shape[1])
+        candidates = recommend_models(
+            task="clustering",
+            n_rows=len(frame),
+            n_features=X_t.shape[1],
+            constraints=goal.constraints,
+            random_state=self.random_state,
+            log=self.log,
+        )
+        result = evaluate(
+            candidates=candidates,
+            task="clustering",
+            metric=goal.metric,
+            feature_names=feature_names,
+            X_train=X_t,
+            random_state=self.random_state,
+            log=self.log,
+        )
+        return {
+            "result": result,
+            "pipeline": pipeline,
+            "evaluation_dict": result.as_dict(),
+            "feature_names": feature_names,
+        }
+    @staticmethod
+    def _can_stratify(y: pd.Series) -> bool:
+        """Return True when y has at least two classes with two or more rows each."""
+        counts = y.value_counts()
+        return len(counts) > 1 and int(counts.min()) >= 2
+    @staticmethod
+    def _feature_names(pipeline: Any, n_features: int) -> list[str]:
+        """Return the output names of the pipeline's "columns" step.
+        Falls back to positional names feature_0 .. feature_{n-1} when the
+        step is missing or cannot report its output names.
+        """
+        try:
+            names = pipeline.named_steps["columns"].get_feature_names_out()
+            return [str(n) for n in names]
+        except (AttributeError, KeyError, ValueError):
+            return [f"feature_{i}" for i in range(n_features)]
+    @staticmethod
+    def _named_importance(
+        importance: dict[str, float], feature_names: list[str]
+    ) -> dict[str, float]:
+        """Return the importance mapping unchanged.
+        NOTE(review): feature_names is accepted but not used; evaluate()
+        already receives the feature names, so the mapping is presumably
+        keyed by name when it arrives here. Confirm before adding logic.
+        """
+        return importance
+    @staticmethod
+    def _as_frame(data: str | Path | pd.DataFrame) -> tuple[pd.DataFrame, str]:
+        """Return the data as a DataFrame together with a display name.
+        A DataFrame argument is copied so the run works on its own copy; any
+        other argument is treated as a path and read with load().
+        """
+        if isinstance(data, pd.DataFrame):
+            return data.copy(), "in-memory DataFrame"
+        return load(data), Path(data).name
+    @classmethod
+    def load(cls, path: str | Path) -> RunResult:
+        """Load a saved artifact and return a RunResult ready to predict.
+        SECURITY: joblib artifacts are pickle-based, and unpickling can run
+        arbitrary code. Only load artifacts that come from a trusted source.
+        Args:
+            path: Path to a .joblib artifact written by RunResult.save. The
+                same suffix rule as in save() is applied to it.
+        Returns:
+            A RunResult with the fitted pipeline and model. Its report_path
+            is the placeholder "loaded-artifact" because no report is
+            written when loading.
+        Raises:
+            ValueError: If the file does not hold an artifact of the version
+                this code writes.
+        """
+        path = Path(path)
+        if path.suffix != ".joblib":
+            path = path.with_suffix(".joblib")
+        payload = joblib.load(path)
+        # save() stamps every artifact with a version; checking it here gives
+        # a clear error instead of a KeyError further down on a foreign file.
+        version = payload.get("version") if isinstance(payload, dict) else None
+        if version != _ARTIFACT_VERSION:
+            raise ValueError(
+                f"Unsupported artifact version {version!r} in {path}; "
+                f"expected {_ARTIFACT_VERSION}."
+            )
+        goal = Goal(**payload["goal"])
+        return RunResult(
+            best_model=payload["best_model"],
+            pipeline=payload["pipeline"],
+            goal=goal,
+            task=payload["task"],
+            metric=payload["metric"],
+            report_path=Path("loaded-artifact"),
+            evaluation=payload["evaluation"],
+            profile=payload["profile"],
+            feature_names=payload["feature_names"],
+        )

mudra_ml/decisions.py ADDED Viewed

@@ -0,0 +1,89 @@
+"""Decision log used to make every automated choice auditable.
+Every stage of the pipeline records what it decided and the rule that produced
+the decision. The report is rendered directly from this log, so the log is the
+source of truth for how a run reached its result.
+"""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass, field
+from typing import Any
+logger = logging.getLogger("mudra_ml")
+@dataclass(frozen=True)
+class Decision:
+    """One frozen entry of the decision log.
+    Args:
+        stage: Pipeline stage that made the choice (for example "profile").
+        decision: Short statement of what was decided.
+        rule: The named rule or statistical test that produced the decision.
+        detail: Optional structured context, such as the values compared.
+            Defaults to a new empty dict per instance.
+    """
+    stage: str
+    decision: str
+    rule: str
+    detail: dict[str, Any] = field(default_factory=dict)
+    def as_dict(self) -> dict[str, Any]:
+        """Return the four fields as a plain dict, in declaration order.
+        The detail value is the same dict object held by the instance, not a
+        copy.
+        """
+        keys = ("stage", "decision", "rule", "detail")
+        return {key: getattr(self, key) for key in keys}
+class DecisionLog:
+    """Insertion-ordered record of the decisions made during a run."""
+    def __init__(self) -> None:
+        self._entries: list[Decision] = []
+    def record(
+        self,
+        stage: str,
+        decision: str,
+        rule: str,
+        detail: dict[str, Any] | None = None,
+    ) -> Decision:
+        """Store a new decision, log it at INFO level, and return it.
+        A missing or empty detail is stored as an empty dict.
+        """
+        recorded = Decision(stage=stage, decision=decision, rule=rule, detail=detail or {})
+        self._entries.append(recorded)
+        logger.info("[%s] %s (rule: %s)", stage, decision, rule)
+        return recorded
+    def for_stage(self, stage: str) -> list[Decision]:
+        """Return the decisions recorded for one stage, in recording order."""
+        return list(filter(lambda item: item.stage == stage, self._entries))
+    def stages(self) -> list[str]:
+        """Return each distinct stage once, ordered by first appearance."""
+        # dict keys keep insertion order, which gives an ordered de-duplication.
+        return list(dict.fromkeys(item.stage for item in self._entries))
+    def as_list(self) -> list[dict[str, Any]]:
+        """Return every decision in its dict form, in recording order."""
+        return [item.as_dict() for item in self._entries]
+    def __len__(self) -> int:
+        return len(self._entries)
+    def __iter__(self):
+        return iter(self._entries)
+def configure_logging(level: int = logging.INFO) -> None:
+    """Give the package logger its own stream handler, at most once.
+    When the logger already has a handler the call returns without changing
+    anything, including the level. Otherwise a message-only stream handler is
+    attached, the level is set, and propagation to ancestor loggers is
+    switched off.
+    Args:
+        level: Logging level applied when the handler is first attached.
+    """
+    if not logger.handlers:
+        stream = logging.StreamHandler()
+        stream.setFormatter(logging.Formatter("%(message)s"))
+        logger.addHandler(stream)
+        logger.setLevel(level)
+        logger.propagate = False