mudra-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mudra_ml/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ """MudraML: glass-box autonomous data science.
2
+
3
+ The decision engine that drives the pipeline is rule-based and statistical.
4
+ It is deterministic, logged, and explainable. The machine learning models are
5
+ the output it produces, not the mechanism by which it chooses what to do.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .core import Mudra, RunResult
11
+ from .evaluate import evaluate
12
+ from .goal import Goal, infer_goal
13
+ from .ingest import load
14
+ from .preprocess import build_pipeline
15
+ from .profile import DataProfile, DataProfiler
16
+ from .recommend import recommend_models
17
+ from .report import write_report
18
+
19
__version__ = "0.1.0"

# Public API re-exported by the package. The order is kept as published; the
# comments only group the names by the module they are imported from.
__all__ = [
    # .core
    "Mudra",
    "RunResult",
    # .goal
    "Goal",
    "infer_goal",
    # .ingest
    "load",
    # .profile
    "DataProfiler",
    "DataProfile",
    # .preprocess
    "build_pipeline",
    # .recommend
    "recommend_models",
    # .evaluate
    "evaluate",
    # .report
    "write_report",
    # package metadata
    "__version__",
]
mudra_ml/cli.py ADDED
@@ -0,0 +1,112 @@
1
+ """Command line interface for MudraML."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+
8
+ import typer
9
+
10
+ from .core import Mudra
11
+ from .ingest import load
12
+ from .profile import DataProfiler
13
+
14
# Root Typer application; the command functions below register on it.
app = typer.Typer(
    help="Glass-box autonomous data science from the command line.",
    no_args_is_help=True,
    add_completion=False,
)


@app.command()
def run(
    data: str = typer.Argument(..., help="Path to the data file."),
    target: str | None = typer.Option(None, help="Target column to predict."),
    task: str | None = typer.Option(
        None, help="classification, regression, or clustering."
    ),
    metric: str | None = typer.Option(None, help="Metric to optimize."),
    interpretable: bool = typer.Option(
        False, help="Restrict the shortlist to interpretable models."
    ),
    max_train_seconds: int | None = typer.Option(
        None, help="Soft time budget that caps model complexity."
    ),
    output: str = typer.Option("mudra_ml_report", help="Report path without suffix."),
    save: str | None = typer.Option(None, help="Save the artifact to this path."),
    no_html: bool = typer.Option(False, help="Skip the HTML report."),
) -> None:
    """Run the full pipeline and write a report."""
    # The docstring above is left untouched: Typer presumably shows it as the
    # command's help text, so it is user-facing output rather than a comment.

    # Only options the operator actually set turn into constraints; an empty
    # mapping is passed on as None.
    requested = {
        "interpretable": True if interpretable else None,
        "max_train_seconds": max_train_seconds,
    }
    constraints: dict[str, object] = {
        key: value for key, value in requested.items() if value is not None
    }

    outcome = Mudra(verbose=True).run(
        data,
        target=target,
        task=task,
        metric=metric,
        constraints=constraints or None,
        report_path=output,
        html=not no_html,
    )

    typer.echo("")
    typer.echo(f"Task: {outcome.task}")
    typer.echo(f"Selected model: {outcome.evaluation['best_name']}")

    # Find the candidate entry whose name matches the selected model.
    winner = next(
        candidate
        for candidate in outcome.evaluation["candidates"]
        if candidate["name"] == outcome.evaluation["best_name"]
    )
    for metric_name, score in winner["test_metrics"].items():
        # The confusion matrix is not a scalar; it is left out of the summary.
        if metric_name != "confusion_matrix":
            if isinstance(score, float):
                line = f" {metric_name}: {score:.4f}"
            else:
                line = f" {metric_name}: {score}"
            typer.echo(line)
    typer.echo(f"Report: {outcome.report_path}")

    if save:
        artifact_path = outcome.save(save)
        typer.echo(f"Artifact: {artifact_path}")
72
+
73
+
74
@app.command()
def profile(
    data: str = typer.Argument(..., help="Path to the data file."),
    as_json: bool = typer.Option(False, "--json", help="Print the profile as JSON."),
) -> None:
    """Profile a dataset and print column types and statistics."""
    frame = load(data)
    summary = DataProfiler().profile(frame)

    if as_json:
        # default=str stringifies any value json cannot serialize natively.
        payload = json.dumps(summary.as_dict(), indent=2, default=str)
        typer.echo(payload)
        return

    typer.echo(f"Dataset: {Path(data).name}")
    typer.echo(
        f"Rows: {summary.n_rows} Columns: {summary.n_columns} "
        f"Duplicates: {summary.duplicate_rows}"
    )
    typer.echo("")

    # Fixed-width table: one header row, a rule of the same width, one row per column.
    heading = f"{'column':<24}{'type':<14}{'missing':<10}{'unique':<10}"
    typer.echo(heading)
    typer.echo("-" * len(heading))
    for column in summary.columns.values():
        # Names are cut to 23 characters so they never touch the next cell.
        row = (
            f"{column.name[:23]:<24}{column.inferred_type:<14}"
            f"{column.missing_fraction:<10.2%}{column.n_unique:<10}"
        )
        typer.echo(row)
    typer.echo("")

    if summary.candidate_targets:
        # Show at most the first three candidate targets.
        typer.echo(f"Candidate targets: {', '.join(summary.candidate_targets[:3])}")
105
+
106
+
107
def main() -> None:
    """Invoke the Typer application; used when the module runs as a script."""
    app()


if __name__ == "__main__":
    main()
mudra_ml/constants.py ADDED
@@ -0,0 +1,41 @@
1
+ """Shared defaults and thresholds.
2
+
3
+ These values drive the rule-based engine. They are named here so the rules that
4
+ use them stay readable and so a single edit changes behavior everywhere.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
# Default seed; Mudra uses it as the default for its random_state argument.
DEFAULT_RANDOM_STATE = 42

# --- Profiling thresholds -------------------------------------------------
ID_UNIQUE_RATIO = 0.95
CATEGORICAL_MAX_UNIQUE = 20
CATEGORICAL_MAX_RATIO = 0.5
TEXT_MIN_AVG_LENGTH = 25
TEXT_MIN_WORD_COUNT = 3
HIGH_CARDINALITY_THRESHOLD = 30

# --- Cleaning thresholds --------------------------------------------------
DEFAULT_MISSING_DROP_THRESHOLD = 0.6
IQR_MULTIPLIER = 1.5
ZSCORE_THRESHOLD = 3.0

# --- Goal inference thresholds --------------------------------------------
CLASSIFICATION_MAX_CLASSES = 20
REGRESSION_MIN_UNIQUE = 20

# --- Training -------------------------------------------------------------
DEFAULT_CV_FOLDS = 5
DEFAULT_SEARCH_ITER = 10
SMALL_DATASET_ROWS = 2_000
LARGE_DATASET_ROWS = 50_000

# --- Default metric per task ----------------------------------------------
DEFAULT_METRICS: dict[str, str] = {
    "classification": "f1",
    "regression": "rmse",
    "clustering": "silhouette",
}

# Task names the engine recognizes.
VALID_TASKS: tuple[str, ...] = ("classification", "regression", "clustering")
mudra_ml/core.py ADDED
@@ -0,0 +1,320 @@
1
+ """The Mudra orchestrator and the RunResult artifact."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import joblib
10
+ import numpy as np
11
+ import pandas as pd
12
+ from sklearn.model_selection import train_test_split
13
+
14
+ from .constants import DEFAULT_RANDOM_STATE
15
+ from .decisions import DecisionLog, configure_logging
16
+ from .evaluate import evaluate
17
+ from .goal import Goal, infer_goal
18
+ from .ingest import load
19
+ from .preprocess import build_pipeline
20
+ from .profile import DataProfile, DataProfiler
21
+ from .recommend import recommend_models
22
+ from .report import build_context, write_report
23
+
24
# Format version stamped into every artifact written by RunResult.save.
_ARTIFACT_VERSION = 1


@dataclass
class RunResult:
    """Everything a finished run hands back to the caller.

    Holds the fitted preprocessing pipeline and the fitted model as two
    separate objects, plus the goal, report location, and evaluation and
    profile metadata. predict() always sends new rows through the pipeline
    before the model sees them.
    """

    best_model: Any
    pipeline: Any
    goal: Goal
    task: str
    metric: str
    report_path: Path
    evaluation: dict[str, Any]
    profile: dict[str, Any]
    feature_names: list[str]

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Run new rows through the fitted pipeline, then through the model.

        Args:
            data: New rows with the same feature columns as training.

        Returns:
            Whatever the model's predict() returns for the transformed rows.
        """
        return self.best_model.predict(self.pipeline.transform(data))

    def save(self, path: str | Path) -> Path:
        """Write the pipeline, model, and metadata to a single joblib file.

        Args:
            path: Destination path. If its suffix is not ".joblib" the path
                is rewritten with Path.with_suffix, so a missing suffix is
                added and any other suffix is replaced.

        Returns:
            The path actually written.
        """
        destination = Path(path)
        if destination.suffix != ".joblib":
            destination = destination.with_suffix(".joblib")

        # The goal is stored in its dict form; everything else is stored as-is.
        joblib.dump(
            {
                "version": _ARTIFACT_VERSION,
                "best_model": self.best_model,
                "pipeline": self.pipeline,
                "goal": self.goal.as_dict(),
                "task": self.task,
                "metric": self.metric,
                "evaluation": self.evaluation,
                "profile": self.profile,
                "feature_names": self.feature_names,
            },
            destination,
        )
        return destination
83
+
84
+
85
class Mudra:
    """Run the full data science workflow and explain every decision.

    A call to run() profiles the data, resolves the goal, fits and evaluates
    candidate models, and writes a report built from the decision log that
    was collected along the way.

    Example:
        >>> m = Mudra()
        >>> result = m.run("data.csv")
        >>> preds = result.predict(new_frame)
    """

    def __init__(
        self,
        random_state: int = DEFAULT_RANDOM_STATE,
        verbose: bool = False,
        test_size: float = 0.2,
    ) -> None:
        """Store the settings shared by every run of this instance.

        Args:
            random_state: Seed forwarded to the goal, the train/test split,
                model recommendation, and evaluation.
            verbose: When True, call configure_logging() so recorded
                decisions are emitted through the package logger's handler.
            test_size: Passed as test_size to train_test_split.
        """
        self.random_state = random_state
        self.test_size = test_size
        self.log = DecisionLog()
        self._loaded_payload: dict[str, Any] | None = None
        if verbose:
            configure_logging()

    def run(
        self,
        data: str | Path | pd.DataFrame,
        target: str | None = None,
        task: str | None = None,
        metric: str | None = None,
        constraints: dict[str, Any] | None = None,
        report_path: str | Path = "mudra_ml_report",
        html: bool = True,
        use_boost: bool = True,
    ) -> RunResult:
        """Ingest, profile, plan, train, evaluate, and report.

        Args:
            data: Path to a data file or an in-memory DataFrame.
            target: Target column, or None to infer.
            task: classification, regression, clustering, or None to infer.
            metric: Metric to optimize, or None for the task default.
            constraints: Optional constraints, for example
                {"interpretable": True, "max_train_seconds": 120}.
            report_path: Where to write the report (without suffix).
            html: Whether to also write an HTML report.
            use_boost: Whether to include xgboost and lightgbm if installed.
                Only the supervised path reads it, and the "interpretable"
                constraint still switches boosting off.

        Returns:
            A RunResult with the fitted model and the report path.
        """
        frame, dataset_name = self._as_frame(data)
        # Start from an empty log so a reused instance never mixes two runs.
        self.log = DecisionLog()

        profiler = DataProfiler(self.log)
        profile = profiler.profile(frame)

        operator_goal = Goal(
            target=target,
            task=task,
            metric=metric,
            constraints=constraints or {},
            random_state=self.random_state,
        )
        # Captured before inference so the report can tell operator-set fields
        # apart from inferred ones.
        operator_fields = operator_goal.operator_set_fields()
        goal = infer_goal(profile, operator_goal, self.log)
        # infer_goal always resolves task and metric.
        assert goal.task is not None and goal.metric is not None

        if goal.task == "clustering":
            evaluation = self._run_clustering(frame, profile, goal)
        else:
            # Forward the caller's choice; it used to be silently dropped here.
            evaluation = self._run_supervised(
                frame, profile, goal, use_boost=use_boost
            )

        ctx = build_context(
            dataset_name=dataset_name,
            n_rows=profile.n_rows,
            n_columns=profile.n_columns,
            goal=goal.as_dict(),
            operator_set_fields=operator_fields,
            log=self.log,
            evaluation=evaluation["evaluation_dict"],
        )
        written = write_report(ctx, report_path, html=html)

        return RunResult(
            best_model=evaluation["result"].best_estimator,
            pipeline=evaluation["pipeline"],
            goal=goal,
            task=goal.task,
            metric=goal.metric,
            report_path=written,
            evaluation=evaluation["evaluation_dict"],
            profile=profile.as_dict(),
            feature_names=evaluation["feature_names"],
        )

    def _run_supervised(
        self,
        frame: pd.DataFrame,
        profile: DataProfile,
        goal: Goal,
        use_boost: bool = True,
    ) -> dict[str, Any]:
        """Split, preprocess, shortlist, and evaluate for a supervised goal.

        Args:
            frame: The full dataset, including the target column.
            profile: Profile of frame produced by DataProfiler.
            goal: Resolved goal; target, task, and metric must be set.
            use_boost: Caller's preference for boosted models. Boosting is
                requested only when this is true and the "interpretable"
                constraint is not set to True.

        Returns:
            A dict with the keys "result", "pipeline", "evaluation_dict",
            and "feature_names".
        """
        target = goal.target
        assert target is not None and goal.task is not None and goal.metric is not None
        # Rows without a label cannot be used for training or scoring.
        clean = frame.dropna(subset=[target])
        X = clean.drop(columns=[target])
        y = clean[target]

        # Stratify classification splits when possible. train_test_split
        # rejects stratification if any class has fewer than two rows, so fall
        # back to a random split instead of aborting the run.
        stratify = None
        if goal.task == "classification" and y.nunique() > 1:
            if y.value_counts().min() >= 2:
                stratify = y
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=stratify,
        )
        self.log.record(
            "preprocess",
            f"Split into {len(X_train)} train and {len(X_test)} test rows "
            f"({'stratified' if stratify is not None else 'random'}).",
            "train-test-split",
            {"test_size": self.test_size},
        )

        # Fit the preprocessing on the training rows only, then apply it to
        # the held-out rows.
        pipeline, _ = build_pipeline(profile, target, goal.constraints, self.log)
        X_train_t = pipeline.fit_transform(X_train, y_train)
        X_test_t = pipeline.transform(X_test)
        feature_names = self._feature_names(pipeline, X_train_t.shape[1])

        candidates = recommend_models(
            task=goal.task,
            n_rows=len(X_train),
            n_features=X_train_t.shape[1],
            constraints=goal.constraints,
            random_state=self.random_state,
            log=self.log,
            use_boost=bool(use_boost)
            and goal.constraints.get("interpretable") is not True,
        )

        result = evaluate(
            candidates=candidates,
            task=goal.task,
            metric=goal.metric,
            feature_names=feature_names,
            X_train=X_train_t,
            y_train=y_train.to_numpy(),
            X_test=X_test_t,
            y_test=y_test.to_numpy(),
            random_state=self.random_state,
            log=self.log,
        )
        eval_dict = result.as_dict()
        eval_dict["feature_importance"] = self._named_importance(
            result.feature_importance, feature_names
        )
        return {
            "result": result,
            "pipeline": pipeline,
            "evaluation_dict": eval_dict,
            "feature_names": feature_names,
        }

    def _run_clustering(
        self, frame: pd.DataFrame, profile: DataProfile, goal: Goal
    ) -> dict[str, Any]:
        """Preprocess the whole frame and evaluate clustering candidates.

        There is no target and no train/test split on this path: the pipeline
        is fitted on every row and evaluate() receives only X_train.

        Returns:
            A dict with the keys "result", "pipeline", "evaluation_dict",
            and "feature_names".
        """
        assert goal.metric is not None
        pipeline, _ = build_pipeline(profile, None, goal.constraints, self.log)
        X_t = pipeline.fit_transform(frame)
        feature_names = self._feature_names(pipeline, X_t.shape[1])

        candidates = recommend_models(
            task="clustering",
            n_rows=len(frame),
            n_features=X_t.shape[1],
            constraints=goal.constraints,
            random_state=self.random_state,
            log=self.log,
        )
        result = evaluate(
            candidates=candidates,
            task="clustering",
            metric=goal.metric,
            feature_names=feature_names,
            X_train=X_t,
            random_state=self.random_state,
            log=self.log,
        )
        return {
            "result": result,
            "pipeline": pipeline,
            "evaluation_dict": result.as_dict(),
            "feature_names": feature_names,
        }

    @staticmethod
    def _feature_names(pipeline: Any, n_features: int) -> list[str]:
        """Return output feature names, or positional placeholders.

        Asks the pipeline's "columns" step for its output names. If that step
        is missing or cannot report names, falls back to "feature_0" through
        "feature_{n_features - 1}".
        """
        try:
            names = pipeline.named_steps["columns"].get_feature_names_out()
        except (AttributeError, KeyError, ValueError):
            return [f"feature_{i}" for i in range(n_features)]
        return [str(n) for n in names]

    @staticmethod
    def _named_importance(
        importance: dict[str, float], feature_names: list[str]
    ) -> dict[str, float]:
        """Return the importance mapping unchanged.

        Currently a pass-through: feature_names is accepted but not used.
        """
        return importance

    @staticmethod
    def _as_frame(data: str | Path | pd.DataFrame) -> tuple[pd.DataFrame, str]:
        """Normalize the input to a (DataFrame, display name) pair.

        A DataFrame is copied so the caller's object is never modified; any
        other input is treated as a path and read with the ingest loader.
        """
        if isinstance(data, pd.DataFrame):
            return data.copy(), "in-memory DataFrame"
        # `load` resolves to the module-level ingest function here, not to the
        # classmethod below: class attributes are not in a method's name scope.
        return load(data), Path(data).name

    @classmethod
    def load(cls, path: str | Path) -> RunResult:
        """Load a saved artifact and return a RunResult ready to predict.

        Args:
            path: Path to a .joblib artifact written by RunResult.save. The
                same suffix rule as save() is applied, so the argument that
                was given to save() can be passed here unchanged.

        Returns:
            A RunResult with the fitted pipeline and model. Its report_path
            is the placeholder Path("loaded-artifact") because no report is
            written when loading.
        """
        path = Path(path)
        if path.suffix != ".joblib":
            path = path.with_suffix(".joblib")
        # SECURITY: joblib artifacts are pickle-based, so loading one can run
        # arbitrary code. Only load artifacts from a trusted source.
        payload = joblib.load(path)
        goal = Goal(**payload["goal"])
        return RunResult(
            best_model=payload["best_model"],
            pipeline=payload["pipeline"],
            goal=goal,
            task=payload["task"],
            metric=payload["metric"],
            report_path=Path("loaded-artifact"),
            evaluation=payload["evaluation"],
            profile=payload["profile"],
            feature_names=payload["feature_names"],
        )
mudra_ml/decisions.py ADDED
@@ -0,0 +1,89 @@
1
+ """Decision log used to make every automated choice auditable.
2
+
3
+ Every stage of the pipeline records what it decided and the rule that produced
4
+ the decision. The report is rendered directly from this log, so the log is the
5
+ source of truth for how a run reached its result.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from dataclasses import dataclass, field
12
+ from typing import Any
13
+
14
# Package logger; DecisionLog.record emits every decision through it.
logger = logging.getLogger("mudra_ml")


@dataclass(frozen=True)
class Decision:
    """One immutable entry of the decision log.

    Args:
        stage: Pipeline stage that made the choice (for example "profile").
        decision: Short statement of what was decided.
        rule: The named rule or statistical test that produced the decision.
        detail: Optional structured context, such as the values compared.
            Defaults to a fresh empty dict per instance.
    """

    stage: str
    decision: str
    rule: str
    detail: dict[str, Any] = field(default_factory=dict)

    def as_dict(self) -> dict[str, Any]:
        """Return the four fields as a plain dict, in declaration order.

        The "detail" value is the stored dict itself, not a copy.
        """
        field_order = ("stage", "decision", "rule", "detail")
        return {name: getattr(self, name) for name in field_order}
40
+
41
+
42
class DecisionLog:
    """Ordered collection of decisions made during a run."""

    def __init__(self) -> None:
        """Start with an empty log."""
        self._entries: list[Decision] = []

    def record(
        self,
        stage: str,
        decision: str,
        rule: str,
        detail: dict[str, Any] | None = None,
    ) -> Decision:
        """Append a decision and emit it to the logger.

        Args:
            stage: Pipeline stage that made the choice.
            decision: Short statement of what was decided.
            rule: Name of the rule or test behind the decision.
            detail: Optional structured context. A shallow copy is stored, so
                changing the caller's dict afterwards cannot rewrite the log.

        Returns:
            The Decision that was appended.
        """
        entry = Decision(
            stage=stage,
            decision=decision,
            rule=rule,
            detail=dict(detail) if detail else {},
        )
        self._entries.append(entry)
        logger.info("[%s] %s (rule: %s)", stage, decision, rule)
        return entry

    def for_stage(self, stage: str) -> list[Decision]:
        """Return the decisions recorded for one stage, in recording order."""
        return [e for e in self._entries if e.stage == stage]

    def stages(self) -> list[str]:
        """Return each stage name once, in order of first appearance."""
        # dict keys keep insertion order, which gives an ordered de-duplication
        # in a single pass instead of a list-membership scan per entry.
        return list(dict.fromkeys(entry.stage for entry in self._entries))

    def as_list(self) -> list[dict[str, Any]]:
        """Return every decision as a plain dict, in recording order."""
        return [e.as_dict() for e in self._entries]

    def __len__(self) -> int:
        """Return the number of recorded decisions."""
        return len(self._entries)

    def __iter__(self):
        """Iterate over the recorded Decision objects in recording order."""
        return iter(self._entries)
79
+
80
+
81
def configure_logging(level: int = logging.INFO) -> None:
    """Give the package logger a plain-message stream handler, once.

    Does nothing at all (the level is left untouched too) when the logger
    already has a handler. Otherwise it adds a StreamHandler that prints only
    the message text, sets the logger to `level`, and turns off propagation.

    Args:
        level: Logging level applied when the handler is first attached.
    """
    if not logger.handlers:
        stream = logging.StreamHandler()
        stream.setFormatter(logging.Formatter("%(message)s"))
        logger.addHandler(stream)
        logger.setLevel(level)
        logger.propagate = False