mlcompass 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mlcompass/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """mlcompass — An LLM agent that sits next to you through your whole ML pipeline."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,5 @@
1
+ """agentlite-based agents that drive each mlcompass command.
2
+
3
+ Each module exposes one or more factory functions that return a
4
+ configured ``agentlite.Agent`` ready to be used by the CLI.
5
+ """
@@ -0,0 +1,174 @@
1
+ """Model + feature engineering advisor.
2
+
3
+ Consumes the structured output of ``tools.dataset.analyze_dataset`` and
4
+ produces a JSON recommendation: models to try, feature engineering
5
+ suggestions, and pitfalls to mitigate.
6
+
7
+ The analysis is pre-computed by deterministic Python (no LLM cost), and
8
+ fed to the agent as the user message. The advisor agent itself does not
9
+ need tools — it is a pure reasoner over the structured input. This keeps
10
+ ``advise`` fast, cheap, and predictable.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ from typing import Any
17
+
18
+ from agentlite import Agent
19
+
20
+ ADVISOR_MODEL_DEFAULT = "claude-opus-4-7"
21
+
22
+ ADVISOR_SYSTEM_PROMPT = """You are a senior data scientist advising a colleague who just analyzed a new dataset. Your job is to recommend the next steps.
23
+
24
+ You will receive a structured JSON dataset analysis. Based on it, produce:
25
+
26
+ 1. **Top 3 model families** to try, with one-line reasoning and a realistic
27
+ metric range (AUC for classification, RMSE/MAE for regression).
28
+ 2. **Feature engineering suggestions**: per-column or cross-column hints
29
+ that have a high likelihood of helping.
30
+ 3. **Pitfalls**: data-quality or methodology issues the user should mitigate
31
+ before training.
32
+
33
+ Format your reply as a single JSON object matching this schema, and only
34
+ that — no preamble, no markdown fence:
35
+
36
+ {
37
+ "models": [
38
+ {"name": "XGBoost", "reason": "...", "expected_metric": "AUC 0.82 - 0.87"}
39
+ ],
40
+ "features": [
41
+ {"column": "signup_date", "suggestion": "derive days_since_signup, month, dayofweek", "reason": "..."}
42
+ ],
43
+ "pitfalls": [
44
+ {"issue": "Class imbalance (12% positive)", "mitigation": "Use AUC/F1, class_weight='balanced', or focal loss"}
45
+ ]
46
+ }
47
+
48
+ Rules:
49
+ - Always include at least one interpretable baseline (logistic / linear
50
+ regression, decision tree, etc.) so the user has a sanity check.
51
+ - Use realistic metric ranges based on the dataset signal — do not promise
52
+ numbers you can't back up.
53
+ - Cite the column or fact you reasoned from inside the ``reason`` field.
54
+ - When uncertain, prefer conservative, well-established choices."""
55
+
56
+
57
+ # --------------------------------------------------------------------------- #
58
+ # Agent construction #
59
+ # --------------------------------------------------------------------------- #
60
+
61
+
62
def build_advisor_agent(
    *,
    client: Any | None = None,
    model: str = ADVISOR_MODEL_DEFAULT,
) -> Agent:
    """Create the agent that turns a dataset analysis into recommendations.

    Args:
        client: Client object handed straight to ``agentlite.Agent``. Pass a
            test double here in tests; ``None`` leaves client creation to
            agentlite (presumably from ``ANTHROPIC_API_KEY`` — confirm
            against the agentlite documentation).
        model: Name of the Claude model the agent should use.

    Returns:
        An ``agentlite.Agent`` configured with the advisor system prompt and
        no tools.
    """
    # The advisor only reasons over the analysis it is given, so it receives
    # an empty toolbox. A single turn is expected; two is just a safety cap.
    agent_options: dict[str, Any] = {
        "model": model,
        "system": ADVISOR_SYSTEM_PROMPT,
        "tools": [],
        "client": client,
        "max_turns": 2,
    }
    return Agent(**agent_options)
88
+
89
+
90
+ # --------------------------------------------------------------------------- #
91
+ # Public API #
92
+ # --------------------------------------------------------------------------- #
93
+
94
+
95
def get_recommendation(
    analysis: dict[str, Any],
    *,
    client: Any | None = None,
    model: str = ADVISOR_MODEL_DEFAULT,
) -> dict[str, Any]:
    """Ask the advisor agent for recommendations on an analyzed dataset.

    Args:
        analysis: JSON-serializable dataset analysis (the output of
            ``tools.dataset.analyze_dataset()``).
        client: Optional client forwarded to ``build_advisor_agent``.
        model: Claude model name forwarded to ``build_advisor_agent``.

    Returns:
        The parsed recommendation dict containing at least the keys
        ``models``, ``features`` and ``pitfalls``.

    Raises:
        AdvisorParseError: If the agent's reply cannot be parsed into the
            expected JSON object.
    """
    advisor = build_advisor_agent(client=client, model=model)

    # The analysis travels as a fenced JSON document inside the user message.
    analysis_json = json.dumps(analysis, indent=2)
    prompt = "".join(
        (
            "Here is the analysis of a dataset. Please produce model, feature, ",
            "and pitfall recommendations as specified.\n\n",
            "```json\n",
            analysis_json,
            "\n```",
        )
    )

    return _parse_advisor_response(advisor.run(prompt))
126
+
127
+
128
+ # --------------------------------------------------------------------------- #
129
+ # Internal: response parsing #
130
+ # --------------------------------------------------------------------------- #
131
+
132
+
133
class AdvisorParseError(ValueError):
    """Signals that the advisor's reply is not the expected JSON object.

    Raised for replies that are not valid JSON, that are JSON but not an
    object, or that lack one of the required top-level keys.
    """
135
+
136
+
137
def _parse_advisor_response(text: str) -> dict[str, Any]:
    """Extract and validate the recommendation object from the agent reply.

    The system prompt asks for bare JSON, but model replies are handled
    leniently. Accepted forms are:

    * a bare JSON object;
    * a JSON object wrapped in a markdown fence (```` ``` ```` or
      ```` ```json ````);
    * a JSON object preceded and/or followed by extra text (preamble,
      a fence that is not at the very start, trailing remarks).

    Args:
        text: Raw reply text returned by the agent.

    Returns:
        The decoded JSON object, guaranteed to contain the keys ``models``,
        ``features`` and ``pitfalls`` (their values are not validated).

    Raises:
        AdvisorParseError: If no JSON value can be decoded, if the decoded
            value is not an object, or if a required key is missing.
    """
    candidate = text.strip()

    # Reply starts with a fence: keep what lies between the opening fence
    # line and the first closing fence (or the end if it was never closed).
    if candidate.startswith("```"):
        lines = candidate.splitlines()
        closing = next(
            (
                index
                for index, line in enumerate(lines[1:], 1)
                if line.strip().startswith("```")
            ),
            len(lines),
        )
        candidate = "\n".join(lines[1:closing])

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as exc:
        # Fallback: the object may be embedded in surrounding text. Decode
        # one JSON value starting at the first "{" and ignore what follows.
        parsed = None
        brace = text.find("{")
        if brace != -1:
            try:
                parsed, _ = json.JSONDecoder().raw_decode(text, brace)
            except json.JSONDecodeError:
                parsed = None
        if parsed is None:
            snippet = text[:200].replace("\n", " ")
            raise AdvisorParseError(
                f"Advisor response was not valid JSON: {snippet!r}"
            ) from exc

    if not isinstance(parsed, dict):
        raise AdvisorParseError(
            f"Advisor response was JSON but not an object (got {type(parsed).__name__})"
        )

    # Light shape validation — keep it loose so the schema can evolve.
    for required_key in ("models", "features", "pitfalls"):
        if required_key not in parsed:
            raise AdvisorParseError(
                f"Advisor response missing required key '{required_key}'. "
                f"Got: {sorted(parsed.keys())}"
            )

    return parsed
mlcompass/cli.py ADDED
@@ -0,0 +1,293 @@
1
+ """Command-line interface for ``mlcompass``.
2
+
3
+ Subcommands are added incrementally as the project advances through
4
+ its phases. See ARCHITECTURE.md §7 for the full CLI design.
5
+
6
+ Currently implemented:
7
+ init — create a new ``.mlcompass/`` project (Phase 1)
8
+ advise — analyze dataset + recommend models / features / pitfalls (Phase 1)
9
+
10
+ Planned:
11
+ audit, watch, compare, evaluate, deploy, status
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ import sys
19
+ from datetime import datetime, timezone
20
+ from pathlib import Path
21
+ from typing import Any, Callable
22
+
23
+ import click
24
+ from rich.console import Console
25
+ from rich.panel import Panel
26
+
27
+ from . import __version__
28
+ from .agents.advise import AdvisorParseError, get_recommendation
29
+ from .context import ProjectContext, ProjectExistsError, ProjectNotFoundError
30
+ from .tools.dataset import analyze_dataset
31
+ from .ui.advise import render_analysis, render_recommendation
32
+
33
+
34
def _force_utf8_stdio() -> None:
    """Switch ``sys.stdout`` and ``sys.stderr`` to UTF-8 where possible.

    Windows consoles default to the system code page (for example
    ``cp1254`` on Turkish locales), which cannot encode the Unicode glyphs
    rich prints for panels, checkmarks and box drawing. Reconfiguring up
    front prevents ``UnicodeEncodeError`` when output is written.

    Streams without a ``reconfigure`` method, or whose ``reconfigure``
    raises ``AttributeError`` / ``ValueError``, are left untouched.
    """
    for stream in (sys.stdout, sys.stderr):
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            try:
                # Unencodable characters are replaced instead of raising.
                reconfigure(encoding="utf-8", errors="replace")
            except (AttributeError, ValueError):
                # Best effort only: keep the stream's current encoding.
                pass
50
+
51
+
52
+ _force_utf8_stdio()
53
+ console = Console()
54
+
55
+
56
@click.group(
    context_settings={"help_option_names": ["-h", "--help"]},
    help="mlcompass — your AI ML engineer at every pipeline stage.",
)
@click.version_option(__version__, prog_name="mlcompass")
def cli() -> None:
    """Top-level ``mlcompass`` command group that subcommands attach to."""
63
+
64
+
65
@cli.command(help="Initialize a new mlcompass project.")
@click.argument("name")
@click.option(
    "--path",
    "parent_dir",
    type=click.Path(file_okay=False, dir_okay=True, path_type=Path),
    default=Path("."),
    show_default=True,
    help="Directory in which .mlcompass/ will be created.",
)
@click.option(
    "--default-model",
    default="claude-opus-4-7",
    show_default=True,
    help="Default LLM model for this project.",
)
def init(name: str, parent_dir: Path, default_model: str) -> None:
    """Set up ``parent_dir/.mlcompass/`` and print a confirmation panel.

    Exits with status 1 (after printing the error) when a project already
    exists at the target location.
    """
    try:
        ctx = ProjectContext.init(
            name=name,
            parent_dir=parent_dir,
            default_model=default_model,
        )
    except ProjectExistsError as e:
        console.print(f"[red]✗[/red] {e}")
        raise SystemExit(1) from e

    # Assemble the panel body first so the print call stays readable.
    panel_body = "".join(
        (
            f"[green]✓[/green] Project [bold]{name}[/bold] initialized.\n\n",
            f"Directory: [cyan]{ctx.path}[/cyan]\n",
            f"Model: {default_model}\n\n",
            "Next: [yellow]mlcompass advise <data.csv>[/yellow]",
        )
    )
    confirmation = Panel.fit(
        panel_body,
        title="mlcompass init",
        border_style="green",
    )
    console.print(confirmation)
103
+
104
+
105
@cli.command(
    help="Analyze a dataset and recommend models, features, and pitfalls."
)
@click.argument(
    "dataset_path",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
@click.option(
    "--target",
    "target_column",
    default=None,
    help="Target column name (auto-detected from name conventions if omitted).",
)
@click.option(
    "--sample-rows",
    type=int,
    default=None,
    help="Limit analysis to the first N rows (useful for very large files).",
)
@click.option(
    "--no-llm",
    is_flag=True,
    help="Skip the LLM advisor step; show only the deterministic analysis.",
)
@click.option(
    "--model",
    "advisor_model",
    default="claude-opus-4-7",
    show_default=True,
    help="Claude model used by the advisor.",
)
def advise(
    dataset_path: Path,
    target_column: str | None,
    sample_rows: int | None,
    no_llm: bool,
    advisor_model: str,
) -> None:
    """Analyze the dataset, optionally consult the advisor, persist results.

    Steps, in order:

    1. Look for an enclosing project (absence only disables persistence).
    2. Run and render the deterministic analysis.
    3. Unless ``--no-llm`` is given or no API key is configured, call the
       advisor and render its recommendation.
    4. If a project was found, store the analysis and recommendation in it.
    """
    project = _try_load_project()

    with console.status("[cyan]Analyzing dataset...[/cyan]", spinner="dots"):
        analysis = analyze_dataset(
            dataset_path,
            target_column=target_column,
            sample_rows=sample_rows,
        )
    render_analysis(console, analysis)

    # Stays None whenever the advisor is skipped or its reply is unusable.
    recommendation: dict[str, Any] | None = None

    if no_llm:
        console.print("\n[dim](--no-llm specified; skipping advisor)[/dim]")
    elif _has_api_key():
        recommendation = _run_advisor(analysis, model=advisor_model)
        if recommendation is not None:
            render_recommendation(console, recommendation)
    else:
        console.print(
            "\n[yellow]⚠ ANTHROPIC_API_KEY not set; "
            "skipping advisor step. Set the variable to enable.[/yellow]"
        )

    if project is None:
        # Standalone run: nothing to persist.
        return

    _persist_advise_result(
        project=project,
        dataset_path=dataset_path,
        analysis=analysis,
        recommendation=recommendation,
    )
176
+
177
+
178
+ # --------------------------------------------------------------------------- #
179
+ # advise helpers (split out so tests can monkeypatch them) #
180
+ # --------------------------------------------------------------------------- #
181
+
182
+
183
def _try_load_project() -> ProjectContext | None:
    """Return the enclosing project, or ``None`` when there is none.

    When no project is found a dim notice is printed telling the user that
    the command runs standalone and nothing will be persisted.
    """
    try:
        project = ProjectContext.load()
    except ProjectNotFoundError:
        console.print(
            "[dim](no .mlcompass/ project found in current path — "
            "running standalone; results will not be persisted)[/dim]\n"
        )
        project = None
    return project
193
+
194
+
195
def _has_api_key() -> bool:
    """Report whether ``ANTHROPIC_API_KEY`` is set to a non-empty value."""
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    return True if api_key else False
198
+
199
+
200
+ # Indirection point: tests monkeypatch ``cli._advisor_callable`` to inject a
201
+ # fake instead of calling the real ``get_recommendation``.
202
+ _advisor_callable: Callable[..., dict[str, Any]] = get_recommendation
203
+
204
+
205
def _run_advisor(
    analysis: dict[str, Any],
    *,
    model: str,
) -> dict[str, Any] | None:
    """Call the advisor behind a spinner and absorb parse failures.

    Args:
        analysis: Dataset analysis passed to the advisor callable.
        model: Model name passed to the advisor callable.

    Returns:
        The advisor's recommendation, or ``None`` when its reply raised
        ``AdvisorParseError`` (the error is printed, not propagated). Any
        other exception propagates to the caller.
    """
    try:
        with console.status(
            "[cyan]Consulting model advisor...[/cyan]",
            spinner="dots",
        ):
            # Looked up at call time so tests can swap ``_advisor_callable``.
            recommendation = _advisor_callable(analysis, model=model)
    except AdvisorParseError as exc:
        console.print(f"\n[red]✗ Advisor returned an invalid response:[/red] {exc}")
        return None
    else:
        return recommendation
220
+
221
+
222
def _top_model_name(recommendation: dict[str, Any] | None) -> str:
    """Return a display name for the first recommended model.

    The advisor reply is only validated for the presence of ``models``, not
    for its shape, so this tolerates anything the model may have produced
    instead of raising while results are being saved.
    """
    models = recommendation.get("models") if recommendation else None
    if not models:
        return "n/a (LLM advisor skipped)"

    first = models[0] if isinstance(models, (list, tuple)) else None
    if isinstance(first, dict):
        return first.get("name", "n/a")
    if isinstance(first, str):
        # A bare list of names such as ["XGBoost", ...].
        return first
    return "n/a"


def _persist_advise_result(
    *,
    project: ProjectContext,
    dataset_path: Path,
    analysis: dict[str, Any],
    recommendation: dict[str, Any] | None,
) -> None:
    """Store the outcome of an ``advise`` run inside the project.

    Registers the dataset, updates the project context, appends a decision
    entry and an advice-log entry, then prints a confirmation.

    Args:
        project: Project that receives the results.
        dataset_path: Path of the analyzed dataset file.
        analysis: Dataset analysis. It must contain ``task_hint`` and
            ``target_hint`` mappings (a missing key raises ``KeyError``).
        recommendation: Advisor output, or ``None`` when the advisor was
            skipped or failed. A malformed ``models`` value is tolerated
            and reported as ``"n/a"``.
    """
    fingerprint = project.register_dataset(
        dataset_path,
        meta={"analysis": analysis},
    )

    project.write_context(
        {
            "active_dataset": f"datasets/{fingerprint}.json",
            "project_type": analysis["task_hint"].get("type"),
            "target_column": analysis["target_hint"].get("column"),
        }
    )

    top_model = _top_model_name(recommendation)

    project.append_decision(
        command="advise",
        summary=f"Top model recommendation: {top_model}",
        reasoning=(
            f"Dataset: {dataset_path}, "
            f"task: {analysis['task_hint'].get('type', 'unknown')}"
        ),
    )

    _append_advice_log(project, dataset_path, analysis, recommendation)

    console.print(f"\n[green]✓[/green] Saved to {project.path}")
261
+
262
+
263
def _append_advice_log(
    project: ProjectContext,
    dataset_path: Path,
    analysis: dict[str, Any],
    recommendation: dict[str, Any] | None,
) -> None:
    """Write one JSON line describing this run to ``advice.log``.

    The entry holds a UTC timestamp, the dataset path, the task type and
    target column taken from ``analysis``, and the recommendation as
    received (``None`` when the advisor did not produce one). The file lives
    directly under ``project.path`` and is opened in append mode.
    """
    record = dict(
        timestamp=datetime.now(timezone.utc).isoformat(),
        dataset=str(dataset_path),
        task_type=analysis["task_hint"].get("type"),
        target=analysis["target_hint"].get("column"),
        recommendation=recommendation,
    )
    serialized = json.dumps(record) + "\n"

    with (project.path / "advice.log").open("a", encoding="utf-8") as log_file:
        log_file.write(serialized)
280
+
281
+
282
+ # --------------------------------------------------------------------------- #
283
+ # Entry point #
284
+ # --------------------------------------------------------------------------- #
285
+
286
+
287
def main() -> None:
    """Run the click command group; target of the ``mlcompass`` script."""
    cli()
290
+
291
+
292
+ if __name__ == "__main__": # pragma: no cover
293
+ main()
mlcompass/context.py ADDED
@@ -0,0 +1,222 @@
1
+ """Project context: persistent state stored in ``.mlcompass/``.
2
+
3
+ The directory layout is described in ``ARCHITECTURE.md §2``.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import hashlib
9
+ import json
10
+ from dataclasses import dataclass
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import yaml
16
+
17
+ from . import __version__
18
+
19
+ DEFAULT_PROJECT_DIR = ".mlcompass"
20
+
21
+
22
class ProjectExistsError(FileExistsError):
    """Signals that ``init`` found an existing ``.mlcompass/`` at the target."""
24
+
25
+
26
class ProjectNotFoundError(FileNotFoundError):
    """Signals that ``load`` could not locate any ``.mlcompass/`` directory."""
28
+
29
+
30
@dataclass
class ProjectContext:
    """Handle to one ``.mlcompass/`` project directory on disk.

    Layout::

        .mlcompass/
        ├── project.yaml   # static metadata
        ├── context.json   # dynamic state (decisions, recommendations)
        ├── datasets/      # registered datasets
        ├── runs/          # training run history
        ├── advice.log     # advisor recommendation history
        └── cache/         # tool result + LLM prompt cache
    """

    path: Path  # the ``.mlcompass/`` directory itself

    # ---------------------- Construction ----------------------

    @classmethod
    def init(
        cls,
        name: str,
        parent_dir: Path | str = ".",
        *,
        default_model: str = "claude-opus-4-7",
    ) -> "ProjectContext":
        """Create a fresh ``.mlcompass/`` directory below ``parent_dir``.

        Args:
            name: Human-readable project name (e.g., ``"churn-model"``).
            parent_dir: Directory that will contain ``.mlcompass/``.
            default_model: Default LLM model name recorded for the project.

        Returns:
            A ``ProjectContext`` pointing at the new directory.

        Raises:
            ProjectExistsError: If the target path already exists.
        """
        root = Path(parent_dir).resolve() / DEFAULT_PROJECT_DIR
        if root.exists():
            raise ProjectExistsError(f"Project already exists at {root}")

        # Directory skeleton.
        root.mkdir(parents=True)
        for subdir in ("datasets", "runs", "cache"):
            (root / subdir).mkdir()

        # Static metadata, written once at creation time.
        static_meta = {
            "name": name,
            "created": datetime.now(timezone.utc).isoformat(),
            "mlcompass_version": __version__,
            "default_model": default_model,
        }
        (root / "project.yaml").write_text(
            yaml.safe_dump(static_meta, sort_keys=False),
            encoding="utf-8",
        )

        # Dynamic context starts with every field unset.
        empty_context: dict[str, Any] = {
            "project_type": None,
            "target_column": None,
            "preferred_models": [],
            "active_dataset": None,
            "current_run": None,
            "decisions": [],
        }
        (root / "context.json").write_text(
            json.dumps(empty_context, indent=2),
            encoding="utf-8",
        )

        # The advice log starts as an empty file.
        (root / "advice.log").touch()

        # Keep cache/ and runs/ out of the user's own repository.
        (root / ".gitignore").write_text(
            "cache/\nruns/\n*.pyc\n__pycache__/\n",
            encoding="utf-8",
        )

        return cls(path=root)

    @classmethod
    def load(cls, search_from: Path | str = ".") -> "ProjectContext":
        """Locate a project by walking upwards from ``search_from``.

        The starting directory is checked first, then each of its parents
        up to the filesystem root, the way ``git`` finds a repository.

        Raises:
            ProjectNotFoundError: If none of those directories contains a
                ``.mlcompass/`` directory.
        """
        start = Path(search_from).resolve()
        for directory in (start, *start.parents):
            project_dir = directory / DEFAULT_PROJECT_DIR
            if project_dir.is_dir():
                return cls(path=project_dir)
        raise ProjectNotFoundError(
            f"No {DEFAULT_PROJECT_DIR}/ found at or above {start}. "
            "Run `mlcompass init <name>` to create one."
        )

    # ---------------------- Read / write ----------------------

    @property
    def project_meta(self) -> dict[str, Any]:
        """Static metadata parsed from ``project.yaml`` on every access."""
        raw_yaml = (self.path / "project.yaml").read_text(encoding="utf-8")
        return yaml.safe_load(raw_yaml)

    def read_context(self) -> dict[str, Any]:
        """Return the dynamic context parsed from ``context.json``."""
        raw_json = (self.path / "context.json").read_text(encoding="utf-8")
        return json.loads(raw_json)

    def _save_context(self, context: dict[str, Any]) -> None:
        """Overwrite ``context.json`` with ``context`` (indented JSON)."""
        (self.path / "context.json").write_text(
            json.dumps(context, indent=2),
            encoding="utf-8",
        )

    def write_context(self, updates: dict[str, Any]) -> None:
        """Merge ``updates`` into ``context.json``.

        The merge is shallow: each top-level key in ``updates`` replaces the
        stored value. For the ``decisions`` list prefer ``append_decision``,
        which adds the timestamp for you.
        """
        merged = self.read_context()
        merged.update(updates)
        self._save_context(merged)

    def append_decision(
        self,
        command: str,
        summary: str,
        *,
        reasoning: str = "",
    ) -> None:
        """Add a UTC-timestamped entry to the ``decisions`` list."""
        context = self.read_context()
        decision = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "command": command,
            "summary": summary,
            "reasoning": reasoning,
        }
        # ``setdefault`` covers a context file that lacks the list.
        context.setdefault("decisions", []).append(decision)
        self._save_context(context)

    # ---------------------- Dataset registry ----------------------

    def register_dataset(
        self,
        dataset_path: Path | str,
        meta: dict[str, Any],
    ) -> str:
        """Write dataset metadata to ``datasets/<fingerprint>.json``.

        The fingerprint is derived from the resolved path and the file's
        modification time, so registering an unchanged file again writes to
        the same record file.

        Args:
            dataset_path: Location of the dataset file; it must exist.
            meta: Extra keys merged into the record after ``path`` and
                ``registered_at``.

        Returns:
            The fingerprint (16-character hex).
        """
        resolved = Path(dataset_path).resolve()
        fingerprint = self._fingerprint(resolved)
        record = {
            "path": str(resolved),
            "registered_at": datetime.now(timezone.utc).isoformat(),
            **meta,
        }
        record_file = self.path / "datasets" / f"{fingerprint}.json"
        record_file.write_text(json.dumps(record, indent=2), encoding="utf-8")
        return fingerprint

    @staticmethod
    def _fingerprint(p: Path) -> str:
        """Return the first 16 hex digits of SHA-256 over path + mtime."""
        location = p.resolve()
        modified_ns = p.stat().st_mtime_ns
        digest = hashlib.sha256(f"{location}::{modified_ns}".encode("utf-8"))
        return digest.hexdigest()[:16]