multimodal-mllog 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- multimodal_mllog-0.1.0/PKG-INFO +10 -0
- multimodal_mllog-0.1.0/README.md +96 -0
- multimodal_mllog-0.1.0/mllog/__init__.py +6 -0
- multimodal_mllog-0.1.0/mllog/cli.py +97 -0
- multimodal_mllog-0.1.0/mllog/git_context.py +70 -0
- multimodal_mllog-0.1.0/mllog/mlflow_io.py +126 -0
- multimodal_mllog-0.1.0/mllog/render.py +78 -0
- multimodal_mllog-0.1.0/mllog/schema.py +69 -0
- multimodal_mllog-0.1.0/multimodal_mllog.egg-info/PKG-INFO +10 -0
- multimodal_mllog-0.1.0/multimodal_mllog.egg-info/SOURCES.txt +17 -0
- multimodal_mllog-0.1.0/multimodal_mllog.egg-info/dependency_links.txt +1 -0
- multimodal_mllog-0.1.0/multimodal_mllog.egg-info/entry_points.txt +2 -0
- multimodal_mllog-0.1.0/multimodal_mllog.egg-info/requires.txt +6 -0
- multimodal_mllog-0.1.0/multimodal_mllog.egg-info/top_level.txt +1 -0
- multimodal_mllog-0.1.0/pyproject.toml +23 -0
- multimodal_mllog-0.1.0/setup.cfg +4 -0
- multimodal_mllog-0.1.0/tests/test_mlflow_io.py +77 -0
- multimodal_mllog-0.1.0/tests/test_render.py +26 -0
- multimodal_mllog-0.1.0/tests/test_schema.py +47 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: multimodal-mllog
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Experiment logbook pipeline: MLflow runs + git context -> structured logbook entries
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: mlflow
|
|
7
|
+
Requires-Dist: pydantic>=2
|
|
8
|
+
Requires-Dist: click
|
|
9
|
+
Provides-Extra: synthesis
|
|
10
|
+
Requires-Dist: anthropic; extra == "synthesis"
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# multimodal_mllogger
|
|
2
|
+
|
|
3
|
+
Experiment Logbook Pipeline for a researcher finetuning an ML model using coding agents.
|
|
4
|
+
|
|
5
|
+
A researcher finetunes a model inside Claude Code; the training script logs to MLflow. At
|
|
6
|
+
session end, `/mllog` generates a structured, validated **logbook entry** — what was tried,
|
|
7
|
+
what changed in the code, what the metrics were, what likely happened, next steps — and
|
|
8
|
+
stores it back into MLflow as artifacts and tags.
|
|
9
|
+
|
|
10
|
+
```
|
|
11
|
+
researcher → Claude Code → MLflow → logbook entry (triggered by /mllog at session end)
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Full design spec: [BUILD.md](BUILD.md).
|
|
15
|
+
|
|
16
|
+
## Quick start
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
python -m venv .venv
|
|
20
|
+
.venv/Scripts/activate # Windows; on Unix: source .venv/bin/activate
|
|
21
|
+
pip install -e .
|
|
22
|
+
pip install pytest # for the test suite
|
|
23
|
+
|
|
24
|
+
python examples/finetune_demo.py # demo run: 4 params, 2 metrics
|
|
25
|
+
mlflow ui --backend-store-uri sqlite:///mlflow.db # optional: inspect runs
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Then, in a Claude Code session opened in this repo, run **`/mllog`**. It collects ground
|
|
29
|
+
truth via the CLI, fills in the logbook from the live session, and ingests it.
|
|
30
|
+
|
|
31
|
+
## Storage (v0.1)
|
|
32
|
+
|
|
33
|
+
- Tracking backend: **local SQLite**, default `MLFLOW_TRACKING_URI=sqlite:///mlflow.db`
|
|
34
|
+
(serverless; override via the env var). Run artifacts live under `./mlruns/`.
|
|
35
|
+
- `logbook.json` is canonical; `logbook.md` is a deterministic rendering of it — never
|
|
36
|
+
edited or parsed.
|
|
37
|
+
|
|
38
|
+
## CLI
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
mllog get-mlflow-info [--run-id ID | --latest] [--experiment NAME] --json
|
|
42
|
+
# params, metrics, tags, artifacts, UTC start/end of one MLflow run; exits 1 if no run
|
|
43
|
+
|
|
44
|
+
mllog get-git-info [--base REF] --json
|
|
45
|
+
# working-tree diff vs REF (default: HEAD), recent commits (UTC ISO 8601), HEAD sha
|
|
46
|
+
|
|
47
|
+
mllog create-logbook <logbook.json> [--run-id ID | --latest]
|
|
48
|
+
# validate against the schema, render logbook.md, attach both as artifacts,
|
|
49
|
+
# set logbook.* tags; exits 1 on validation failure or missing run
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Schema and guardrails
|
|
53
|
+
|
|
54
|
+
`mllog/schema.py` defines the versioned `LogbookEntry` (pydantic v2). Validators enforce:
|
|
55
|
+
|
|
56
|
+
1. **Grounding** — `hypothesis`, `likely_cause`, and `next_experiment` must each have a key
|
|
57
|
+
in `evidence`.
|
|
58
|
+
2. **Metric references resolve** — every `metric:<name>` evidence value must name a metric
|
|
59
|
+
present in `metrics`.
|
|
60
|
+
3. **Timestamps** — `date` must be a full ISO 8601 UTC timestamp.
|
|
61
|
+
|
|
62
|
+
Evidence values are typed references: `metric:val_acc`, `diff:planner.py@@-120,6`,
|
|
63
|
+
`session:user asked to lower lr` (`video:` / `telemetry:` reserved for later milestones).
|
|
64
|
+
The pipeline never fabricates data: no MLflow run or no git repo means a loud non-zero exit.
|
|
65
|
+
|
|
66
|
+
## Layout
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
mllog/
|
|
70
|
+
├── cli.py # `mllog` entrypoint
|
|
71
|
+
├── schema.py # versioned LogbookEntry + Metric (the contract)
|
|
72
|
+
├── mlflow_io.py # collector: read run; writer: artifacts + tags
|
|
73
|
+
├── git_context.py # collector: diff + recent commits
|
|
74
|
+
├── render.py # LogbookEntry -> markdown (deterministic projection)
|
|
75
|
+
└── synthesis/ # empty in v0; M2 LLM layer (pip install "multimodal-mllog[synthesis]")
|
|
76
|
+
.claude/commands/mllog.md # the /mllog command (thin agent wrapper)
|
|
77
|
+
examples/finetune_demo.py # tiny demo finetune that logs to MLflow
|
|
78
|
+
tests/ # schema, render (golden file), mlflow_io
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Tests
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
python -m pytest -q
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
The deterministic core is tested without any LLM. `tests/test_render.py` includes a
|
|
88
|
+
golden-file test (`tests/golden/logbook.md`) locking the markdown output; if a render
|
|
89
|
+
change is intentional, regenerate the golden file from the `valid_entry` fixture.
|
|
90
|
+
|
|
91
|
+
## Roadmap
|
|
92
|
+
|
|
93
|
+
- **M1 (done):** `get-mlflow-info`, `get-git-info`, `create-logbook`, `/mllog` — full loop on the demo.
|
|
94
|
+
- **M2:** `mllog.synthesis` (Anthropic API, optional extra), `mllog logbook`,
|
|
95
|
+
session-id run selection, multi-run sessions (`related_run_ids`).
|
|
96
|
+
- **M3:** video/telemetry collectors, time alignment, object-storage artifacts.
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""mllog CLI. Fails loudly: non-zero exit when a source is absent or invalid."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import click
|
|
8
|
+
from pydantic import ValidationError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Root click group behind the `mllog` console script; the subcommands in this
# module attach themselves to it with @main.command(...). The docstring below
# doubles as the CLI help text, so it is user-facing.
@click.group()
def main() -> None:
    """Experiment logbook tools: MLflow + git -> validated logbook entries."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@main.command("get-mlflow-info")
@click.option("--run-id", "run_id", default=None, help="Specific MLflow run id.")
@click.option("--latest", is_flag=True, help="Most recently started run in the experiment.")
@click.option("--experiment", "experiment_name", default=None, help="Experiment name (default: Default).")
@click.option("--json", "as_json", is_flag=True, help="Print JSON payload.")
def get_mlflow_info(run_id: str | None, latest: bool, experiment_name: str | None, as_json: bool) -> None:
    """Print params, metrics, tags, artifacts and UTC times for one MLflow run."""
    # The collector shares this command's name, so it is imported under an
    # alias to keep the two apart when reading the body.
    from mllog.mlflow_io import RunNotFoundError
    from mllog.mlflow_io import get_mlflow_info as collect_run_info

    # NOTE: `as_json` is accepted for CLI compatibility but never consulted;
    # the payload is always printed as JSON.
    if not (run_id or latest):
        raise click.UsageError("missing parameters: provide --run-id or --latest")

    try:
        payload = collect_run_info(
            run_id=run_id,
            latest=latest,
            experiment_name=experiment_name,
        )
    except RunNotFoundError as err:
        click.echo(f"error: {err}", err=True)
        sys.exit(1)

    # The human-readable selection notice goes to stderr so that stdout
    # carries nothing but the JSON document.
    notice = f"selected run: {payload['run_id']} (experiment: {payload['experiment_name']})"
    click.echo(notice, err=True)
    click.echo(json.dumps(payload, indent=2))
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@main.command("get-git-info")
@click.option("--base", default="HEAD", help="Diff base ref (default: HEAD).")
@click.option("--json", "as_json", is_flag=True, help="Print JSON payload.")
def get_git_info(base: str, as_json: bool) -> None:
    """Print working-tree diff, recent commits (UTC timestamps), and HEAD sha."""
    from mllog.git_context import GitContextError, get_git_context

    # NOTE: `as_json` is accepted but not consulted; output is always JSON.
    try:
        payload = get_git_context(base=base)
    except GitContextError as err:
        # Collector failures become a one-line stderr message and exit code 1.
        click.echo(f"error: {err}", err=True)
        sys.exit(1)
    else:
        click.echo(json.dumps(payload, indent=2))
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@main.command("create-logbook")
@click.argument("logbook_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option("--run-id", "run_id", default=None, help="Specific MLflow run id.")
@click.option("--latest", is_flag=True, help="Most recently started run in the experiment.")
@click.option("--experiment", "experiment_name", default=None, help="Experiment name (default: Default).")
def verify_log(logbook_path: Path, run_id: str | None, latest: bool, experiment_name: str | None) -> None:
    """Validate logbook JSON, render markdown, attach both to the MLflow run."""
    # Every failure below is reported as a single `error: ...` line on stderr
    # followed by exit code 1, so callers never have to parse a traceback.
    from mllog.mlflow_io import RunNotFoundError, attach_to_mlflow
    from mllog.schema import LogbookEntry

    if not run_id and not latest:
        raise click.UsageError("specify --run-id ID or --latest")

    # Read the file. click only checked that the path exists; it can still be
    # unreadable (permissions) or not valid UTF-8.
    try:
        raw_text = logbook_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        click.echo(f"error: cannot read {logbook_path}: {exc}", err=True)
        sys.exit(1)

    # Parse the logbook JSON.
    try:
        data = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        click.echo(f"error: {logbook_path} is not valid JSON: {exc}", err=True)
        sys.exit(1)

    # Validate against the logbook schema.
    try:
        entry = LogbookEntry.model_validate(data)
    except ValidationError as exc:
        click.echo(f"error: logbook failed schema validation:\n{exc}", err=True)
        sys.exit(1)

    # Attach the validated logbook (JSON + rendered markdown) to the run.
    try:
        result = attach_to_mlflow(
            entry, run_id=run_id, latest=latest, experiment_name=experiment_name
        )
    except RunNotFoundError as exc:
        click.echo(f"error: {exc}", err=True)
        sys.exit(1)

    click.echo(f"logbook ingested into run: {result['run_id']}")
    click.echo(f"artifacts: {', '.join(result['artifacts'])}")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# Allow running this module directly as a script in addition to the
# installed `mllog` console script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Git collector: diff + recent commits as a timestamped, typed payload.
|
|
2
|
+
|
|
3
|
+
Collector symmetry with mlflow_io: explicit inputs, typed payload with UTC
|
|
4
|
+
ISO 8601 timestamps, fails loudly (raises GitContextError) when git/repo is absent.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import subprocess
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class GitContextError(Exception):
    """Signal that the git collector could not gather its payload.

    Raised by this module when git cannot be run or a git command exits
    with a non-zero status.
    """
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _git(*args: str, cwd: str | None = None) -> str:
    """Run ``git <args>`` and return its standard output as text.

    Args:
        *args: Arguments handed to ``git`` as a list (no shell is involved).
        cwd: Directory to run the command in; ``None`` uses the process's
            current working directory.

    Returns:
        The captured stdout, decoded as UTF-8 with undecodable bytes replaced.

    Raises:
        GitContextError: If the process cannot be started (git is not
            installed, ``cwd`` is missing or unusable) or git exits with a
            non-zero status.
    """
    import os  # local import: only needed to diagnose a bad working directory

    try:
        proc = subprocess.run(
            ["git", *args],
            cwd=cwd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
        )
    except FileNotFoundError as exc:
        # subprocess raises FileNotFoundError both for a missing executable
        # and for a missing working directory; report the one that applies.
        if cwd is not None and not os.path.isdir(cwd):
            raise GitContextError(f"working directory not found: {cwd!r}") from exc
        raise GitContextError("git executable not found") from exc
    except OSError as exc:
        # e.g. permission denied, or cwd is not a directory: still surface it
        # as a collector error rather than a raw OSError.
        raise GitContextError(f"could not run git: {exc}") from exc

    if proc.returncode != 0:
        # git normally explains failures on stderr; fall back to stdout.
        raise GitContextError(
            f"git {' '.join(args)} failed: {proc.stderr.strip() or proc.stdout.strip()}"
        )
    return proc.stdout
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_git_context(base: str = "HEAD", cwd: str | None = None, max_commits: int = 10) -> dict:
    """Collect HEAD, a diff against ``base`` and the most recent commits.

    Args:
        base: Ref passed to ``git diff``.
        cwd: Directory the git commands run in (``None`` = current directory).
        max_commits: Upper bound on the number of log entries requested.

    Returns:
        A dict with the keys ``collected_at``, ``diff``, ``recent_commits``
        and ``head``. Each commit is ``{"sha", "timestamp", "subject"}`` with
        the committer date converted to UTC and suffixed with ``Z``.

    Raises:
        GitContextError: Propagated from ``_git`` when a command fails.
    """

    def utc_stamp(moment: datetime) -> str:
        # Second-resolution ISO 8601 in UTC, spelled with a trailing "Z".
        as_utc = moment.astimezone(timezone.utc)
        return as_utc.isoformat(timespec="seconds").replace("+00:00", "Z")

    head_sha = _git("rev-parse", "HEAD", cwd=cwd).strip()
    diff_text = _git("diff", base, cwd=cwd)

    # Fields are separated by the ASCII unit separator (0x1f) so that commit
    # subjects cannot collide with the delimiter; %cI is the committer date
    # in strict ISO 8601.
    log_output = _git(
        "log",
        f"-{max_commits}",
        "--pretty=format:%H%x1f%cI%x1f%s",
        cwd=cwd,
    )

    recent = []
    for record in filter(str.strip, log_output.splitlines()):
        sha, committed_at, subject = record.split("\x1f", 2)
        recent.append(
            {
                "sha": sha,
                "timestamp": utc_stamp(datetime.fromisoformat(committed_at)),
                "subject": subject,
            }
        )

    return {
        "collected_at": utc_stamp(datetime.now(timezone.utc)),
        "diff": diff_text,
        "recent_commits": recent,
        "head": head_sha,
    }
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""MLflow collector (read run -> typed payload) and writer (artifacts + tags).
|
|
2
|
+
|
|
3
|
+
Collector symmetry: explicit inputs, timestamped typed payload, fails loudly
|
|
4
|
+
(raises RunNotFoundError) when the source is absent.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
import tempfile
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
# v0.1 default: local serverless SQLite backend (MLflow's recommended local store).
# Set as a module-level side effect before the `import mlflow` below;
# setdefault leaves a value already present in the environment untouched.
os.environ.setdefault("MLFLOW_TRACKING_URI", "sqlite:///mlflow.db")
|
|
15
|
+
|
|
16
|
+
import mlflow
|
|
17
|
+
from mlflow.tracking import MlflowClient
|
|
18
|
+
|
|
19
|
+
from mllog.render import render_markdown
|
|
20
|
+
from mllog.schema import LogbookEntry
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RunNotFoundError(Exception):
    """Signal that the requested MLflow run (or its experiment) does not exist.

    Also raised by ``get_mlflow_info`` when neither a run id nor ``latest``
    was supplied.
    """
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _ms_to_utc_iso(ms: int | None) -> str | None:
|
|
28
|
+
if ms is None:
|
|
29
|
+
return None
|
|
30
|
+
return (
|
|
31
|
+
datetime.fromtimestamp(ms / 1000, tz=timezone.utc)
|
|
32
|
+
.isoformat(timespec="seconds")
|
|
33
|
+
.replace("+00:00", "Z")
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _resolve_experiment(client: MlflowClient, experiment_name: str | None):
    """Look up an experiment by name, falling back to ``"Default"``.

    Args:
        client: Client used for the lookup.
        experiment_name: Name to resolve; ``None`` selects ``"Default"``.

    Returns:
        Whatever ``client.get_experiment_by_name`` returned for that name.

    Raises:
        RunNotFoundError: If the lookup returns ``None``.
    """
    name = "Default" if experiment_name is None else experiment_name
    found = client.get_experiment_by_name(name)
    if found is not None:
        return found
    raise RunNotFoundError(f"MLflow experiment not found: {name!r}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_mlflow_info(
    run_id: str | None = None,
    latest: bool = False,
    experiment_name: str | None = None,
) -> dict:
    """Describe one MLflow run as a plain, timestamped dict.

    Args:
        run_id: Run to read. Takes precedence over ``latest`` when given.
        latest: Select the run with the newest start time in the experiment.
        experiment_name: Experiment searched when ``latest`` is used
            (``None`` resolves to ``"Default"``).

    Returns:
        A dict with ``collected_at``, ``run_id``, ``experiment_id``,
        ``experiment_name``, ``status``, ``start_time``, ``end_time``,
        ``params``, ``metrics``, ``tags`` and ``artifacts`` (paths listed at
        the run's artifact root).

    Raises:
        RunNotFoundError: If the run or experiment cannot be found, the
            experiment has no runs, or neither selector was supplied.
    """
    client = MlflowClient()

    if run_id is None and not latest:
        raise RunNotFoundError("specify --run-id or --latest")

    if run_id is not None:
        try:
            run = client.get_run(run_id)
        except Exception as exc:
            # Any lookup failure is reported as "not found", with the
            # underlying message appended for diagnosis.
            raise RunNotFoundError(f"MLflow run not found: {run_id!r} ({exc})") from exc
    else:
        searched = _resolve_experiment(client, experiment_name)
        newest_first = client.search_runs(
            [searched.experiment_id],
            order_by=["attributes.start_time DESC"],
            max_results=1,
        )
        if not newest_first:
            raise RunNotFoundError(f"no runs found in experiment {searched.name!r}")
        run = newest_first[0]

    # Re-read the experiment from the run itself so the reported name is
    # correct even when the run was selected by id.
    owning_experiment = client.get_experiment(run.info.experiment_id)
    artifact_paths = [item.path for item in client.list_artifacts(run.info.run_id)]
    collected_at = (
        datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00", "Z")
    )

    return {
        "collected_at": collected_at,
        "run_id": run.info.run_id,
        "experiment_id": run.info.experiment_id,
        "experiment_name": owning_experiment.name,
        "status": run.info.status,
        "start_time": _ms_to_utc_iso(run.info.start_time),
        "end_time": _ms_to_utc_iso(run.info.end_time),
        "params": dict(run.data.params),
        "metrics": dict(run.data.metrics),
        "tags": dict(run.data.tags),
        "artifacts": artifact_paths,
    }
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def attach_to_mlflow(
    entry: LogbookEntry,
    run_id: str | None = None,
    latest: bool = False,
    experiment_name: str | None = None,
) -> dict:
    """Store a logbook entry on an MLflow run as two artifacts plus tags.

    The target run is resolved through ``get_mlflow_info`` using the same
    selectors. ``logbook.json`` (the dumped entry) and ``logbook.md`` (its
    markdown rendering) are logged as artifacts, then the ``logbook.*`` tags
    are set.

    Returns:
        ``{"run_id": <resolved run id>, "artifacts": ["logbook.json", "logbook.md"]}``.

    Raises:
        RunNotFoundError: Propagated from ``get_mlflow_info``.
    """
    target_run_id = get_mlflow_info(
        run_id=run_id, latest=latest, experiment_name=experiment_name
    )["run_id"]
    client = MlflowClient()

    # Stage both files in a throwaway directory; log_artifact uploads by
    # path, and the directory is removed when the block exits.
    with tempfile.TemporaryDirectory() as staging:
        staging_dir = Path(staging)
        documents = (
            ("logbook.json", json.dumps(entry.model_dump(), indent=2, ensure_ascii=False)),
            ("logbook.md", render_markdown(entry)),
        )
        staged_paths = []
        for filename, text in documents:
            target = staging_dir / filename
            target.write_text(text, encoding="utf-8")
            staged_paths.append(target)
        for staged in staged_paths:
            client.log_artifact(target_run_id, str(staged))

    tags = {
        "logbook.generated": "true",
        "logbook.task": entry.task,
        "logbook.result": entry.result,
        "logbook.schema_version": entry.schema_version,
    }
    for key, value in tags.items():
        client.set_tag(target_run_id, key, value)

    return {"run_id": target_run_id, "artifacts": ["logbook.json", "logbook.md"]}
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Deterministic LogbookEntry -> markdown projection.
|
|
2
|
+
|
|
3
|
+
logbook.md is never edited or parsed; to change it, amend the JSON and re-render.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from mllog.schema import LogbookEntry
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def render_markdown(entry: LogbookEntry) -> str:
    """Project a logbook entry onto its markdown representation.

    The output is a pure function of ``entry``: a title and metadata bullet
    list, followed by the sections Prompt, Hypothesis, Code changes, Metrics
    (as a table), Result, Observations, Likely cause, Next experiment and
    Evidence (claims in sorted order). Every section ends with a blank line,
    so the returned text ends with a single newline.
    """

    def section(title: str, body: list[str]) -> list[str]:
        # Heading, blank line, body lines, blank line.
        return [f"## {title}", "", *body, ""]

    # --- title + metadata bullets -------------------------------------
    out: list[str] = [
        f"# Logbook: {entry.task}",
        "",
        f"- **Date:** {entry.date}",
        f"- **Experiment:** {entry.experiment_id}",
        f"- **MLflow run:** `{entry.mlflow_run_id}`",
    ]
    if entry.related_run_ids:
        related = ", ".join(f"`{r}`" for r in entry.related_run_ids)
        out.append("- **Related runs:** " + related)
    if entry.git_before or entry.git_after:
        # A missing side of the range is shown as "?".
        out.append(
            f"- **Git:** `{entry.git_before or '?'}` -> `{entry.git_after or '?'}`"
        )
    out += [f"- **Schema version:** {entry.schema_version}", ""]

    # --- body sections -------------------------------------------------
    metric_table = ["| Name | Value | Unit |", "|------|-------|------|"]
    metric_table += [f"| {m.name} | {m.value} | {m.unit or ''} |" for m in entry.metrics]
    evidence_items = [
        f"- **{claim}** -> `{entry.evidence[claim]}`" for claim in sorted(entry.evidence)
    ]

    out += section("Prompt", [entry.prompt])
    out += section("Hypothesis", [entry.hypothesis])
    out += section("Code changes", [f"- {change}" for change in entry.code_changes])
    out += section("Metrics", metric_table)
    out += section("Result", [entry.result])
    out += section("Observations", [f"- {obs}" for obs in entry.observations])
    out += section("Likely cause", [entry.likely_cause])
    out += section("Next experiment", [entry.next_experiment])
    out += section("Evidence", evidence_items)

    return "\n".join(out)
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Versioned logbook schema. This is the contract; validators enforce grounding."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field, model_validator
|
|
6
|
+
|
|
7
|
+
# Field names that must each appear as a key in `LogbookEntry.evidence`
# (checked by the grounding validator).
GROUNDED_FIELDS = ("hypothesis", "likely_cause", "next_experiment")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Metric(BaseModel):
    """One named numeric measurement recorded in a logbook entry."""

    # Metric identifier; `metric:<name>` evidence references are resolved
    # against this field.
    name: str
    # Numeric value of the metric.
    value: float
    # Optional unit label; rendered as an empty table cell when absent.
    unit: str | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LogbookEntry(BaseModel):
    """Versioned logbook entry: the contract between the agent and the pipeline.

    Validation (run after field parsing) enforces three guardrails:

    1. ``date`` is a timezone-aware ISO 8601 timestamp in UTC.
    2. ``evidence`` has a key for every name in ``GROUNDED_FIELDS``.
    3. Every ``metric:<name>`` evidence value names a metric in ``metrics``.

    Other reference kinds (``diff:``, ``session:``, ...) are not resolved at
    schema level.
    """

    schema_version: str = "1.0"
    experiment_id: str
    date: str  # full ISO 8601 UTC timestamp, e.g. "2026-06-09T18:45:01Z"
    task: str
    prompt: str
    hypothesis: str
    code_changes: list[str]
    metrics: list[Metric]
    result: str
    observations: list[str]
    likely_cause: str
    next_experiment: str
    evidence: dict[str, str]  # claim -> typed reference (metric:, diff:, session:, ...)
    mlflow_run_id: str
    related_run_ids: list[str] = Field(default_factory=list)  # RESERVED: multi-run sessions (M2+)
    git_before: str | None = None
    git_after: str | None = None
    extra_inputs: dict = Field(default_factory=dict)  # RESERVED for video/telemetry; {} in v0

    @model_validator(mode="after")
    def _validate_date_iso8601(self) -> "LogbookEntry":
        """Reject dates that are not timezone-aware UTC ISO 8601 timestamps."""
        try:
            parsed = datetime.fromisoformat(self.date.replace("Z", "+00:00"))
        except ValueError as exc:
            raise ValueError(
                f"date must be a full ISO 8601 timestamp, got {self.date!r}: {exc}"
            ) from exc
        # fromisoformat also accepts date-only and naive values (both parse to
        # a naive datetime) and arbitrary offsets; none of those is the
        # documented "full ISO 8601 UTC timestamp".
        offset = parsed.utcoffset()
        if offset is None:
            raise ValueError(
                "date must be a full ISO 8601 UTC timestamp with a time and a "
                f"'Z' or '+00:00' suffix, got {self.date!r}"
            )
        if offset:  # a zero timedelta is falsy, so this means "not UTC"
            raise ValueError(
                f"date must be expressed in UTC ('Z' or '+00:00'), got {self.date!r}"
            )
        return self

    @model_validator(mode="after")
    def _validate_grounding(self) -> "LogbookEntry":
        """Require an evidence key for each grounded claim."""
        missing = [f for f in GROUNDED_FIELDS if f not in self.evidence]
        if missing:
            raise ValueError(
                "evidence must contain a key for each grounded claim; missing: "
                + ", ".join(missing)
            )
        return self

    @model_validator(mode="after")
    def _validate_metric_references(self) -> "LogbookEntry":
        """Require every ``metric:<name>`` reference to resolve in ``metrics``."""
        metric_names = {m.name for m in self.metrics}
        bad = [
            ref
            for ref in self.evidence.values()
            if ref.startswith("metric:") and ref.removeprefix("metric:") not in metric_names
        ]
        if bad:
            raise ValueError(
                "evidence metric references must name a metric present in `metrics`; "
                "unresolved: " + ", ".join(bad)
            )
        return self
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: multimodal-mllog
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Experiment logbook pipeline: MLflow runs + git context -> structured logbook entries
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: mlflow
|
|
7
|
+
Requires-Dist: pydantic>=2
|
|
8
|
+
Requires-Dist: click
|
|
9
|
+
Provides-Extra: synthesis
|
|
10
|
+
Requires-Dist: anthropic; extra == "synthesis"
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
mllog/__init__.py
|
|
4
|
+
mllog/cli.py
|
|
5
|
+
mllog/git_context.py
|
|
6
|
+
mllog/mlflow_io.py
|
|
7
|
+
mllog/render.py
|
|
8
|
+
mllog/schema.py
|
|
9
|
+
multimodal_mllog.egg-info/PKG-INFO
|
|
10
|
+
multimodal_mllog.egg-info/SOURCES.txt
|
|
11
|
+
multimodal_mllog.egg-info/dependency_links.txt
|
|
12
|
+
multimodal_mllog.egg-info/entry_points.txt
|
|
13
|
+
multimodal_mllog.egg-info/requires.txt
|
|
14
|
+
multimodal_mllog.egg-info/top_level.txt
|
|
15
|
+
tests/test_mlflow_io.py
|
|
16
|
+
tests/test_render.py
|
|
17
|
+
tests/test_schema.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mllog
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "multimodal-mllog"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Experiment logbook pipeline: MLflow runs + git context -> structured logbook entries"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"mlflow",
|
|
12
|
+
"pydantic>=2",
|
|
13
|
+
"click",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
synthesis = ["anthropic"]
|
|
18
|
+
|
|
19
|
+
[project.scripts]
|
|
20
|
+
mllog = "mllog.cli:main"
|
|
21
|
+
|
|
22
|
+
[tool.setuptools.packages.find]
|
|
23
|
+
include = ["mllog*"]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
import mlflow
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from mllog.mlflow_io import RunNotFoundError, attach_to_mlflow, get_mlflow_info
|
|
8
|
+
|
|
9
|
+
# Second-resolution UTC timestamp with a literal "Z" suffix, e.g. 2026-06-09T18:45:01Z.
ISO_UTC = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.fixture
def tracking(tmp_path, monkeypatch):
    """Point MLflow at a fresh SQLite store inside the test's temp directory."""
    # Run from tmp_path so the relative ./mlruns artifact root lands there too.
    monkeypatch.chdir(tmp_path)
    db_file = tmp_path / "mlflow.db"
    uri = f"sqlite:///{db_file.as_posix()}"
    monkeypatch.setenv("MLFLOW_TRACKING_URI", uri)
    mlflow.set_tracking_uri(uri)
    yield uri
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@pytest.fixture
def demo_run(tracking):
    """Create one finished run with two params and two metrics; return its id."""
    with mlflow.start_run(run_name="test-run") as active:
        mlflow.log_params({"learning_rate": 3e-4, "epochs": 5})
        for metric_name, metric_value in (("train_loss", 0.271), ("val_acc", 0.86)):
            mlflow.log_metric(metric_name, metric_value)
        created_id = active.info.run_id
    return created_id
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_run_info_latest(demo_run):
    """--latest selects the demo run and returns its params, metrics and UTC times."""
    payload = get_mlflow_info(latest=True)

    assert payload["run_id"] == demo_run
    # MLflow stores params as strings.
    assert payload["params"] == {"learning_rate": "0.0003", "epochs": "5"}
    assert payload["metrics"] == {"train_loss": 0.271, "val_acc": 0.86}
    for stamp_key in ("start_time", "collected_at"):
        assert ISO_UTC.match(payload[stamp_key])
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_run_info_by_id(demo_run):
    """Selecting by run id reports that run and the name of its experiment."""
    payload = get_mlflow_info(run_id=demo_run)
    assert payload["run_id"] == demo_run
    assert payload["experiment_name"] == "Default"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_run_info_no_runs_fails_loudly(tracking):
    """With an empty store, --latest must raise instead of inventing a run."""
    with pytest.raises(RunNotFoundError):
        get_mlflow_info(latest=True)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_run_info_bad_run_id_fails_loudly(tracking):
    """An unknown run id is reported as RunNotFoundError."""
    with pytest.raises(RunNotFoundError):
        get_mlflow_info(run_id="nonexistent")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_ingest_attaches_artifacts_and_tags(demo_run, valid_entry):
    """Ingesting an entry logs both artifacts, sets the logbook.* tags, and
    stores JSON that round-trips the entry's run id."""
    entry = valid_entry.model_copy(update={"mlflow_run_id": demo_run})
    result = attach_to_mlflow(entry, run_id=demo_run)
    assert result["run_id"] == demo_run

    client = mlflow.tracking.MlflowClient()
    artifacts = {f.path for f in client.list_artifacts(demo_run)}
    assert {"logbook.json", "logbook.md"} <= artifacts

    run = client.get_run(demo_run)
    assert run.data.tags["logbook.generated"] == "true"
    assert run.data.tags["logbook.task"] == entry.task
    assert run.data.tags["logbook.result"] == entry.result
    assert run.data.tags["logbook.schema_version"] == "1.0"

    json_path = client.download_artifacts(demo_run, "logbook.json")
    # Read through a context manager so the handle is closed deterministically
    # (an unclosed handle can also block temp-dir cleanup on Windows).
    with open(json_path, encoding="utf-8") as handle:
        stored = json.load(handle)
    assert stored["mlflow_run_id"] == demo_run
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def test_ingest_missing_run_fails_loudly(tracking, valid_entry):
    """Attaching to an unknown run id raises RunNotFoundError."""
    with pytest.raises(RunNotFoundError):
        attach_to_mlflow(valid_entry, run_id="nonexistent")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from mllog.render import render_markdown
|
|
4
|
+
|
|
5
|
+
# Expected markdown for the `valid_entry` fixture, compared verbatim in test_golden_file.
GOLDEN = Path(__file__).parent / "golden" / "logbook.md"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_render_is_deterministic(valid_entry):
    """Rendering the same entry twice yields identical markdown."""
    first = render_markdown(valid_entry)
    second = render_markdown(valid_entry)
    assert first == second
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_render_contains_metrics_and_evidence(valid_entry):
    """Metric rows, an evidence reference and the fixture's abc123 id all appear."""
    rendered = render_markdown(valid_entry)
    expected_fragments = (
        "| train_loss | 0.271 |",
        "| val_acc | 0.86 |",
        "`metric:train_loss`",
        "abc123",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_golden_file(valid_entry):
    """Pin the markdown rendering of the fixed ``valid_entry`` fixture.

    When a rendering change is intentional, regenerate
    ``tests/golden/logbook.md`` from the ``valid_entry`` fixture (see the
    README) or edit the golden file to match.
    """
    expected = GOLDEN.read_text(encoding="utf-8")
    assert render_markdown(valid_entry) == expected
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from pydantic import ValidationError
|
|
3
|
+
|
|
4
|
+
from mllog.schema import LogbookEntry
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_valid_entry_passes(valid_entry):
    """The fixture validates and the reserved fields take their defaults."""
    observed = (
        valid_entry.schema_version,
        valid_entry.related_run_ids,
        valid_entry.extra_inputs,
    )
    assert observed == ("1.0", [], {})
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@pytest.mark.parametrize("missing", ["hypothesis", "likely_cause", "next_experiment"])
def test_grounding_required_for_each_claim(valid_entry, missing):
    """Dropping the evidence key of any grounded claim fails validation and
    the error names the missing claim."""
    payload = valid_entry.model_dump()
    payload["evidence"].pop(missing)
    with pytest.raises(ValidationError, match=missing):
        LogbookEntry.model_validate(payload)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_metric_reference_must_resolve(valid_entry):
    """A ``metric:`` reference to an unknown metric is rejected by name."""
    unresolved = "metric:does_not_exist"
    payload = valid_entry.model_dump()
    payload["evidence"]["likely_cause"] = unresolved
    with pytest.raises(ValidationError, match=unresolved):
        LogbookEntry.model_validate(payload)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_non_metric_references_not_resolved_at_schema_level(valid_entry):
    """``diff:`` and ``session:`` references are accepted without resolution."""
    payload = valid_entry.model_dump()
    payload["evidence"].update(
        {
            "likely_cause": "diff:anything.py@@-1,1",
            "next_experiment": "session:we discussed it",
        }
    )
    # Must validate cleanly: only metric: references are checked by the schema.
    LogbookEntry.model_validate(payload)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@pytest.mark.parametrize("bad_date", ["2026-13-40", "yesterday", "", "18:45"])
def test_date_must_be_iso8601(valid_entry, bad_date):
    """Malformed or non-timestamp ``date`` values fail validation."""
    payload = {**valid_entry.model_dump(), "date": bad_date}
    with pytest.raises(ValidationError):
        LogbookEntry.model_validate(payload)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_date_accepts_z_suffix_and_offset(valid_entry):
    """Both spellings of UTC, ``Z`` and ``+00:00``, are accepted."""
    base = valid_entry.model_dump()
    accepted = ("2026-06-09T18:45:01Z", "2026-06-09T18:45:01+00:00")
    for stamp in accepted:
        LogbookEntry.model_validate({**base, "date": stamp})
|