mdmp-protocol 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdmp_ai/__init__.py +3 -0
- mdmp_ai/lineage.py +198 -0
- mdmp_core/__init__.py +48 -0
- mdmp_core/cli.py +346 -0
- mdmp_core/contracts.py +133 -0
- mdmp_core/fingerprint.py +84 -0
- mdmp_core/fingerprint_store.py +42 -0
- mdmp_core/hf.py +60 -0
- mdmp_core/registry.py +184 -0
- mdmp_core/runner.py +196 -0
- mdmp_core/visualizer.py +60 -0
- mdmp_flavors/__init__.py +67 -0
- mdmp_protocol-0.1.0.dist-info/METADATA +109 -0
- mdmp_protocol-0.1.0.dist-info/RECORD +18 -0
- mdmp_protocol-0.1.0.dist-info/WHEEL +5 -0
- mdmp_protocol-0.1.0.dist-info/entry_points.txt +2 -0
- mdmp_protocol-0.1.0.dist-info/licenses/LICENSE +173 -0
- mdmp_protocol-0.1.0.dist-info/top_level.txt +3 -0
mdmp_ai/__init__.py
ADDED
mdmp_ai/lineage.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
import json
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import yaml
|
|
11
|
+
|
|
12
|
+
from mdmp_core.contracts import load_contract
|
|
13
|
+
from mdmp_core.fingerprint import check_fingerprint, compute_fingerprint
|
|
14
|
+
from mdmp_core.runner import ContractRunner
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    current = datetime.now(timezone.utc)
    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class DatasetRef:
    """One dataset entry of a lineage card.

    A plain record: ``to_dict`` and ``from_dict`` convert it to and from the
    mapping stored in a card's ``trained_on`` list.
    """

    path: str
    fingerprint: str
    grade: str
    rows_used: int
    date_validated: str
    consent: str
    contract_fingerprint: str
    expires: str
    status: str = "valid"
    stale_reason: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain mapping with keys in field order."""
        return {
            "path": self.path,
            "fingerprint": self.fingerprint,
            "grade": self.grade,
            "rows_used": self.rows_used,
            "date_validated": self.date_validated,
            "consent": self.consent,
            "contract_fingerprint": self.contract_fingerprint,
            "expires": self.expires,
            "status": self.status,
            "stale_reason": self.stale_reason,
        }

    @classmethod
    def from_dict(cls, payload: Dict[str, Any]) -> "DatasetRef":
        """Build a record from a mapping, tolerating missing or null keys.

        A key that is absent or explicitly ``None`` (for example a YAML
        ``expires: null``) falls back to the field's default instead of
        becoming the literal string ``"None"`` or making ``int(None)`` raise.

        Args:
            payload: Mapping with any subset of the keys written by ``to_dict``.

        Returns:
            A new ``DatasetRef``.

        Raises:
            ValueError: If ``rows_used`` is present but not convertible to int.
        """

        def _text(key: str, default: str) -> str:
            # Treat an explicit null exactly like a missing key.
            value = payload.get(key)
            return default if value is None else str(value)

        rows_raw = payload.get("rows_used")
        return cls(
            path=_text("path", ""),
            fingerprint=_text("fingerprint", ""),
            grade=_text("grade", "draft"),
            rows_used=0 if rows_raw is None else int(rows_raw),
            date_validated=_text("date_validated", ""),
            consent=_text("consent", "not_verified"),
            contract_fingerprint=_text("contract_fingerprint", ""),
            expires=_text("expires", ""),
            status=_text("status", "valid"),
            stale_reason=payload.get("stale_reason"),
        )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class LineageCard:
    """A model's lineage card: model name, training date, and dataset records."""

    model_name: str
    training_date: str
    datasets: List[DatasetRef] = field(default_factory=list)

    @property
    def stale_datasets(self) -> List[DatasetRef]:
        """Dataset records whose ``status`` is ``"stale"``."""
        return [row for row in self.datasets if row.status == "stale"]

    @property
    def lineage_status(self) -> str:
        """``"stale"`` if any dataset record is stale, otherwise ``"valid"``."""
        return "stale" if self.stale_datasets else "valid"

    def to_dict(self) -> Dict[str, Any]:
        """Return the card as a mapping nested under a single ``"model"`` key."""
        stale = self.stale_datasets  # filter once, reuse for both derived fields
        return {
            "model": {
                "name": self.model_name,
                "training_date": self.training_date,
                "lineage_status": self.lineage_status,
                "stale_lineage": bool(stale),
                "stale_datasets": [row.fingerprint for row in stale],
                "trained_on": [row.to_dict() for row in self.datasets],
            }
        }

    @classmethod
    def from_dict(cls, payload: Dict[str, Any]) -> "LineageCard":
        """Build a card from the mapping layout produced by ``to_dict``.

        Malformed input is tolerated: a non-mapping payload, a ``model`` entry
        that is not a mapping (for example ``model: null``), a ``trained_on``
        value that is not a list, and non-mapping list entries are all ignored
        rather than raising.

        Args:
            payload: Parsed card content.

        Returns:
            A new ``LineageCard``. A missing name becomes ``"unknown_model"``
            and a missing training date becomes the current UTC timestamp.
        """
        model = payload.get("model") if isinstance(payload, dict) else None
        if not isinstance(model, dict):
            # Previously only ``trained_on`` was guarded; the later ``model.get``
            # calls raised AttributeError for a non-mapping ``model`` value.
            model = {}

        trained_on = model.get("trained_on", [])
        rows: List[DatasetRef] = []
        if isinstance(trained_on, list):
            rows = [DatasetRef.from_dict(entry) for entry in trained_on if isinstance(entry, dict)]

        # Only compute "now" when the card really lacks a training date.
        if "training_date" in model:
            training_date = str(model["training_date"])
        else:
            training_date = _now_iso()

        return cls(
            model_name=str(model.get("name", "unknown_model")),
            training_date=training_date,
            datasets=rows,
        )

    def export(self, output_path: str | Path) -> Dict[str, Any]:
        """Write the card to ``output_path`` and return the written mapping.

        A ``.yaml`` or ``.yml`` suffix (case-insensitive) selects YAML; any
        other suffix is written as indented JSON. Missing parent directories
        are created.
        """
        payload = self.to_dict()
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)
        if output.suffix.lower() in {".yaml", ".yml"}:
            output.write_text(yaml.safe_dump(payload, sort_keys=False), encoding="utf-8")
        else:
            output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        return payload

    def refresh(self, *, data_dir: Optional[Path] = None) -> "LineageCard":
        """Re-check every dataset record in place and return ``self``.

        For each record the stored path is tried first. If it is not a file
        and ``data_dir`` is given, ``data_dir / path`` is tried. A record whose
        file cannot be found is marked stale with reason
        ``"dataset_not_found"``. Otherwise ``check_fingerprint`` is called and
        its ``status`` and ``stale_reason`` are copied onto the record.
        """
        for dataset in self.datasets:
            resolved_path = Path(dataset.path)
            if not resolved_path.is_file() and data_dir is not None:
                candidate = data_dir / dataset.path
                if candidate.is_file():
                    resolved_path = candidate
            if not resolved_path.is_file():
                dataset.status = "stale"
                dataset.stale_reason = "dataset_not_found"
                continue

            checked = check_fingerprint(
                {
                    "fingerprint": dataset.fingerprint,
                    "expires": dataset.expires,
                    "status": dataset.status,
                    "stale_reason": dataset.stale_reason,
                },
                data_path=resolved_path,
            )
            # A result without a status is treated as stale.
            dataset.status = str(checked.get("status", "stale"))
            dataset.stale_reason = checked.get("stale_reason")
        return self
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def load_lineage_card(path: str | Path) -> LineageCard:
    """Read a lineage card file and return it as a ``LineageCard``.

    Files with a ``.yaml`` or ``.yml`` suffix (case-insensitive) are parsed as
    YAML, with an empty document treated as an empty mapping. Every other
    suffix is parsed as JSON.

    Raises:
        ValueError: If the parsed document root is not a mapping.
    """
    source = Path(path)
    text = source.read_text(encoding="utf-8")

    is_yaml = source.suffix.lower() in {".yaml", ".yml"}
    document = (yaml.safe_load(text) or {}) if is_yaml else json.loads(text)

    if isinstance(document, dict):
        return LineageCard.from_dict(document)
    raise ValueError("Lineage card root must be a mapping")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
@dataclass
class LineageTracker:
    """Collects dataset records and exports them as a lineage card for a model."""

    datasets: list[DatasetRef] = field(default_factory=list)
    model_name: str | None = None

    def register_dataset(
        self,
        dataset_path: str | Path,
        *,
        contract: str | Path,
        expires_days: int = 365,
    ) -> Dict[str, Any]:
        """Validate a CSV dataset against a contract and record the outcome.

        The CSV is loaded with pandas, run through ``ContractRunner`` with the
        loaded contract, and fingerprinted with ``compute_fingerprint``. The
        resulting ``DatasetRef`` is appended to ``self.datasets``.

        Returns:
            The new record as a plain mapping (``DatasetRef.to_dict``).
        """
        csv_path = Path(dataset_path)
        contract_file = Path(contract)

        frame = pd.read_csv(csv_path)
        loaded_contract = load_contract(contract_file)
        outcome = ContractRunner(loaded_contract).run(frame)
        fingerprint_info = compute_fingerprint(csv_path, expires_days=expires_days)

        consent_label = "verified" if outcome.consent_verified else "not_verified"
        entry = DatasetRef(
            path=str(csv_path),
            fingerprint=str(fingerprint_info["fingerprint"]),
            grade=outcome.grade,
            rows_used=outcome.row_count,
            date_validated=outcome.created_utc,
            consent=consent_label,
            contract_fingerprint=f"sha256:{outcome.contract_fingerprint_sha256}",
            expires=str(fingerprint_info["expires"]),
            status=str(fingerprint_info["status"]),
            stale_reason=fingerprint_info.get("stale_reason"),
        )
        self.datasets.append(entry)
        return entry.to_dict()

    def attach_to_model(self, model_name: str) -> None:
        """Remember the model name used by ``export_card``."""
        self.model_name = model_name

    def export_card(self, output_path: str | Path) -> Dict[str, Any]:
        """Write a lineage card for the attached model and return its mapping.

        Raises:
            ValueError: If no model name has been attached yet.
        """
        if self.model_name:
            lineage = LineageCard(
                model_name=self.model_name,
                training_date=_now_iso(),
                datasets=self.datasets,
            )
            return lineage.export(output_path)
        raise ValueError("Call attach_to_model() before export_card()")
|
mdmp_core/__init__.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from .contracts import DataContract, load_contract, parse_contract, save_contract
|
|
2
|
+
from .runner import (
|
|
3
|
+
MDMP_PROTOCOL_VERSION,
|
|
4
|
+
MDMP_GRADE_ORDER,
|
|
5
|
+
ContractRunner,
|
|
6
|
+
ValidationResult,
|
|
7
|
+
dataframe_fingerprint,
|
|
8
|
+
grade_meets_minimum,
|
|
9
|
+
)
|
|
10
|
+
from .fingerprint import compute_fingerprint, check_fingerprint
|
|
11
|
+
from .fingerprint_store import FingerprintStore
|
|
12
|
+
from .registry import (
|
|
13
|
+
REGISTRY_VERSION,
|
|
14
|
+
init_registry,
|
|
15
|
+
load_registry,
|
|
16
|
+
save_registry,
|
|
17
|
+
upsert_record,
|
|
18
|
+
lookup_record,
|
|
19
|
+
list_records,
|
|
20
|
+
)
|
|
21
|
+
from .hf import build_hf_mdmp_section, load_report
|
|
22
|
+
from .visualizer import build_dashboard_html
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"DataContract",
|
|
26
|
+
"load_contract",
|
|
27
|
+
"parse_contract",
|
|
28
|
+
"save_contract",
|
|
29
|
+
"MDMP_PROTOCOL_VERSION",
|
|
30
|
+
"MDMP_GRADE_ORDER",
|
|
31
|
+
"ContractRunner",
|
|
32
|
+
"ValidationResult",
|
|
33
|
+
"dataframe_fingerprint",
|
|
34
|
+
"grade_meets_minimum",
|
|
35
|
+
"compute_fingerprint",
|
|
36
|
+
"check_fingerprint",
|
|
37
|
+
"FingerprintStore",
|
|
38
|
+
"REGISTRY_VERSION",
|
|
39
|
+
"init_registry",
|
|
40
|
+
"load_registry",
|
|
41
|
+
"save_registry",
|
|
42
|
+
"upsert_record",
|
|
43
|
+
"lookup_record",
|
|
44
|
+
"list_records",
|
|
45
|
+
"build_hf_mdmp_section",
|
|
46
|
+
"load_report",
|
|
47
|
+
"build_dashboard_html",
|
|
48
|
+
]
|
mdmp_core/cli.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import json
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import typer
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
from mdmp_ai import LineageTracker, load_lineage_card
|
|
11
|
+
from mdmp_core.contracts import load_contract, parse_contract, save_contract
|
|
12
|
+
from mdmp_core.fingerprint import check_fingerprint, compute_fingerprint
|
|
13
|
+
from mdmp_core.fingerprint_store import FingerprintStore
|
|
14
|
+
from mdmp_core.hf import build_hf_mdmp_section, load_report
|
|
15
|
+
from mdmp_core.registry import (
|
|
16
|
+
init_registry,
|
|
17
|
+
list_records,
|
|
18
|
+
lookup_record,
|
|
19
|
+
upsert_record,
|
|
20
|
+
)
|
|
21
|
+
from mdmp_core.runner import ContractRunner, dataframe_fingerprint
|
|
22
|
+
from mdmp_core.visualizer import build_dashboard_html
|
|
23
|
+
from mdmp_flavors import TEMPLATES
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Root command-line application. Commands registered on ``registry_app`` are
# exposed under the ``registry`` sub-command of ``app``.
app = typer.Typer(help="MDMP CLI - contracts, grading, fingerprints, and lineage")
registry_app = typer.Typer(help="MDMP registry scaffold commands")
app.add_typer(registry_app, name="registry")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ``init``: write a starter contract built from one of the TEMPLATES flavors.
# Documented with comments on purpose: a docstring would become --help text.
@app.command("init")
def init_contract(
    output: Path = typer.Option(Path("mdmp_contract.yaml"), help="Output contract file"),
    flavor: str = typer.Option("health", help="Template flavor: health|finance|industrial"),
) -> None:
    # Flavor lookup ignores surrounding whitespace and letter case.
    flavor_key = flavor.strip().lower()
    if flavor_key in TEMPLATES:
        template = TEMPLATES[flavor_key]()
        save_contract(output, parse_contract(template))
        typer.echo(f"Created contract: {output}")
        return

    allowed = ", ".join(sorted(TEMPLATES))
    raise typer.BadParameter(f"Unknown flavor '{flavor}'. Allowed: {allowed}")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ``validate``: run a contract against a CSV, optionally add a staleness check
# from a fingerprint store, and save the report as JSON.
# Documented with comments on purpose: a docstring would become --help text.
@app.command("validate")
def validate(
    contract_path: Path,
    dataset_csv: Path,
    output_json: Path = typer.Option(Path("results/mdmp_report.json"), help="Output report JSON"),
    fingerprint_store: Path | None = typer.Option(
        None,
        help="Optional fingerprint store JSON for stale/expiry checks",
    ),
) -> None:
    contract = load_contract(contract_path)
    frame = pd.read_csv(dataset_csv)
    result = ContractRunner(contract).run(frame)
    report_data = result.to_dict()

    # Look up a previously stored fingerprint only when a store was supplied.
    stored = None
    if fingerprint_store is not None:
        stored = FingerprintStore(fingerprint_store).get_by_dataset(str(dataset_csv))

    if stored is not None:
        staleness = check_fingerprint(stored, data_path=dataset_csv)
        report_data["staleness"] = staleness
        warning_list = report_data.setdefault("warnings", [])
        if staleness.get("status") == "stale":
            reason = staleness.get("stale_reason", "unknown")
            warning_list.append(
                f"Dataset fingerprint is stale ({reason}). Re-grade required before AI training."
            )

    output_json.parent.mkdir(parents=True, exist_ok=True)
    output_json.write_text(json.dumps(report_data, indent=2), encoding="utf-8")
    typer.echo(f"Saved report: {output_json}")
    typer.echo(f"Grade: {result.grade} | Compliance: {result.compliance_score}%")

    staleness_info = report_data.get("staleness", {})
    if staleness_info.get("status") == "stale":
        typer.echo(f"Staleness: STALE ({staleness_info.get('stale_reason')})")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# ``grade``: print only the grade a contract assigns to a CSV dataset.
# Documented with comments on purpose: a docstring would become --help text.
@app.command("grade")
def grade(contract_path: Path, dataset_csv: Path) -> None:
    loaded_contract = load_contract(contract_path)
    frame = pd.read_csv(dataset_csv)
    outcome = ContractRunner(loaded_contract).run(frame)
    typer.echo(outcome.grade)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# ``fingerprint``: print the ``sha256:``-prefixed fingerprint of a CSV as
# computed by ``dataframe_fingerprint`` on the loaded frame.
# Documented with comments on purpose: a docstring would become --help text.
@app.command("fingerprint")
def fingerprint(dataset_csv: Path) -> None:
    frame = pd.read_csv(dataset_csv)
    digest = dataframe_fingerprint(frame)
    typer.echo(f"sha256:{digest}")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ``fingerprint-record``: compute a fingerprint record for a dataset file, save
# it as JSON, and optionally upsert it into a fingerprint store.
# Documented with comments on purpose: a docstring would become --help text.
@app.command("fingerprint-record")
def fingerprint_record(
    dataset_path: Path,
    output_json: Path = typer.Option(Path("results/fingerprint.json"), help="Output fingerprint JSON"),
    expires_days: int = typer.Option(365, help="Validity period in days"),
    fingerprint_store: Path | None = typer.Option(
        None,
        help="Optional fingerprint store JSON to upsert this record",
    ),
) -> None:
    record = compute_fingerprint(dataset_path, expires_days=expires_days)

    output_json.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(record, indent=2)
    output_json.write_text(serialized, encoding="utf-8")

    typer.echo(f"Saved fingerprint record: {output_json}")
    typer.echo(f"Fingerprint: {record['fingerprint']}")
    typer.echo(f"Expires: {record['expires']}")

    if fingerprint_store is None:
        return
    FingerprintStore(fingerprint_store).upsert(dataset_path=str(dataset_path), record=record)
    typer.echo(f"Updated fingerprint store: {fingerprint_store}")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# ``fingerprint-check``: compare a stored fingerprint record with a dataset
# file, print the result, and optionally save it as JSON.
# Documented with comments on purpose: a docstring would become --help text.
@app.command("fingerprint-check")
def fingerprint_check(
    fingerprint_json: Path,
    dataset_path: Path,
    output_json: Path | None = typer.Option(None, help="Optional output path for check result"),
) -> None:
    stored_text = fingerprint_json.read_text(encoding="utf-8")
    checked = check_fingerprint(json.loads(stored_text), data_path=dataset_path)

    status_text = str(checked.get("status", "")).upper()
    typer.echo(f"Fingerprint: {checked.get('fingerprint')}")
    typer.echo(f"Status: {status_text}")
    typer.echo(f"Reason: {checked.get('stale_reason')}")
    typer.echo(f"Created: {checked.get('created')}")
    typer.echo(f"Expires: {checked.get('expires')}")
    typer.echo(f"Checked at: {checked.get('checked_at')}")

    if output_json is None:
        return
    output_json.parent.mkdir(parents=True, exist_ok=True)
    output_json.write_text(json.dumps(checked, indent=2), encoding="utf-8")
    typer.echo(f"Saved check result: {output_json}")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# ``report``: render a saved report JSON into an HTML dashboard file.
# Documented with comments on purpose: a docstring would become --help text.
@app.command("report")
def report(
    report_json: Path,
    output_html: Path = typer.Option(Path("results/mdmp_dashboard.html"), help="Output dashboard HTML"),
    title: str = typer.Option("MDMP Dashboard", help="HTML title"),
) -> None:
    report_data = json.loads(report_json.read_text(encoding="utf-8"))
    rendered = build_dashboard_html(report_data, title=title)

    output_html.parent.mkdir(parents=True, exist_ok=True)
    output_html.write_text(rendered, encoding="utf-8")
    typer.echo(f"Saved dashboard: {output_html}")
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ``lineage-card``: register one dataset for a model, export the lineage card,
# print it as YAML, and optionally persist the fingerprint in a store.
# Documented with comments on purpose: a docstring would become --help text.
@app.command("lineage-card")
def lineage_card(
    model: str = typer.Option(..., help="Model name"),
    dataset: Path = typer.Option(..., help="Dataset CSV path"),
    contract: Path = typer.Option(..., help="Contract YAML path"),
    output: Path = typer.Option(Path("results/mdmp_model_card.yaml"), help="Output model card (.yaml/.json)"),
    expires_days: int = typer.Option(365, help="Validity period for dataset fingerprint in days"),
    fingerprint_store: Path | None = typer.Option(
        None,
        help="Optional fingerprint store JSON to persist lineage dataset fingerprints",
    ),
) -> None:
    tracker = LineageTracker()
    record = tracker.register_dataset(dataset, contract=contract, expires_days=expires_days)
    tracker.attach_to_model(model)
    card = tracker.export_card(output)

    typer.echo(f"Dataset registered: {record['fingerprint']} ({record['grade']})")
    typer.echo(f"Saved model card: {output}")
    typer.echo(yaml.safe_dump(card, sort_keys=False))

    if fingerprint_store is None:
        return

    store = FingerprintStore(fingerprint_store)
    dataset_key = str(dataset)
    # The store record reuses the validation timestamp as its creation time.
    store_record = {
        "fingerprint": record["fingerprint"],
        "created": record["date_validated"],
        "expires": record["expires"],
        "source_path": dataset_key,
        "status": record.get("status", "valid"),
        "stale_reason": record.get("stale_reason"),
    }
    store.upsert(dataset_path=dataset_key, record=store_record)
    typer.echo(f"Updated fingerprint store: {fingerprint_store}")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ``lineage-card-refresh``: reload a lineage card, re-check its datasets, and
# write it back (to ``output`` if given, otherwise over the input file).
# Documented with comments on purpose: a docstring would become --help text.
@app.command("lineage-card-refresh")
def lineage_card_refresh(
    card_path: Path,
    output: Path | None = typer.Option(None, help="Optional output path (default: overwrite input card)"),
    data_dir: Path | None = typer.Option(None, help="Optional data directory for resolving relative dataset paths"),
) -> None:
    card = load_lineage_card(card_path)
    card.refresh(data_dir=data_dir)

    destination = output if output else card_path
    payload = card.export(destination)

    model_section = payload.get("model", {})
    stale = model_section.get("stale_datasets", [])
    typer.echo(f"Model: {model_section.get('name')}")
    typer.echo(f"Lineage status: {model_section.get('lineage_status')}")
    typer.echo(f"Stale datasets: {len(stale)}")
    typer.echo(f"Saved lineage card: {destination}")
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# ``registry init``: initialise the registry file and print its version.
# Documented with comments on purpose: a docstring would become --help text.
@registry_app.command("init")
def registry_init(
    registry: Path = typer.Option(Path("registry/mdmp_registry.json"), help="Registry JSON path"),
) -> None:
    created = init_registry(registry)
    typer.echo(f"Initialized registry: {registry}")
    typer.echo(f"Version: {created['version']}")
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
# ``registry push``: upsert one record into the registry, taking values from
# the options and, when given, from a report JSON and a metadata JSON file.
# Documented with comments on purpose: a docstring would become --help text.
@registry_app.command("push")
def registry_push(
    registry: Path = typer.Option(Path("registry/mdmp_registry.json"), help="Registry JSON path"),
    fingerprint: str | None = typer.Option(None, help="Dataset fingerprint (sha256:...)"),
    report: Path | None = typer.Option(None, help="Optional MDMP report JSON to ingest"),
    grade: str | None = typer.Option(None, help="Optional grade override"),
    source: str | None = typer.Option(None, help="Optional source label (dataset/report name)"),
    visibility: str = typer.Option("private", help="Record visibility: private|public"),
    model_id: list[str] = typer.Option([], help="Model id(s) using this dataset"),
    expires: str | None = typer.Option(None, help="Optional expires timestamp"),
    status: str | None = typer.Option(None, help="Optional status override: valid|stale"),
    stale_reason: str | None = typer.Option(None, help="Optional stale reason"),
    metadata_json: Path | None = typer.Option(None, help="Optional metadata JSON file"),
) -> None:
    report_payload = (
        json.loads(report.read_text(encoding="utf-8")) if report is not None else None
    )

    metadata = None
    if metadata_json is not None:
        metadata = json.loads(metadata_json.read_text(encoding="utf-8"))
        # Only a JSON object is accepted as metadata.
        if not isinstance(metadata, dict):
            raise typer.BadParameter("metadata_json must contain a JSON object")

    upsert_options = {
        "fingerprint": fingerprint,
        "report": report_payload,
        "grade": grade,
        "source": source,
        "visibility": visibility,
        "used_in_models": model_id,
        "expires": expires,
        "status": status,
        "stale_reason": stale_reason,
        "metadata": metadata,
    }
    record = upsert_record(registry, **upsert_options)

    typer.echo(f"Registry updated: {registry}")
    typer.echo(f"Fingerprint: {record['fingerprint']}")
    typer.echo(f"Grade: {record['grade']} | Status: {record['status']} | Visibility: {record['visibility']}")
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# ``registry lookup``: print the registry record for a fingerprint as JSON and
# optionally save it; exits with code 1 when the fingerprint is unknown.
# Documented with comments on purpose: a docstring would become --help text.
@registry_app.command("lookup")
def registry_lookup(
    fingerprint: str,
    registry: Path = typer.Option(Path("registry/mdmp_registry.json"), help="Registry JSON path"),
    output_json: Path | None = typer.Option(None, help="Optional output JSON path"),
) -> None:
    found = lookup_record(registry, fingerprint)
    if found is None:
        typer.echo(f"Fingerprint not found: {fingerprint}")
        raise typer.Exit(code=1)

    rendered = json.dumps(found, indent=2)
    typer.echo(rendered)

    if output_json is None:
        return
    output_json.parent.mkdir(parents=True, exist_ok=True)
    output_json.write_text(rendered, encoding="utf-8")
    typer.echo(f"Saved lookup: {output_json}")
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ``registry list``: print registry records, optionally filtered, as a JSON
# object with ``count`` and ``records``; optionally save the same text.
# Documented with comments on purpose: a docstring would become --help text.
@registry_app.command("list")
def registry_list(
    registry: Path = typer.Option(Path("registry/mdmp_registry.json"), help="Registry JSON path"),
    grade: str | None = typer.Option(None, help="Filter by grade"),
    visibility: str | None = typer.Option(None, help="Filter by visibility"),
    status: str | None = typer.Option(None, help="Filter by status"),
    limit: int = typer.Option(20, help="Maximum rows"),
    output_json: Path | None = typer.Option(None, help="Optional output JSON path"),
) -> None:
    matches = list_records(
        registry,
        grade=grade,
        visibility=visibility,
        status=status,
        limit=limit,
    )
    rendered = json.dumps({"count": len(matches), "records": matches}, indent=2)
    typer.echo(rendered)

    if output_json is None:
        return
    output_json.parent.mkdir(parents=True, exist_ok=True)
    output_json.write_text(rendered, encoding="utf-8")
    typer.echo(f"Saved list: {output_json}")
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# ``hf-export``: build the MDMP Markdown section for a Hugging Face dataset
# from explicit options and/or a saved report, then write it to a file.
# Documented with comments on purpose: a docstring would become --help text.
@app.command("hf-export")
def hf_export(
    dataset_id: str = typer.Option(..., help="Hugging Face dataset id (e.g. owner/name)"),
    report_json: Path | None = typer.Option(None, help="Optional MDMP report JSON"),
    fingerprint: str | None = typer.Option(None, help="Optional dataset fingerprint (sha256:...)"),
    grade: str | None = typer.Option(None, help="Optional grade override"),
    output_md: Path = typer.Option(Path("results/mdmp_hf_section.md"), help="Output Markdown path"),
    registry_url: str | None = typer.Option(None, help="Optional registry URL"),
) -> None:
    def _with_sha256_prefix(raw) -> str:
        # Add the "sha256:" prefix unless the value already carries it.
        text = str(raw)
        return text if text.startswith("sha256:") else f"sha256:{raw}"

    report = {} if report_json is None else load_report(report_json)

    # An explicit --grade wins; otherwise fall back to the report, then "draft".
    resolved_grade = grade or str(report.get("grade", report.get("mdmp_grade", "draft")))

    if fingerprint:
        resolved_fp = fingerprint
    else:
        fp_raw = report.get("dataset_fingerprint_sha256")
        if not fp_raw:
            raise typer.BadParameter("Provide --fingerprint or --report-json with dataset_fingerprint_sha256")
        resolved_fp = _with_sha256_prefix(fp_raw)

    contract_fp_raw = report.get("contract_fingerprint_sha256")
    contract_fp = _with_sha256_prefix(contract_fp_raw) if contract_fp_raw else None

    score = report.get("compliance_score")
    protocol = report.get("protocol_version", report.get("mdmp_protocol_version"))
    compliance = None if score is None else float(score)
    protocol_text = None if protocol is None else str(protocol)

    section = build_hf_mdmp_section(
        dataset_id=dataset_id,
        grade=resolved_grade,
        fingerprint=resolved_fp,
        contract_fingerprint=contract_fp,
        compliance_score=compliance,
        protocol_version=protocol_text,
        registry_url=registry_url,
    )
    output_md.parent.mkdir(parents=True, exist_ok=True)
    output_md.write_text(section, encoding="utf-8")
    typer.echo(f"Saved HF MDMP section: {output_md}")
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    app()
|