claude-turing 4.2.0 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +4 -2
- package/commands/registry.md +31 -0
- package/commands/turing.md +4 -0
- package/commands/update.md +27 -0
- package/package.json +1 -1
- package/src/install.js +1 -0
- package/src/verify.js +2 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/incremental_update.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/model_lifecycle.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/generate_brief.py +61 -0
- package/templates/scripts/generate_model_card.py +154 -3
- package/templates/scripts/incremental_update.py +586 -0
- package/templates/scripts/model_lifecycle.py +549 -0
- package/templates/scripts/scaffold.py +4 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "4.2.0",
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "4.3.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 71 commands, 2 specialized agents, model lifecycle (update + registry), what-if analysis (whatif + counterfactual + simulate), collaboration (onboard + share + review), research communication (cite + present + changelog), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), model surgery (prune + quantize + merge + surgery), feature & training intelligence, model debugging, pre-training intelligence, meta-intelligence, scaling & efficiency, model composition, deep analysis, experiment orchestration, literature + paper, model export, profiling, checkpoints, experiment intelligence, statistical rigor, tree-search, cost-performance, model cards, hypothesis database, novelty guard, anti-cheating, taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -380,6 +380,8 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
380
380
|
| `/turing:whatif "<question>"` | What-if analysis — answer hypotheticals from existing experiment data |
|
|
381
381
|
| `/turing:counterfactual <exp-id>` | Counterfactual explanations — minimum input change to flip a prediction |
|
|
382
382
|
| `/turing:simulate [--configs]` | Experiment outcome prediction — pre-filter configs, save budget |
|
|
383
|
+
| `/turing:update <exp-id>` | Incremental model update — add new data without full retraining |
|
|
384
|
+
| `/turing:registry [action]` | Model registry — track lifecycle from candidate to production with gates |
|
|
383
385
|
|
|
384
386
|
And for fully hands-off operation:
|
|
385
387
|
|
|
@@ -564,11 +566,11 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
564
566
|
|
|
565
567
|
## Architecture of Turing Itself
|
|
566
568
|
|
|
567
|
-
|
|
569
|
+
71 commands, 2 agents, 10 config files, 90 template scripts, model registry, artifact contract, cost-performance frontier, model cards, tree-search exploration, statistical rigor, experiment intelligence, performance profiling, smart checkpoints, production model export, literature integration, paper section drafting, experiment orchestration (queue + retry + fork), deep analysis (diff + watch + regress), model composition (ensemble + stitch + warm), scaling & efficiency (scale + budget + distill), meta-intelligence (transfer + audit), pre-training intelligence (sanity + baseline + leak), model debugging (xray + sensitivity + calibrate), feature & training intelligence (feature + curriculum), model surgery (prune + quantize + merge + surgery), experiment archaeology (trend + flashback + archive + annotate + search + template + replay), research communication (cite + present + changelog), collaboration (onboard + share + review), what-if analysis (whatif + counterfactual + simulate), model lifecycle (update + registry), 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
568
570
|
|
|
569
571
|
```
|
|
570
572
|
turing/
|
|
571
|
-
├── commands/
|
|
573
|
+
├── commands/ 67 skill files (core + taste-leverage + reporting + exploration + statistical rigor + experiment intelligence + performance + deployment + research workflow + orchestration + deep analysis + model composition + scaling & efficiency + meta-intelligence + pre-training intelligence + model debugging + feature & training intelligence + model surgery + experiment archaeology + research communication + what-if analysis + model lifecycle)
|
|
572
574
|
├── agents/ 2 agents (researcher: read/write, evaluator: read-only)
|
|
573
575
|
├── config/ 8 files (lifecycle, taxonomy, archetypes, novelty aliases)
|
|
574
576
|
├── templates/ Scaffolded into user projects by /turing:init
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: registry
|
|
3
|
+
description: Model registry — track, promote, and govern the model lifecycle from candidate to production.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "[list|register|promote|demote|archive|history] [exp-id] [stage]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Track which model is production, staging, candidate, or archived. Promotion requires passing gates.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/model_lifecycle.py $ARGUMENTS`
|
|
14
|
+
3. **Registry:** `experiments/registry.yaml`
|
|
15
|
+
|
|
16
|
+
## Promotion gates
|
|
17
|
+
- **candidate → staging:** regression check + seed study must PASS
|
|
18
|
+
- **staging → production:** audit + calibration check must PASS
|
|
19
|
+
- Use `--force` to skip gate checks
|
|
20
|
+
|
|
21
|
+
## Examples
|
|
22
|
+
```
|
|
23
|
+
/turing:registry list
|
|
24
|
+
/turing:registry register exp-095 --version v4.1
|
|
25
|
+
/turing:registry promote exp-089 staging
|
|
26
|
+
/turing:registry promote exp-089 production --force
|
|
27
|
+
/turing:registry demote exp-078 staging --reason "latency regression"
|
|
28
|
+
/turing:registry archive exp-042 --reason "superseded by v4"
|
|
29
|
+
/turing:registry history
|
|
30
|
+
/turing:registry history exp-089
|
|
31
|
+
```
|
package/commands/turing.md
CHANGED
|
@@ -78,6 +78,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
78
78
|
| "what if", "what-if", "hypothetical", "estimate impact", "would it help" | `/turing:whatif` | Analyze |
|
|
79
79
|
| "counterfactual", "flip prediction", "why this prediction", "minimum change", "explanation" | `/turing:counterfactual` | Explain |
|
|
80
80
|
| "simulate", "predict outcome", "pre-filter", "which configs will work", "forecast" | `/turing:simulate` | Predict |
|
|
81
|
+
| "update", "incremental", "new data", "add data", "fine-tune existing", "partial update" | `/turing:update` | Update |
|
|
82
|
+
| "registry", "promote", "demote", "staging", "production", "which model is deployed", "model lifecycle" | `/turing:registry` | Govern |
|
|
81
83
|
|
|
82
84
|
## Sub-commands
|
|
83
85
|
|
|
@@ -152,6 +154,8 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
152
154
|
| `/turing:whatif "<question>"` | What-if analysis: route hypotheticals to existing estimators (scaling, ablation, sensitivity, ensemble, pruning) | (inline) |
|
|
153
155
|
| `/turing:counterfactual <exp-id> --sample <index>` | Input-level counterfactual explanations: minimum input change to flip a prediction | (inline) |
|
|
154
156
|
| `/turing:simulate [--configs] [--top-k]` | Experiment outcome prediction: pre-filter configs using surrogate model, save budget | (inline) |
|
|
157
|
+
| `/turing:update <exp-id> --new-data <path>` | Incremental model update: add new data without full retraining, forgetting detection | (inline) |
|
|
158
|
+
| `/turing:registry [list\|register\|promote\|demote\|archive\|history]` | Model registry: stage lifecycle (candidate → staging → production) with promotion gates | (inline) |
|
|
155
159
|
|
|
156
160
|
## Proactive Detection
|
|
157
161
|
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: update
|
|
3
|
+
description: Incremental model update — add new data without full retraining, with forgetting detection.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
argument-hint: "<exp-id> --new-data <path> [--replay-ratio 0.1] [--tolerance 0.005]"
|
|
6
|
+
allowed-tools: Read, Bash(*), Grep, Glob
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
Add new data to an existing model without starting from scratch. Detects catastrophic forgetting.
|
|
10
|
+
|
|
11
|
+
## Steps
|
|
12
|
+
1. `source .venv/bin/activate`
|
|
13
|
+
2. `python scripts/incremental_update.py $ARGUMENTS`
|
|
14
|
+
3. **Saved:** `experiments/updates/`
|
|
15
|
+
|
|
16
|
+
## Model-specific strategies
|
|
17
|
+
- **XGBoost/LightGBM:** continued boosting with additional rounds
|
|
18
|
+
- **Neural networks:** fine-tune with reduced LR + replay buffer from old data
|
|
19
|
+
- **scikit-learn:** partial_fit() or warm_start=True
|
|
20
|
+
|
|
21
|
+
## Examples
|
|
22
|
+
```
|
|
23
|
+
/turing:update exp-089 --new-data data/new_batch.csv
|
|
24
|
+
/turing:update exp-089 --new-data data/new.csv --replay-ratio 0.2
|
|
25
|
+
/turing:update exp-089 --new-data data/new.csv --tolerance 0.01
|
|
26
|
+
/turing:update exp-089 --new-data data/new.csv --json
|
|
27
|
+
```
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "4.2.0",
|
|
3
|
+
"version": "4.3.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
package/src/verify.js
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -404,6 +404,38 @@ def load_simulation_results(sim_dir: str = "experiments/simulations") -> dict |
|
|
|
404
404
|
return None
|
|
405
405
|
|
|
406
406
|
|
|
407
|
+
def load_registry_summary(registry_path: str = "experiments/registry.yaml") -> dict | None:
|
|
408
|
+
"""Load model registry summary for briefing."""
|
|
409
|
+
path = Path(registry_path)
|
|
410
|
+
if not path.exists():
|
|
411
|
+
return None
|
|
412
|
+
try:
|
|
413
|
+
with open(path) as f:
|
|
414
|
+
data = yaml.safe_load(f)
|
|
415
|
+
if isinstance(data, dict) and data.get("models"):
|
|
416
|
+
return data
|
|
417
|
+
except (yaml.YAMLError, OSError):
|
|
418
|
+
pass
|
|
419
|
+
return None
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def load_update_history(update_dir: str = "experiments/updates") -> list[dict]:
    """Load the most recent incremental update reports.

    Report files are matched with the ``*-update-*.yaml`` pattern, ordered by
    sorted path, and only the last three are read. Files that cannot be read,
    decoded or parsed, or whose top-level value is not a mapping, are skipped.

    Args:
        update_dir: Directory containing the update report YAML files.

    Returns:
        A list of parsed report dicts (at most three); empty when the
        directory is missing or holds no usable reports.
    """
    path = Path(update_dir)
    # is_dir() also rejects a regular file passed by mistake, which exists()
    # would let through to glob().
    if not path.is_dir():
        return []
    results = []
    for report_path in sorted(path.glob("*-update-*.yaml"))[-3:]:
        try:
            # Decode explicitly as UTF-8 rather than with the locale default.
            with open(report_path, encoding="utf-8") as fh:
                data = yaml.safe_load(fh)
        except (yaml.YAMLError, OSError, UnicodeDecodeError):
            # One unreadable report must not hide the others.
            continue
        if isinstance(data, dict):
            results.append(data)
    return results
|
|
437
|
+
|
|
438
|
+
|
|
407
439
|
def format_brief(
|
|
408
440
|
campaign: dict,
|
|
409
441
|
best: dict | None,
|
|
@@ -428,6 +460,8 @@ def format_brief(
|
|
|
428
460
|
audit_report: dict | None = None,
|
|
429
461
|
whatif_results: list[dict] | None = None,
|
|
430
462
|
simulation_result: dict | None = None,
|
|
463
|
+
registry_summary: dict | None = None,
|
|
464
|
+
update_history: list[dict] | None = None,
|
|
431
465
|
) -> str:
|
|
432
466
|
"""Format the research briefing as markdown."""
|
|
433
467
|
direction = "lower" if lower_is_better else "higher"
|
|
@@ -758,6 +792,29 @@ def format_brief(
|
|
|
758
792
|
lines.append(f"**Last simulation:** {run} configs recommended, {skip} skipped ({savings}% budget savings)")
|
|
759
793
|
lines.append("")
|
|
760
794
|
|
|
795
|
+
# Model Lifecycle section
|
|
796
|
+
if registry_summary or update_history:
|
|
797
|
+
lines.extend(["", "## Model Lifecycle", ""])
|
|
798
|
+
|
|
799
|
+
if registry_summary:
|
|
800
|
+
models = registry_summary.get("models", [])
|
|
801
|
+
for m in models:
|
|
802
|
+
if m.get("stage") != "archived":
|
|
803
|
+
metric = f"{m['metric']:.4f}" if m.get("metric") is not None else "—"
|
|
804
|
+
lines.append(f"- **{m['stage']}:** {m['exp_id']} ({m.get('version', '?')}, {m.get('metric_name', 'metric')}={metric})")
|
|
805
|
+
if not any(m.get("stage") != "archived" for m in models):
|
|
806
|
+
lines.append("- All models archived — register a new candidate with `/turing:registry register`")
|
|
807
|
+
lines.append("")
|
|
808
|
+
|
|
809
|
+
if update_history:
|
|
810
|
+
lines.append(f"**Recent updates:** {len(update_history)}")
|
|
811
|
+
for u in update_history[-2:]:
|
|
812
|
+
verdict = u.get("verdict", "?")
|
|
813
|
+
exp_id = u.get("experiment_id", "?")
|
|
814
|
+
strategy = u.get("plan", {}).get("strategy", "?")
|
|
815
|
+
lines.append(f"- {exp_id}: {strategy} — {verdict}")
|
|
816
|
+
lines.append("")
|
|
817
|
+
|
|
761
818
|
lines.extend([
|
|
762
819
|
"",
|
|
763
820
|
"## Recommendations",
|
|
@@ -830,6 +887,8 @@ def generate_brief(
|
|
|
830
887
|
audit_report = load_audit_report()
|
|
831
888
|
whatif_results = load_whatif_results()
|
|
832
889
|
simulation_result = load_simulation_results()
|
|
890
|
+
registry_summary = load_registry_summary()
|
|
891
|
+
update_history = load_update_history()
|
|
833
892
|
|
|
834
893
|
return format_brief(
|
|
835
894
|
campaign, best, trajectory, model_types, hypotheses,
|
|
@@ -848,6 +907,8 @@ def generate_brief(
|
|
|
848
907
|
audit_report=audit_report,
|
|
849
908
|
whatif_results=whatif_results if whatif_results else None,
|
|
850
909
|
simulation_result=simulation_result,
|
|
910
|
+
registry_summary=registry_summary,
|
|
911
|
+
update_history=update_history if update_history else None,
|
|
851
912
|
)
|
|
852
913
|
|
|
853
914
|
|
|
@@ -18,6 +18,8 @@ import sys
|
|
|
18
18
|
from datetime import datetime, timezone
|
|
19
19
|
from pathlib import Path
|
|
20
20
|
|
|
21
|
+
import yaml
|
|
22
|
+
|
|
21
23
|
from scripts.turing_io import load_config, load_experiments
|
|
22
24
|
|
|
23
25
|
|
|
@@ -93,22 +95,113 @@ def load_model_contract(contract_path: str) -> dict:
|
|
|
93
95
|
return {"version": version, "bundle_format": bundle_format, "raw": text}
|
|
94
96
|
|
|
95
97
|
|
|
98
|
+
def load_registry_status(registry_path: str = "experiments/registry.yaml") -> dict | None:
|
|
99
|
+
"""Load registry status for the best model."""
|
|
100
|
+
path = Path(registry_path)
|
|
101
|
+
if not path.exists():
|
|
102
|
+
return None
|
|
103
|
+
try:
|
|
104
|
+
with open(path) as f:
|
|
105
|
+
data = yaml.safe_load(f)
|
|
106
|
+
if isinstance(data, dict) and data.get("models"):
|
|
107
|
+
return data
|
|
108
|
+
except (Exception,):
|
|
109
|
+
pass
|
|
110
|
+
return None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def compute_fairness_metrics(
|
|
114
|
+
predictions: list | None = None,
|
|
115
|
+
labels: list | None = None,
|
|
116
|
+
protected_attribute: list | None = None,
|
|
117
|
+
group_names: list[str] | None = None,
|
|
118
|
+
) -> dict | None:
|
|
119
|
+
"""Compute demographic parity and equal opportunity metrics.
|
|
120
|
+
|
|
121
|
+
Args:
|
|
122
|
+
predictions: Model predictions.
|
|
123
|
+
labels: True labels.
|
|
124
|
+
protected_attribute: Group membership for each sample.
|
|
125
|
+
group_names: Names of groups.
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
Fairness metrics dict or None if insufficient data.
|
|
129
|
+
"""
|
|
130
|
+
if predictions is None or protected_attribute is None:
|
|
131
|
+
return None
|
|
132
|
+
if len(predictions) != len(protected_attribute):
|
|
133
|
+
return None
|
|
134
|
+
if len(predictions) == 0:
|
|
135
|
+
return None
|
|
136
|
+
|
|
137
|
+
import numpy as np
|
|
138
|
+
|
|
139
|
+
preds = np.array(predictions)
|
|
140
|
+
groups = np.array(protected_attribute)
|
|
141
|
+
unique_groups = sorted(set(groups))
|
|
142
|
+
|
|
143
|
+
if group_names is None:
|
|
144
|
+
group_names = [str(g) for g in unique_groups]
|
|
145
|
+
|
|
146
|
+
# Demographic parity: P(Y_hat=1 | G=g) for each group
|
|
147
|
+
group_positive_rates = {}
|
|
148
|
+
for g, name in zip(unique_groups, group_names):
|
|
149
|
+
mask = groups == g
|
|
150
|
+
if mask.sum() == 0:
|
|
151
|
+
continue
|
|
152
|
+
rate = float(preds[mask].mean()) if preds[mask].size > 0 else 0
|
|
153
|
+
group_positive_rates[name] = round(rate, 4)
|
|
154
|
+
|
|
155
|
+
# Demographic parity difference
|
|
156
|
+
rates = list(group_positive_rates.values())
|
|
157
|
+
dp_diff = round(max(rates) - min(rates), 4) if len(rates) >= 2 else 0
|
|
158
|
+
|
|
159
|
+
result = {
|
|
160
|
+
"group_positive_rates": group_positive_rates,
|
|
161
|
+
"demographic_parity_difference": dp_diff,
|
|
162
|
+
"n_groups": len(unique_groups),
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
# Equal opportunity (if labels available): P(Y_hat=1 | Y=1, G=g)
|
|
166
|
+
if labels is not None and len(labels) == len(predictions):
|
|
167
|
+
labs = np.array(labels)
|
|
168
|
+
group_tpr = {}
|
|
169
|
+
for g, name in zip(unique_groups, group_names):
|
|
170
|
+
mask = (groups == g) & (labs == 1)
|
|
171
|
+
if mask.sum() == 0:
|
|
172
|
+
continue
|
|
173
|
+
tpr = float(preds[mask].mean()) if preds[mask].size > 0 else 0
|
|
174
|
+
group_tpr[name] = round(tpr, 4)
|
|
175
|
+
|
|
176
|
+
result["group_true_positive_rates"] = group_tpr
|
|
177
|
+
tpr_vals = list(group_tpr.values())
|
|
178
|
+
result["equal_opportunity_difference"] = round(max(tpr_vals) - min(tpr_vals), 4) if len(tpr_vals) >= 2 else 0
|
|
179
|
+
|
|
180
|
+
return result
|
|
181
|
+
|
|
182
|
+
|
|
96
183
|
def generate_card(
|
|
97
184
|
config_path: str = "config.yaml",
|
|
98
185
|
log_path: str = "experiments/log.jsonl",
|
|
99
186
|
contract_path: str = "model_contract.md",
|
|
100
187
|
output_path: str | None = None,
|
|
188
|
+
include_fairness: bool = False,
|
|
189
|
+
fairness_data: dict | None = None,
|
|
190
|
+
registry_path: str = "experiments/registry.yaml",
|
|
101
191
|
) -> str:
|
|
102
192
|
"""Produce a model card markdown document.
|
|
103
193
|
|
|
104
194
|
Combines information from the project config, experiment log,
|
|
105
|
-
|
|
195
|
+
model contract, registry, and optional fairness data.
|
|
106
196
|
|
|
107
197
|
Args:
|
|
108
198
|
config_path: Path to config.yaml.
|
|
109
199
|
log_path: Path to experiments/log.jsonl.
|
|
110
200
|
contract_path: Path to model_contract.md.
|
|
111
201
|
output_path: If given, write the card to this file.
|
|
202
|
+
include_fairness: If True, add fairness section.
|
|
203
|
+
fairness_data: Pre-computed fairness data {predictions, labels, protected_attribute}.
|
|
204
|
+
registry_path: Path to registry YAML.
|
|
112
205
|
|
|
113
206
|
Returns:
|
|
114
207
|
The model card as a markdown string.
|
|
@@ -247,7 +340,6 @@ def generate_card(
|
|
|
247
340
|
if best:
|
|
248
341
|
seed_study_path = Path("experiments/seed_studies") / f"{best.get('experiment_id', 'unknown')}-seeds.yaml"
|
|
249
342
|
if seed_study_path.exists():
|
|
250
|
-
import yaml
|
|
251
343
|
with open(seed_study_path) as f:
|
|
252
344
|
seed_study = yaml.safe_load(f) or {}
|
|
253
345
|
if seed_study and "mean" in seed_study:
|
|
@@ -306,6 +398,57 @@ def generate_card(
|
|
|
306
398
|
"- Not intended for: <placeholder for user to fill>",
|
|
307
399
|
])
|
|
308
400
|
|
|
401
|
+
# --- Registry Status ---
|
|
402
|
+
registry_data = load_registry_status(registry_path)
|
|
403
|
+
if registry_data and best:
|
|
404
|
+
exp_id = best.get("experiment_id", "")
|
|
405
|
+
for model in registry_data.get("models", []):
|
|
406
|
+
if model.get("exp_id") == exp_id:
|
|
407
|
+
lines.extend([
|
|
408
|
+
"",
|
|
409
|
+
"## Registry Status",
|
|
410
|
+
"",
|
|
411
|
+
f"- **Stage:** {model.get('stage', 'unregistered')}",
|
|
412
|
+
f"- **Version:** {model.get('version', 'N/A')}",
|
|
413
|
+
f"- **Registered:** {model.get('registered_at', 'N/A')[:10]}",
|
|
414
|
+
f"- **Gates passed:** {', '.join(model.get('gates_passed', [])) or 'none'}",
|
|
415
|
+
])
|
|
416
|
+
break
|
|
417
|
+
|
|
418
|
+
# --- Fairness ---
|
|
419
|
+
if include_fairness:
|
|
420
|
+
lines.extend([
|
|
421
|
+
"",
|
|
422
|
+
"## Fairness Analysis",
|
|
423
|
+
"",
|
|
424
|
+
])
|
|
425
|
+
if fairness_data:
|
|
426
|
+
fairness = compute_fairness_metrics(
|
|
427
|
+
predictions=fairness_data.get("predictions"),
|
|
428
|
+
labels=fairness_data.get("labels"),
|
|
429
|
+
protected_attribute=fairness_data.get("protected_attribute"),
|
|
430
|
+
group_names=fairness_data.get("group_names"),
|
|
431
|
+
)
|
|
432
|
+
if fairness:
|
|
433
|
+
lines.append("### Demographic Parity")
|
|
434
|
+
lines.append("")
|
|
435
|
+
for group, rate in fairness.get("group_positive_rates", {}).items():
|
|
436
|
+
lines.append(f"- **{group}:** {rate:.4f}")
|
|
437
|
+
lines.append(f"- **Parity difference:** {fairness['demographic_parity_difference']:.4f}")
|
|
438
|
+
|
|
439
|
+
if "group_true_positive_rates" in fairness:
|
|
440
|
+
lines.append("")
|
|
441
|
+
lines.append("### Equal Opportunity")
|
|
442
|
+
lines.append("")
|
|
443
|
+
for group, tpr in fairness["group_true_positive_rates"].items():
|
|
444
|
+
lines.append(f"- **{group}:** {tpr:.4f}")
|
|
445
|
+
lines.append(f"- **Opportunity difference:** {fairness['equal_opportunity_difference']:.4f}")
|
|
446
|
+
else:
|
|
447
|
+
lines.append("- Fairness analysis requested but insufficient data provided")
|
|
448
|
+
else:
|
|
449
|
+
lines.append("- Fairness analysis requested but no protected attribute data available")
|
|
450
|
+
lines.append("- Provide `--fairness-data` with predictions, labels, and protected attributes")
|
|
451
|
+
|
|
309
452
|
# --- Ethical Considerations ---
|
|
310
453
|
lines.extend([
|
|
311
454
|
"",
|
|
@@ -354,9 +497,17 @@ def main() -> None:
|
|
|
354
497
|
parser.add_argument("--log", default="experiments/log.jsonl", help="Path to experiment log")
|
|
355
498
|
parser.add_argument("--contract", default="model_contract.md", help="Path to model contract")
|
|
356
499
|
parser.add_argument("--output", default=None, help="Output path (default: print to stdout)")
|
|
500
|
+
parser.add_argument("--include", default=None, help="Include extra sections (e.g., 'fairness')")
|
|
501
|
+
parser.add_argument("--registry", default="experiments/registry.yaml", help="Path to model registry")
|
|
357
502
|
args = parser.parse_args()
|
|
358
503
|
|
|
359
|
-
|
|
504
|
+
include_fairness = args.include and "fairness" in args.include
|
|
505
|
+
|
|
506
|
+
card = generate_card(
|
|
507
|
+
args.config, args.log, args.contract, args.output,
|
|
508
|
+
include_fairness=include_fairness,
|
|
509
|
+
registry_path=args.registry,
|
|
510
|
+
)
|
|
360
511
|
if args.output:
|
|
361
512
|
print(f"Model card written to {args.output}")
|
|
362
513
|
else:
|