claude-turing 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +2 -2
- package/README.md +28 -1
- package/commands/card.md +36 -0
- package/commands/turing.md +2 -0
- package/package.json +1 -1
- package/src/install.js +1 -1
- package/src/verify.js +1 -0
- package/templates/scripts/__pycache__/cost_frontier.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_brief.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/generate_model_card.cpython-314.pyc +0 -0
- package/templates/scripts/__pycache__/scaffold.cpython-314.pyc +0 -0
- package/templates/scripts/cleanup.py +599 -0
- package/templates/scripts/cost_frontier.py +292 -0
- package/templates/scripts/diff_configs.py +534 -0
- package/templates/scripts/export_results.py +457 -0
- package/templates/scripts/generate_brief.py +54 -0
- package/templates/scripts/generate_model_card.py +342 -0
- package/templates/scripts/leaderboard.py +508 -0
- package/templates/scripts/plot_trajectory.py +611 -0
- package/templates/scripts/scaffold.py +7 -0
- package/templates/scripts/show_metrics.py +23 -2
- package/templates/tests/__pycache__/__init__.cpython-314.pyc +0 -0
- package/templates/tests/__pycache__/conftest.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/__pycache__/test_cost_frontier.cpython-314-pytest-9.0.2.pyc +0 -0
- package/templates/tests/test_cost_frontier.py +222 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "turing",
|
|
3
|
-
"version": "1.0.1",
|
|
4
|
-
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol.
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "Autonomous ML research harness — the autoresearch loop as a formal protocol. 16 commands, 2 specialized agents, cost-performance frontier analysis, model cards, model registry, hypothesis database with novelty guard, anti-cheating guardrails, and the taste-leverage loop. Inspired by Karpathy's autoresearch and the scientific method itself.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "pragnition"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -321,6 +321,7 @@ The index (`hypotheses.yaml`) is the lightweight queue. The detail files (`hypot
|
|
|
321
321
|
| Command | What it does |
|
|
322
322
|
|---------|-------------|
|
|
323
323
|
| `/turing:validate [--auto]` | Check metric stability — auto-configure multi-run if noisy |
|
|
324
|
+
| `/turing:card` | Generate a model card — performance, limitations, intended use, artifact contract |
|
|
324
325
|
| `/turing:logbook` | Generate HTML experiment logbook |
|
|
325
326
|
| `/turing:report` | Generate research report |
|
|
326
327
|
| `/turing:poster` | Generate research poster |
|
|
@@ -389,6 +390,32 @@ After N experiments with no meaningful improvement, the agent stops and reports
|
|
|
389
390
|
|
|
390
391
|
For noisy metrics, `/turing:validate` runs the pipeline multiple times and measures variance. If the coefficient of variation exceeds 5%, it auto-configures multi-run evaluation so the agent can't be rewarded for lucky single runs.
|
|
391
392
|
|
|
393
|
+
## Cost-Performance Frontier
|
|
394
|
+
|
|
395
|
+
> *"This model is 2% better but takes 10x longer to train. Is that worth it?"*
|
|
396
|
+
|
|
397
|
+
The briefing now surfaces [Pareto-optimal](https://en.wikipedia.org/wiki/Pareto_efficiency) experiments — the efficient set where no other experiment is both faster AND has a better metric. The cost report tells you the tradeoff in plain language:
|
|
398
|
+
|
|
399
|
+
```
|
|
400
|
+
Best metric: exp-012 (accuracy=0.893, 2400s)
|
|
401
|
+
Best efficiency: exp-003 (accuracy=0.871, 3s)
|
|
402
|
+
The 2.5% improvement costs 800x more compute.
|
|
403
|
+
```
|
|
404
|
+
|
|
405
|
+
Run `python scripts/cost_frontier.py` directly, or read the "Cost-Performance Analysis" section in `/turing:brief`.
|
|
406
|
+
|
|
407
|
+
## Model Cards
|
|
408
|
+
|
|
409
|
+
When it's time to ship, `/turing:card` generates a standardized model card documenting:
|
|
410
|
+
- Model type, framework, training time
|
|
411
|
+
- Performance metrics (all configured metrics)
|
|
412
|
+
- Training data source and split ratios
|
|
413
|
+
- Limitations (including overfit detection)
|
|
414
|
+
- Intended use and ethical considerations (user fills these in)
|
|
415
|
+
- Artifact contract version for production consumers
|
|
416
|
+
|
|
417
|
+
Inspired by [Google's Model Cards](https://arxiv.org/abs/1810.03993) and [Hugging Face model cards](https://huggingface.co/docs/hub/model-cards).
|
|
418
|
+
|
|
392
419
|
## Installation
|
|
393
420
|
|
|
394
421
|
```bash
|
|
@@ -424,7 +451,7 @@ Each project gets independent config, data, experiments, models, and agent memor
|
|
|
424
451
|
|
|
425
452
|
## Architecture of Turing Itself
|
|
426
453
|
|
|
427
|
-
|
|
454
|
+
16 commands, 2 agents, 8 config files, 30 template scripts, model registry, artifact contract, cost-performance frontier, model cards, 345 tests, 16 ADRs. See [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md) for the full codemap.
|
|
428
455
|
|
|
429
456
|
```
|
|
430
457
|
turing/
|
package/commands/card.md
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: card
|
|
3
|
+
description: Generate a standardized model card documenting the trained model — type, performance, training data, limitations, intended use, and artifact contract.
|
|
4
|
+
disable-model-invocation: true
|
|
5
|
+
allowed-tools: Read, Bash(python scripts/*:*, source .venv/bin/activate:*), Grep, Glob
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You generate a standardized model card from the experiment log, model contract, and config.
|
|
9
|
+
|
|
10
|
+
## Steps
|
|
11
|
+
|
|
12
|
+
1. **Activate the virtual environment:**
|
|
13
|
+
```bash
|
|
14
|
+
source .venv/bin/activate
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
2. **Run the model card generator:**
|
|
18
|
+
```bash
|
|
19
|
+
python scripts/generate_model_card.py --config config.yaml --log experiments/log.jsonl --contract model_contract.md --output MODEL_CARD.md
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
3. **Read and present the generated card:**
|
|
23
|
+
- Read `MODEL_CARD.md` and display it to the user.
|
|
24
|
+
- If no experiments exist yet, inform the user and show the skeleton card.
|
|
25
|
+
|
|
26
|
+
4. **Suggest next steps:**
|
|
27
|
+
- Review the **Ethical Considerations** section and fill in bias, fairness, and impact notes.
|
|
28
|
+
- Review the **Intended Use** section and document what the model is NOT intended for.
|
|
29
|
+
- If limitations mention overfitting, suggest running `/turing:validate` for stability checks.
|
|
30
|
+
- If the card looks complete, suggest committing it to version control.
|
|
31
|
+
|
|
32
|
+
## Error Handling
|
|
33
|
+
|
|
34
|
+
- If `config.yaml` is missing, tell the user to run `/turing:init` first.
|
|
35
|
+
- If `experiments/log.jsonl` is missing or empty, generate a skeleton card and note that training is needed.
|
|
36
|
+
- If `.venv` doesn't exist, try `python3 scripts/generate_model_card.py` directly.
|
package/commands/turing.md
CHANGED
|
@@ -24,6 +24,7 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
24
24
|
| "design", "plan experiment", "how should I test", "experiment design" | `/turing:design` | Design |
|
|
25
25
|
| "mode", "explore", "exploit", "replicate", "strategy" | `/turing:mode` | Strategy |
|
|
26
26
|
| "preflight", "resources", "VRAM", "memory", "can I run", "OOM", "GPU" | `/turing:preflight` | Check |
|
|
27
|
+
| "card", "model card", "document model", "model documentation" | `/turing:card` | Document |
|
|
27
28
|
|
|
28
29
|
## Sub-commands
|
|
29
30
|
|
|
@@ -44,6 +45,7 @@ You are the Turing ML research router. Detect the user's intent and route to the
|
|
|
44
45
|
| `/turing:report` | Structured markdown research report | (inline) |
|
|
45
46
|
| `/turing:mode <mode>` | Set research strategy (explore/exploit/replicate) | (inline) |
|
|
46
47
|
| `/turing:preflight` | Pre-flight resource check (VRAM/RAM/disk) | (inline) |
|
|
48
|
+
| `/turing:card` | Generate standardized model card (type, performance, data, limitations, contract) | (inline) |
|
|
47
49
|
|
|
48
50
|
## Proactive Detection
|
|
49
51
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-turing",
|
|
3
|
-
"version": "1.0.1",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Autonomous ML research harness for Claude Code. The autoresearch loop as a formal protocol — iteratively trains, evaluates, and improves ML models with structured experiment tracking, convergence detection, immutable evaluation infrastructure, and safety guardrails.",
|
|
6
6
|
"bin": {
|
package/src/install.js
CHANGED
|
@@ -22,7 +22,7 @@ const PLUGIN_ROOT = join(__dirname, "..");
|
|
|
22
22
|
const SUB_COMMANDS = [
|
|
23
23
|
"init", "train", "status", "compare", "sweep", "validate",
|
|
24
24
|
"try", "brief", "suggest", "design", "logbook", "poster",
|
|
25
|
-
"report", "mode", "preflight",
|
|
25
|
+
"report", "mode", "preflight", "card",
|
|
26
26
|
];
|
|
27
27
|
|
|
28
28
|
export async function install(opts = {}) {
|
package/src/verify.js
CHANGED
|
Binary file
|
|
Binary file
|
|
Binary file
|