harness-plugin 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- harness_plugin-0.1.0/.gitignore +52 -0
- harness_plugin-0.1.0/PKG-INFO +20 -0
- harness_plugin-0.1.0/README.md +134 -0
- harness_plugin-0.1.0/pyproject.toml +43 -0
- harness_plugin-0.1.0/skills/ml-workflow/SKILL.md +344 -0
- harness_plugin-0.1.0/src/harnessml/plugin/__init__.py +1 -0
- harness_plugin-0.1.0/src/harnessml/plugin/demo_data/housing.csv +20641 -0
- harness_plugin-0.1.0/src/harnessml/plugin/event_emitter.py +114 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/__init__.py +0 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/_common.py +74 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/_validation.py +221 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/competitions.py +590 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/config.py +322 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/data.py +472 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/experiments.py +302 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/features.py +149 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/models.py +254 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/notebook.py +188 -0
- harness_plugin-0.1.0/src/harnessml/plugin/handlers/pipeline.py +369 -0
- harness_plugin-0.1.0/src/harnessml/plugin/mcp_server.py +940 -0
- harness_plugin-0.1.0/src/harnessml/plugin/setup.py +198 -0
- harness_plugin-0.1.0/src/harnessml/plugin/skills/harness-domain-research.md +275 -0
- harness_plugin-0.1.0/src/harnessml/plugin/skills/harness-explore-space.md +247 -0
- harness_plugin-0.1.0/src/harnessml/plugin/skills/harness-run-experiment.md +246 -0
- harness_plugin-0.1.0/tests/__init__.py +0 -0
- harness_plugin-0.1.0/tests/handlers/__init__.py +0 -0
- harness_plugin-0.1.0/tests/handlers/test_competitions.py +195 -0
- harness_plugin-0.1.0/tests/handlers/test_handler_dispatch.py +409 -0
- harness_plugin-0.1.0/tests/handlers/test_notebook.py +126 -0
- harness_plugin-0.1.0/tests/test_event_emitter.py +33 -0
- harness_plugin-0.1.0/tests/test_handler_data.py +231 -0
- harness_plugin-0.1.0/tests/test_handler_experiments.py +180 -0
- harness_plugin-0.1.0/tests/test_handler_features.py +133 -0
- harness_plugin-0.1.0/tests/test_handler_models.py +165 -0
- harness_plugin-0.1.0/tests/test_thread_safety.py +93 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.so
|
|
5
|
+
.venv/
|
|
6
|
+
venv/
|
|
7
|
+
env/
|
|
8
|
+
dist/
|
|
9
|
+
build/
|
|
10
|
+
*.egg-info/
|
|
11
|
+
*.egg
|
|
12
|
+
.eggs/
|
|
13
|
+
*.whl
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
.mypy_cache/
|
|
16
|
+
.ruff_cache/
|
|
17
|
+
htmlcov/
|
|
18
|
+
.coverage
|
|
19
|
+
.coverage.*
|
|
20
|
+
coverage.xml
|
|
21
|
+
*.cover
|
|
22
|
+
.tox/
|
|
23
|
+
.nox/
|
|
24
|
+
.hypothesis/
|
|
25
|
+
*.log
|
|
26
|
+
.DS_Store
|
|
27
|
+
Thumbs.db
|
|
28
|
+
.env
|
|
29
|
+
.env.*
|
|
30
|
+
!.env.example
|
|
31
|
+
catboost_info/
|
|
32
|
+
docs/plans/
|
|
33
|
+
.claude/
|
|
34
|
+
.worktrees/
|
|
35
|
+
.mcp.json
|
|
36
|
+
harness-demo/
|
|
37
|
+
projects/
|
|
38
|
+
|
|
39
|
+
# HarnessML generated artifacts
|
|
40
|
+
**/data/features/cache/
|
|
41
|
+
**/outputs/
|
|
42
|
+
**/experiments/
|
|
43
|
+
!**/src/views/Experiments/
|
|
44
|
+
|
|
45
|
+
# Jekyll
|
|
46
|
+
docs/_site/
|
|
47
|
+
|
|
48
|
+
# Harness Studio
|
|
49
|
+
packages/harness-studio/frontend/node_modules/
|
|
50
|
+
packages/harness-studio/frontend/dist/
|
|
51
|
+
packages/harness-studio/frontend/.vite/
|
|
52
|
+
.studio/
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: harness-plugin
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Claude Code plugin for AI-driven ML experimentation
|
|
5
|
+
Project-URL: Homepage, https://github.com/msilverblatt/harness-ml
|
|
6
|
+
Project-URL: Repository, https://github.com/msilverblatt/harness-ml
|
|
7
|
+
Project-URL: Issues, https://github.com/msilverblatt/harness-ml/issues
|
|
8
|
+
Author: Matt Silverblatt-Buser
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: >=3.11
|
|
18
|
+
Requires-Dist: click>=8.0
|
|
19
|
+
Requires-Dist: harness-core<1.0,>=0.1.0
|
|
20
|
+
Requires-Dist: mcp>=1.0
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# Harness Plugin
|
|
2
|
+
|
|
3
|
+
MCP (Model Context Protocol) server for [HarnessML](https://github.com/msilverblatt/harness-ml). Provides AI-driven ML experimentation through a thin async dispatcher with hot-reloadable handlers.
|
|
4
|
+
|
|
5
|
+
## Architecture
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
mcp_server.py # Tool signatures + docstrings (thin dispatcher)
|
|
9
|
+
handlers/
|
|
10
|
+
├── data.py # 19 actions: add, validate, fill_nulls, inspect, profile, views...
|
|
11
|
+
├── features.py # 6 actions: add, add_batch, test, discover, diversity, auto_search
|
|
12
|
+
├── models.py # 10 actions: add, update, remove, list, show, presets, batch ops, clone
|
|
13
|
+
├── config.py # 12 actions: init, update_data, ensemble, backtest, show, targets...
|
|
14
|
+
├── pipeline.py # 13 actions: run_backtest, predict, diagnostics, compare, explain...
|
|
15
|
+
├── experiments.py # 10 actions: create, write_overlay, run, promote, quick_run, explore...
|
|
16
|
+
├── competitions.py # 13 actions: create, simulate, standings, brackets, score, adjust...
|
|
17
|
+
├── _validation.py # Fuzzy enum matching with "Did you mean?" hints
|
|
18
|
+
└── _common.py # Shared helpers (resolve_project_dir, parse_json_param)
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Key Design Principles
|
|
22
|
+
|
|
23
|
+
**Thin dispatcher**: `mcp_server.py` contains only tool signatures and docstrings. All business logic lives in `handlers/*.py`.
|
|
24
|
+
|
|
25
|
+
**Hot-reload in dev mode**: Set `HARNESS_DEV=1` to enable hot-reloading of handler code. Changes to handler files take effect immediately without restarting the server. Changes to tool signatures or docstrings in `mcp_server.py` still require a restart.
|
|
26
|
+
|
|
27
|
+
**Fuzzy enum matching**: Invalid action names get helpful "Did you mean?" suggestions using edit distance matching. Cross-parameter hints guide users toward correct tool usage.
|
|
28
|
+
|
|
29
|
+
**Event emission**: Every tool call emits a structured event to SQLite for Studio observability. Emission is fail-safe -- it never blocks or breaks tool execution.
|
|
30
|
+
|
|
31
|
+
## Tools
|
|
32
|
+
|
|
33
|
+
7 MCP tools exposing ~83 actions:
|
|
34
|
+
|
|
35
|
+
| Tool | Actions | Purpose |
|
|
36
|
+
|------|---------|---------|
|
|
37
|
+
| `data` | 19 | Data ingestion, validation, profiling, views, sources |
|
|
38
|
+
| `features` | 6 | Feature engineering: add, batch, discover, auto-search |
|
|
39
|
+
| `models` | 10 | Model configuration: add, update, remove, presets, clone |
|
|
40
|
+
| `configure` | 12 | Project setup: init, ensemble, backtest, targets, guardrails |
|
|
41
|
+
| `pipeline` | 13 | Execution: backtest, predict, diagnostics, compare, explain |
|
|
42
|
+
| `experiments` | 10 | Experiment management: create, run, promote, compare, journal |
|
|
43
|
+
| `competitions` | 13 | Tournament simulation: brackets, scoring, strategies |
|
|
44
|
+
|
|
45
|
+
## Handler Dispatch Pattern
|
|
46
|
+
|
|
47
|
+
Each handler module follows the same pattern:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
ACTIONS = {
|
|
51
|
+
"add": _handle_add,
|
|
52
|
+
"remove": _handle_remove,
|
|
53
|
+
...
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
async def dispatch(action: str, **kwargs) -> str:
|
|
57
|
+
err = validate_enum(action, set(ACTIONS), "action")
|
|
58
|
+
if err:
|
|
59
|
+
return err # "Did you mean 'add'?"
|
|
60
|
+
result = ACTIONS[action](**kwargs)
|
|
61
|
+
if asyncio.iscoroutine(result):
|
|
62
|
+
result = await result
|
|
63
|
+
return result
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Setup
|
|
67
|
+
|
|
68
|
+
Add to your `.mcp.json` (Claude Desktop, Claude Code, or any MCP host):
|
|
69
|
+
|
|
70
|
+
```json
|
|
71
|
+
{
|
|
72
|
+
"mcpServers": {
|
|
73
|
+
"harness-ml": {
|
|
74
|
+
"command": "uv",
|
|
75
|
+
"args": [
|
|
76
|
+
"--directory", "/path/to/harness-ml",
|
|
77
|
+
"run", "harness-ml"
|
|
78
|
+
]
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
For dev mode with hot-reload:
|
|
85
|
+
|
|
86
|
+
```json
|
|
87
|
+
{
|
|
88
|
+
"mcpServers": {
|
|
89
|
+
"harness-ml": {
|
|
90
|
+
"command": "uv",
|
|
91
|
+
"args": [
|
|
92
|
+
"--directory", "/path/to/harness-ml",
|
|
93
|
+
"run", "harness-ml"
|
|
94
|
+
],
|
|
95
|
+
"env": {
|
|
96
|
+
"HARNESS_DEV": "1"
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Event Emission
|
|
104
|
+
|
|
105
|
+
Tool calls emit events to SQLite for Harness Studio observability:
|
|
106
|
+
|
|
107
|
+
- Event type, tool name, action, parameters, result summary
|
|
108
|
+
- Timestamps and session tracking
|
|
109
|
+
- Fail-safe: exceptions in emission are swallowed, never breaking tool execution
|
|
110
|
+
|
|
111
|
+
## Extending
|
|
112
|
+
|
|
113
|
+
**Adding a new action** to an existing handler:
|
|
114
|
+
|
|
115
|
+
1. Add handler function `_handle_my_action(**kwargs) -> str` in the handler module
|
|
116
|
+
2. Add entry to the `ACTIONS` dict
|
|
117
|
+
3. No server restart needed in dev mode
|
|
118
|
+
|
|
119
|
+
**Adding a new tool**:
|
|
120
|
+
|
|
121
|
+
1. Add tool function with signature and docstring in `mcp_server.py`
|
|
122
|
+
2. Create handler module in `handlers/`
|
|
123
|
+
3. Server restart required (tool signatures changed)
|
|
124
|
+
|
|
125
|
+
When adding new model config fields, update in 3 places:
|
|
126
|
+
1. `harness-core` config_writer (`add_model`, `update_model`)
|
|
127
|
+
2. `harness-plugin` handler (`handlers/models.py`)
|
|
128
|
+
3. `harness-plugin` tool signature (`mcp_server.py`, restart required)
|
|
129
|
+
|
|
130
|
+
## Testing
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
uv run pytest packages/harness-plugin/tests/ -v
|
|
134
|
+
```
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "harness-plugin"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Claude Code plugin for AI-driven ML experimentation"
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
authors = [{ name = "Matt Silverblatt-Buser" }]
|
|
8
|
+
classifiers = [
|
|
9
|
+
"Development Status :: 3 - Alpha",
|
|
10
|
+
"Intended Audience :: Developers",
|
|
11
|
+
"License :: OSI Approved :: MIT License",
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"Programming Language :: Python :: 3.11",
|
|
14
|
+
"Programming Language :: Python :: 3.12",
|
|
15
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
16
|
+
]
|
|
17
|
+
dependencies = [
|
|
18
|
+
"harness-core>=0.1.0,<1.0",
|
|
19
|
+
"mcp>=1.0",
|
|
20
|
+
"click>=8.0",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.scripts]
|
|
24
|
+
harness-ml = "harnessml.plugin.mcp_server:main"
|
|
25
|
+
harness-setup = "harnessml.plugin.setup:main"
|
|
26
|
+
|
|
27
|
+
[project.entry-points."harnessml.plugins"]
|
|
28
|
+
sports = "harnessml.sports.hooks:register"
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/msilverblatt/harness-ml"
|
|
32
|
+
Repository = "https://github.com/msilverblatt/harness-ml"
|
|
33
|
+
Issues = "https://github.com/msilverblatt/harness-ml/issues"
|
|
34
|
+
|
|
35
|
+
[tool.uv.sources]
|
|
36
|
+
harness-core = { workspace = true }
|
|
37
|
+
|
|
38
|
+
[build-system]
|
|
39
|
+
requires = ["hatchling"]
|
|
40
|
+
build-backend = "hatchling.build"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["src/harnessml"]
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ml-workflow
|
|
3
|
+
description: |
|
|
4
|
+
Use when working on ML experimentation tasks — data preparation, feature
|
|
5
|
+
engineering, model selection, hyperparameter tuning in a harnessml project.
|
|
6
|
+
Guides the complete ML workflow with sound data science practices.
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# ML Experimentation Workflow for HarnessML
|
|
10
|
+
|
|
11
|
+
**System Prompt for AI Agents:**
|
|
12
|
+
|
|
13
|
+
You are conducting iterative ML experimentation with HarnessML, a framework designed to eliminate context overhead from infrastructure work. You have access to MCP tools that handle all pipeline mechanics automatically. **Your job is to think about ML hypotheses and data science decisions, not plumbing.**
|
|
14
|
+
|
|
15
|
+
## Core Data Science Principles
|
|
16
|
+
|
|
17
|
+
**Follow this workflow in order. Do not skip steps or reorder:**
|
|
18
|
+
|
|
19
|
+
### Phase 1: Data Preparation (Offline)
|
|
20
|
+
|
|
21
|
+
**Goal:** Ensure data quality, temporal integrity, and source freshness *before* touching features or models.
|
|
22
|
+
|
|
23
|
+
**Why first?** Bad data corrupts everything downstream. Temporal issues create invisible leakage (hardest to debug). Fix now, not later.
|
|
24
|
+
|
|
25
|
+
**Steps:**
|
|
26
|
+
1. Register raw data sources via `data(action="add_source", name="...", data_path="...")`
|
|
27
|
+
2. Ingest into feature store: `data(action="add", data_path="...", join_on=[...], prefix="...")`
|
|
28
|
+
3. Profile data: `data(action="profile")` — check types, null rates, distributions
|
|
29
|
+
4. Check freshness: `data(action="check_freshness")` — verify sources are current
|
|
30
|
+
5. Validate sources: `data(action="validate_source", name="...")` — schema checks
|
|
31
|
+
6. Resolve issues: drop duplicates, fill nulls (single or batch), rename columns
|
|
32
|
+
|
|
33
|
+
**Red flags:** If your data has temporal issues or heavy leakage, all downstream models fail.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
### Phase 2: Feature Engineering (Exploratory)
|
|
38
|
+
|
|
39
|
+
**Goal:** Discover transformations and combinations that improve predictive power. Curate a diverse, high-quality feature set.
|
|
40
|
+
|
|
41
|
+
**Key principle:** Features are cheap to compute. Be aggressive about exploring.
|
|
42
|
+
|
|
43
|
+
**Why before models?** Good features make all downstream models better. It's cheaper to improve features than tune hyperparameters.
|
|
44
|
+
|
|
45
|
+
**Steps:**
|
|
46
|
+
1. Ingest base features from raw data sources
|
|
47
|
+
2. Test transformations: `features(action="test_transformations", ...)`
|
|
48
|
+
- Log, sqrt, rank, z-score, interactions
|
|
49
|
+
- Returns which transforms improve correlation most
|
|
50
|
+
3. Auto-search for features: `features(action="auto_search", features=[...], search_types=["interactions","lags","rolling"])`
|
|
51
|
+
- Automatically discovers interactions, lag features, and rolling aggregations
|
|
52
|
+
4. Add winning transformations
|
|
53
|
+
5. Discover important features: `features(action="discover", ...)`
|
|
54
|
+
- Correlation analysis with target
|
|
55
|
+
- XGBoost feature importance (what models will use)
|
|
56
|
+
- Redundancy detection (drop correlated pairs)
|
|
57
|
+
6. Check diversity: `features(action="diversity")` — ensure models use different feature sets
|
|
58
|
+
7. Create composite features (pairwise differences, ratios, interactions)
|
|
59
|
+
8. Define regimes (context flags that gate feature sets)
|
|
60
|
+
|
|
61
|
+
**Available formula functions:** abs, log, sqrt, cbrt, clip, log1p, sign, square, reciprocal, exp, expm1, power, sin_cycle, cos_cycle, zscore, minmax, rank_pct, winsorize, maximum, minimum, where, isnull, safe_div, pct_of_total
|
|
62
|
+
|
|
63
|
+
**Selection criteria:** Choose top 20-30 features based on:
|
|
64
|
+
- Correlation with target
|
|
65
|
+
- Feature importance
|
|
66
|
+
- Diversity (different types, sources, patterns)
|
|
67
|
+
|
|
68
|
+
**Red flags:** If median feature correlation < 0.3, revisit data quality or feature engineering.
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
### Phase 3: Model Selection (Structured)
|
|
73
|
+
|
|
74
|
+
**Goal:** Find which model architectures generalize best on holdout data. Use CV to ensure honest evaluation.
|
|
75
|
+
|
|
76
|
+
**Key principle:** Preset defaults are sensible. Don't tweak hyperparameters yet—just pick good base models.
|
|
77
|
+
|
|
78
|
+
**Why before hyperparams?** Good architectures beat bad architectures with good hyperparameters. Diversity improves ensembles.
|
|
79
|
+
|
|
80
|
+
**Steps:**
|
|
81
|
+
1. Configure backtest: `configure(action="backtest", cv_strategy="...", seasons=[...], metrics=[...])`
|
|
82
|
+
2. Add baseline model: `models(action="add", name="xgb_baseline", preset="xgboost_classifier", ...)`
|
|
83
|
+
3. Add comparison models: Try different architectures (XGB, LGB, MLP)
|
|
84
|
+
4. Clone and tweak: `models(action="clone", name="xgb_baseline", ...)` — clone with overrides
|
|
85
|
+
5. Run backtest: `pipeline(action="run_backtest", ...)`
|
|
86
|
+
6. Inspect diagnostics: `pipeline(action="diagnostics")`
|
|
87
|
+
- Brier score (overall accuracy)
|
|
88
|
+
- ECE (is model well-calibrated?)
|
|
89
|
+
- Model agreement (do they learn different patterns?)
|
|
90
|
+
7. Compare runs: `pipeline(action="compare_runs", run_ids=["run-001", "run-002"])`
|
|
91
|
+
8. Keep top performers, disable underperformers
|
|
92
|
+
9. Configure ensemble: `configure(action="ensemble", method="stacked|average", ...)`
|
|
93
|
+
- Calibration options: spline, isotonic, platt, beta, none
|
|
94
|
+
- Per-model pre-calibration: `pre_calibration={"model_name": "platt"}`
|
|
95
|
+
|
|
96
|
+
**Red flags:**
|
|
97
|
+
- If all models have similar performance, you may have weak features (revisit Phase 2)
|
|
98
|
+
- If ECE > 0.10, models are miscalibrated (add calibration later)
|
|
99
|
+
- If models strongly disagree (agreement < 0.5), ensemble may not help much
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
### Phase 4: Hyperparameter Tuning (Constrained, Last)
|
|
104
|
+
|
|
105
|
+
**Goal:** Fine-tune best model architectures within computational budget.
|
|
106
|
+
|
|
107
|
+
**CRITICAL:** Hyperparameters are the LAST thing to tune, only after exhausting all better options.
|
|
108
|
+
|
|
109
|
+
**You should tune only if:**
|
|
110
|
+
- Data quality is validated (Phase 1)
|
|
111
|
+
- Features selected and tested (Phase 2)
|
|
112
|
+
- Model architectures chosen (Phase 3)
|
|
113
|
+
- Baseline metrics established
|
|
114
|
+
|
|
115
|
+
**You should NOT tune if:**
|
|
116
|
+
- Features are weak (Phase 2 incomplete)
|
|
117
|
+
- Models show obvious overfitting
|
|
118
|
+
- CV metrics vary wildly (Phase 1 issues)
|
|
119
|
+
- You haven't tried different architectures
|
|
120
|
+
|
|
121
|
+
**Approaches:**
|
|
122
|
+
|
|
123
|
+
1. **Manual Single-Variable** (slower, more interpretable)
|
|
124
|
+
- `experiments(action="create", description="...", hypothesis="...")`
|
|
125
|
+
- `experiments(action="write_overlay", experiment_id="...", overlay={...})`
|
|
126
|
+
- `experiments(action="run", experiment_id="...")`
|
|
127
|
+
- Compare to baseline
|
|
128
|
+
|
|
129
|
+
2. **Bayesian Exploration** (recommended, faster)
|
|
130
|
+
- `experiments(action="explore", search_space={axes: [...], budget: 50, primary_metric: "brier"})`
|
|
131
|
+
- Returns best hyperparams, parameter importance, full trial history
|
|
132
|
+
- Prediction cache shared across trials (unchanged models never retrain)
|
|
133
|
+
|
|
134
|
+
3. **Quick Run** (one-shot experiment)
|
|
135
|
+
- `experiments(action="quick_run", description="...", overlay={...})`
|
|
136
|
+
- Creates, configures, and runs in one call
|
|
137
|
+
|
|
138
|
+
**Expected ROI:**
|
|
139
|
+
- Phase 2 improvements: 5-20% metric gain (high ROI)
|
|
140
|
+
- Phase 3 improvements: 2-10% via architecture/diversity (medium ROI)
|
|
141
|
+
- Phase 4 improvements: 0.5-2% via hyperparams (low ROI, use only if budget allows)
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Available MCP Tools
|
|
146
|
+
|
|
147
|
+
### Data Management (`data`)
|
|
148
|
+
|
|
149
|
+
**Ingestion & Profiling:**
|
|
150
|
+
- `action="add"` — Ingest CSV/parquet/Excel into feature store. Params: `data_path`, `join_on`, `prefix`, `auto_clean` (default false)
|
|
151
|
+
- `action="validate"` — Preview dataset without ingesting. Params: `data_path`
|
|
152
|
+
- `action="profile"` — Summary statistics per column. Optional: `category`
|
|
153
|
+
- `action="status"` — Quick overview (row/col count, target distribution, time range)
|
|
154
|
+
- `action="list_features"` — List available feature columns. Optional: `prefix`
|
|
155
|
+
|
|
156
|
+
**Data Cleaning:**
|
|
157
|
+
- `action="fill_nulls"` — Fill nulls in one column. Params: `column`, `strategy` (median/mean/mode/zero/value), `value`
|
|
158
|
+
- `action="fill_nulls_batch"` — Fill nulls in multiple columns at once
|
|
159
|
+
- `action="drop_duplicates"` — Remove duplicates. Optional: `columns` (subset)
|
|
160
|
+
- `action="rename"` — Rename columns. Params: `mapping` (JSON `{"old": "new"}`)
|
|
161
|
+
|
|
162
|
+
**Source Registry:**
|
|
163
|
+
- `action="add_source"` — Register a raw data source. Params: `name`, `data_path`, `format`
|
|
164
|
+
- `action="add_sources_batch"` — Register multiple sources at once
|
|
165
|
+
- `action="list_sources"` — List all registered sources
|
|
166
|
+
- `action="check_freshness"` — Check source staleness against frequency expectations
|
|
167
|
+
- `action="refresh"` — Re-fetch a specific source. Params: `name`
|
|
168
|
+
- `action="refresh_all"` — Re-fetch all sources
|
|
169
|
+
- `action="validate_source"` — Run schema validation on a source. Params: `name`
|
|
170
|
+
|
|
171
|
+
**Views (Transform Chains):**
|
|
172
|
+
- `action="add_view"` — Declare a view. Params: `name`, `source`, `steps` (JSON array), `description`
|
|
173
|
+
- `action="add_views_batch"` — Declare multiple views at once
|
|
174
|
+
- `action="update_view"` — Update existing view. Params: `name`, `source`, `steps`, `description`
|
|
175
|
+
- `action="remove_view"` — Remove a view. Params: `name`
|
|
176
|
+
- `action="list_views"` — List all views with descriptions
|
|
177
|
+
- `action="preview_view"` — Materialize and show first N rows. Params: `name`, `n_rows`
|
|
178
|
+
- `action="set_features_view"` — Set which view becomes prediction table. Params: `name`
|
|
179
|
+
- `action="view_dag"` — Show view dependency graph
|
|
180
|
+
|
|
181
|
+
**Available view step ops:** filter, select, derive, group_by, join, union, unpivot, sort, head, rolling, cast, distinct, rank, isin, cond_agg, lag, ewm, diff, trend, encode, bin, datetime, null_indicator
|
|
182
|
+
|
|
183
|
+
### Feature Engineering (`features`)
|
|
184
|
+
- `action="add"` — Create a feature. Params: `name`, `type` (team/pairwise/matchup/regime), `formula`, `source`, `column`, `condition`, `pairwise_mode`, `category`, `description`
|
|
185
|
+
- `action="add_batch"` — Create multiple features with topological ordering. Params: `features` (JSON array)
|
|
186
|
+
- `action="test_transformations"` — Test math transforms. Params: `features` (column names), `test_interactions`
|
|
187
|
+
- `action="discover"` — Run feature discovery. Params: `method` (xgboost/mutual_info), `top_n`
|
|
188
|
+
- `action="diversity"` — Analyze feature diversity across models
|
|
189
|
+
- `action="auto_search"` — Auto-search for features. Params: `features`, `search_types` (interactions/lags/rolling), `top_n`
|
|
190
|
+
|
|
191
|
+
### Model Management (`models`)
|
|
192
|
+
- `action="add"` — Add a model. Params: `name`, `preset` or `model_type`, `features`, `params`, `mode`, `prediction_type`, `cdf_scale`, `zero_fill_features`
|
|
193
|
+
- `action="update"` — Update model config. Same params as add (merges)
|
|
194
|
+
- `action="remove"` — Disable model. Params: `name`, `purge` (permanent delete)
|
|
195
|
+
- `action="clone"` — Clone model with overrides. Params: `name`, plus any override params
|
|
196
|
+
- `action="list"` — List all models with type, status, feature count
|
|
197
|
+
- `action="presets"` — Show available model presets
|
|
198
|
+
- `action="add_batch"` — Add multiple models. Params: `items` (JSON array)
|
|
199
|
+
- `action="update_batch"` — Update multiple models. Params: `items`
|
|
200
|
+
- `action="remove_batch"` — Remove multiple models. Params: `items`
|
|
201
|
+
|
|
202
|
+
### Configuration (`configure`)
|
|
203
|
+
- `action="init"` — Initialize new project. Params: `project_name`, `task`, `target_column`, `key_columns`, `time_column`
|
|
204
|
+
- `action="ensemble"` — Update ensemble. Params: `method` (stacked/average), `temperature`, `exclude_models`, `calibration` (spline/isotonic/platt/beta/none), `pre_calibration`, `prior_feature`, `spline_prob_max`, `spline_n_bins`
|
|
205
|
+
- `action="backtest"` — Update backtest. Params: `cv_strategy`, `seasons`, `metrics`, `min_train_folds`
|
|
206
|
+
- `action="show"` — Show full config. Optional: `section`, `detail`
|
|
207
|
+
- `action="check_guardrails"` — Run safety guardrails (leakage, naming, model config)
|
|
208
|
+
- `action="exclude_columns"` — Manage excluded columns. Params: `add_columns`, `remove_columns`
|
|
209
|
+
- `action="set_denylist"` — Manage feature leakage denylist. Params: `add_columns`, `remove_columns`
|
|
210
|
+
|
|
211
|
+
### Experiments (`experiments`)
|
|
212
|
+
- `action="create"` — Create experiment. Params: `description`, `hypothesis`
|
|
213
|
+
- `action="write_overlay"` — Write overlay YAML. Params: `experiment_id`, `overlay` (JSON, supports dot-notation keys)
|
|
214
|
+
- `action="run"` — Run experiment backtest. Params: `experiment_id`, `primary_metric`, `variant`
|
|
215
|
+
- `action="promote"` — Promote experiment to production. Params: `experiment_id`, `primary_metric`
|
|
216
|
+
- `action="quick_run"` — One-shot create+configure+run. Params: `description`, `overlay`, `hypothesis`, `primary_metric`
|
|
217
|
+
- `action="explore"` — Bayesian search. Params: `search_space` (JSON with axes, budget, primary_metric)
|
|
218
|
+
- `action="promote_trial"` — Promote exploration trial. Params: `experiment_id`, `trial`, `primary_metric`, `hypothesis`
|
|
219
|
+
- `action="compare"` — Compare two experiments. Params: `experiment_ids` (list of 2)
|
|
220
|
+
|
|
221
|
+
### Pipeline (`pipeline`)
|
|
222
|
+
- `action="run_backtest"` — Run full backtest. Optional: `experiment_id`, `variant`
|
|
223
|
+
- `action="predict"` — Generate predictions. Params: `season`, `run_id`, `variant`
|
|
224
|
+
- `action="diagnostics"` — Per-model metrics, calibration, SHAP. Optional: `run_id`, `detail`
|
|
225
|
+
- `action="list_runs"` — List all pipeline runs
|
|
226
|
+
- `action="show_run"` — Show run results. Optional: `run_id`, `detail`
|
|
227
|
+
- `action="compare_runs"` — Compare two runs. Params: `run_ids` (list of 2)
|
|
228
|
+
|
|
229
|
+
---
|
|
230
|
+
|
|
231
|
+
## Workflow Patterns
|
|
232
|
+
|
|
233
|
+
### Pattern: Project Initialization
|
|
234
|
+
|
|
235
|
+
```
|
|
236
|
+
1. configure(action="init", project_name="...", task="binary", target_column="...")
|
|
237
|
+
2. data(action="add_source", name="...", data_path="...") # register sources
|
|
238
|
+
3. data(action="add", data_path="...") # ingest into feature store
|
|
239
|
+
4. data(action="profile") # check for issues
|
|
240
|
+
5. data(action="check_freshness") # verify data is current
|
|
241
|
+
6. features(action="discover") # what's useful?
|
|
242
|
+
7. configure(action="backtest", cv_strategy="...", seasons=[...])
|
|
243
|
+
8. models(action="add", preset="xgboost_classifier", ...)
|
|
244
|
+
9. pipeline(action="run_backtest") # establish baseline
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### Pattern: Feature Engineering Cycle
|
|
248
|
+
|
|
249
|
+
```
|
|
250
|
+
1. features(action="test_transformations", features=[...])
|
|
251
|
+
-> Which transforms improved correlation?
|
|
252
|
+
2. features(action="auto_search", features=[...], search_types=["interactions","lags","rolling"])
|
|
253
|
+
-> Automated discovery of interactions, lags, rolling features
|
|
254
|
+
3. features(action="add", name="...", formula="...", ...)
|
|
255
|
+
-> Add winning transforms
|
|
256
|
+
4. features(action="discover", method="xgboost", top_n=30)
|
|
257
|
+
-> Which features does XGBoost think matter?
|
|
258
|
+
5. features(action="diversity")
|
|
259
|
+
-> Are models using diverse feature sets?
|
|
260
|
+
6. models(action="update", name="xgb_baseline", features=[...])
|
|
261
|
+
-> Add top features to models
|
|
262
|
+
7. pipeline(action="run_backtest") # did metrics improve?
|
|
263
|
+
8. Repeat or advance to Phase 3
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Pattern: Model Selection
|
|
267
|
+
|
|
268
|
+
```
|
|
269
|
+
1. Add baseline: models(action="add", preset="xgboost_classifier", ...)
|
|
270
|
+
2. Add comparison: models(action="add", preset="lightgbm_classifier", ...)
|
|
271
|
+
3. Clone variant: models(action="clone", name="xgb_baseline", ...)
|
|
272
|
+
4. pipeline(action="run_backtest") # compare architectures
|
|
273
|
+
5. pipeline(action="diagnostics") # check calibration, agreement
|
|
274
|
+
6. pipeline(action="compare_runs", run_ids=["run-001", "run-002"]) # compare runs
|
|
275
|
+
7. Disable underperformers: models(action="update", name="...", active=false)
|
|
276
|
+
8. configure(action="ensemble", method="stacked", calibration="spline")
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
### Pattern: Hyperparameter Tuning (Bayesian)
|
|
280
|
+
|
|
281
|
+
```
|
|
282
|
+
experiments(action="explore", search_space={
|
|
283
|
+
"axes": [
|
|
284
|
+
{"key": "models.xgb.params.max_depth", "type": "integer", "low": 3, "high": 10},
|
|
285
|
+
{"key": "models.xgb.params.learning_rate", "type": "continuous", "low": 0.001, "high": 0.3, "log": true},
|
|
286
|
+
{"key": "ensemble.temperature", "type": "continuous", "low": 0.9, "high": 1.1}
|
|
287
|
+
],
|
|
288
|
+
"budget": 50,
|
|
289
|
+
"primary_metric": "brier"
|
|
290
|
+
})
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
Returns:
|
|
294
|
+
- **Best trial** — Optimal hyperparams found
|
|
295
|
+
- **Parameter importance** — Which hyperparams matter (focus next exploration here)
|
|
296
|
+
- **Trial history** — All 50 runs with metrics
|
|
297
|
+
- **Baseline comparison** — How much did tuning help?
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
## Key Principles
|
|
302
|
+
|
|
303
|
+
- **Data first** — Temporal issues and leakage corrupt everything. Validate before moving on.
|
|
304
|
+
- **Features second** — Good features beat tuned hyperparameters. Explore aggressively.
|
|
305
|
+
- **Architectures third** — Different models learn different patterns. Diversity improves ensembles.
|
|
306
|
+
- **Hyperparams last** — Only tune after everything else is solid. Low ROI anyway.
|
|
307
|
+
|
|
308
|
+
- **One variable per experiment** — Change one thing, measure impact.
|
|
309
|
+
- **Use presets** — Don't manually configure hyperparameters; start from presets.
|
|
310
|
+
- **Formula features are cheap** — Test transformations and interactions without fear.
|
|
311
|
+
- **Trust the tools** — All mechanics (caching, logging, fingerprinting) are automatic.
|
|
312
|
+
- **Verify assumptions** — Check temporal ordering, feature correlations, model calibration.
|
|
313
|
+
|
|
314
|
+
---
|
|
315
|
+
|
|
316
|
+
## Common Pitfalls (Avoid These!)
|
|
317
|
+
|
|
318
|
+
**Jumping to hyperparameter tuning before features are good**
|
|
319
|
+
- Features with correlation < 0.3 to target = problem in Phase 2, not Phase 4
|
|
320
|
+
- Tuning bad features won't help
|
|
321
|
+
|
|
322
|
+
**Mutating production config directly**
|
|
323
|
+
- Always use experiment overlays
|
|
324
|
+
- Revert/promote workflow keeps history clean
|
|
325
|
+
|
|
326
|
+
**Training models on post-tournament data for tournament prediction**
|
|
327
|
+
- Hard guardrail blocks this automatically
|
|
328
|
+
- Temporal safety is non-overridable
|
|
329
|
+
|
|
330
|
+
**Running single experiment then declaring victory**
|
|
331
|
+
- CV ensures honest evaluation
|
|
332
|
+
- One fold can be lucky; cross all folds
|
|
333
|
+
|
|
334
|
+
**Ignoring model calibration (ECE > 0.10)**
|
|
335
|
+
- Miscalibrated probabilities mislead downstream users
|
|
336
|
+
- Add post-calibration (platt, isotonic, spline, beta) if needed
|
|
337
|
+
|
|
338
|
+
---
|
|
339
|
+
|
|
340
|
+
## Further Reading
|
|
341
|
+
|
|
342
|
+
- [GETTING_STARTED.md](../../../../GETTING_STARTED.md) — Complete workflow guide with examples
|
|
343
|
+
- [README.md](../../../../README.md) — System overview
|
|
344
|
+
- [CLAUDE.md](../../../../CLAUDE.md) — Dev conventions
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""HarnessML Claude Code plugin — MCP server for AI-driven ML experimentation."""
|