ctx-cc 3.5.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +375 -676
- package/agents/ctx-arch-mapper.md +5 -3
- package/agents/ctx-auditor.md +5 -3
- package/agents/ctx-codex-reviewer.md +214 -0
- package/agents/ctx-concerns-mapper.md +5 -3
- package/agents/ctx-criteria-suggester.md +6 -4
- package/agents/ctx-debugger.md +5 -3
- package/agents/ctx-designer.md +488 -114
- package/agents/ctx-discusser.md +5 -3
- package/agents/ctx-executor.md +5 -3
- package/agents/ctx-handoff.md +6 -4
- package/agents/ctx-learner.md +5 -3
- package/agents/ctx-mapper.md +4 -3
- package/agents/ctx-ml-analyst.md +600 -0
- package/agents/ctx-ml-engineer.md +933 -0
- package/agents/ctx-ml-reviewer.md +485 -0
- package/agents/ctx-ml-scientist.md +626 -0
- package/agents/ctx-parallelizer.md +4 -3
- package/agents/ctx-planner.md +5 -3
- package/agents/ctx-predictor.md +4 -3
- package/agents/ctx-qa.md +5 -3
- package/agents/ctx-quality-mapper.md +5 -3
- package/agents/ctx-researcher.md +5 -3
- package/agents/ctx-reviewer.md +6 -4
- package/agents/ctx-team-coordinator.md +5 -3
- package/agents/ctx-tech-mapper.md +5 -3
- package/agents/ctx-verifier.md +5 -3
- package/bin/ctx.js +199 -27
- package/commands/brand.md +309 -0
- package/commands/ctx.md +10 -10
- package/commands/design.md +304 -0
- package/commands/experiment.md +251 -0
- package/commands/help.md +57 -7
- package/commands/init.md +25 -0
- package/commands/metrics.md +1 -1
- package/commands/milestone.md +1 -1
- package/commands/ml-status.md +197 -0
- package/commands/monitor.md +1 -1
- package/commands/train.md +266 -0
- package/commands/visual-qa.md +559 -0
- package/commands/voice.md +1 -1
- package/hooks/post-tool-use.js +39 -0
- package/hooks/pre-tool-use.js +94 -0
- package/hooks/subagent-stop.js +32 -0
- package/package.json +9 -3
- package/plugin.json +46 -0
- package/skills/ctx-design-system/SKILL.md +572 -0
- package/skills/ctx-ml-experiment/SKILL.md +334 -0
- package/skills/ctx-ml-pipeline/SKILL.md +437 -0
- package/skills/ctx-orchestrator/SKILL.md +91 -0
- package/skills/ctx-review-gate/SKILL.md +147 -0
- package/skills/ctx-state/SKILL.md +100 -0
- package/skills/ctx-visual-qa/SKILL.md +587 -0
- package/src/agents.js +109 -0
- package/src/auto.js +287 -0
- package/src/capabilities.js +226 -0
- package/src/commits.js +94 -0
- package/src/config.js +112 -0
- package/src/context.js +241 -0
- package/src/handoff.js +156 -0
- package/src/hooks.js +218 -0
- package/src/install.js +125 -50
- package/src/lifecycle.js +194 -0
- package/src/metrics.js +198 -0
- package/src/pipeline.js +269 -0
- package/src/review-gate.js +338 -0
- package/src/runner.js +120 -0
- package/src/skills.js +143 -0
- package/src/state.js +267 -0
- package/src/worktree.js +244 -0
- package/templates/PRD.json +1 -1
- package/templates/config.json +4 -237
- package/workflows/ctx-router.md +0 -485
- package/workflows/map-codebase.md +0 -329
|
@@ -0,0 +1,437 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ctx-ml-pipeline
|
|
3
|
+
description: |
|
|
4
|
+
WHEN: Building ML training pipelines, data processing pipelines, inference services, model deployment, CI/CT/CD for ML, or adding production reliability patterns (circuit breaker, drift detection, conformal prediction).
|
|
5
|
+
WHEN NOT: One-off analysis, EDA, non-ML infrastructure, experiment hypothesis formation.
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# CTX ML Pipeline — Production ML Engineering Patterns
|
|
9
|
+
|
|
10
|
+
You build production-grade ML pipelines. Speed matters less than correctness and reliability. Every pipeline component must be testable, observable, and reproducible.
|
|
11
|
+
|
|
12
|
+
## Core Principle
|
|
13
|
+
|
|
14
|
+
A model is not done when training converges. It is done when it can be deployed, monitored, degraded gracefully, and retrained without manual intervention.
|
|
15
|
+
|
|
16
|
+
## Pipeline Architecture
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
┌─────────────┐
|
|
20
|
+
│ Raw Data │
|
|
21
|
+
└──────┬──────┘
|
|
22
|
+
│
|
|
23
|
+
┌──────▼──────┐
|
|
24
|
+
│ Validation │ Pandera schema — fail fast
|
|
25
|
+
└──────┬──────┘
|
|
26
|
+
│
|
|
27
|
+
┌──────▼──────┐
|
|
28
|
+
│ Features │ Deterministic transforms
|
|
29
|
+
└──────┬──────┘
|
|
30
|
+
│
|
|
31
|
+
┌────────────┼────────────┐
|
|
32
|
+
│ │ │
|
|
33
|
+
┌──────▼──────┐ ┌──▼──────┐ ┌──▼──────────┐
|
|
34
|
+
│ Training │ │ HPO │ │ Evaluation │
|
|
35
|
+
└──────┬──────┘ └──┬──────┘ └──────┬───────┘
|
|
36
|
+
│ │ │
|
|
37
|
+
└────────────┴───────────────┘
|
|
38
|
+
│
|
|
39
|
+
┌──────▼──────┐
|
|
40
|
+
│ Registry │ Version + lineage + promotion
|
|
41
|
+
└──────┬──────┘
|
|
42
|
+
│
|
|
43
|
+
┌──────▼──────┐
|
|
44
|
+
│ Inference │ Envelope + circuit breaker
|
|
45
|
+
└──────┬──────┘
|
|
46
|
+
│
|
|
47
|
+
┌──────▼──────┐
|
|
48
|
+
│ Monitoring │ Drift + calibration alerts
|
|
49
|
+
└─────────────┘
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Stage 1: Data Validation
|
|
53
|
+
|
|
54
|
+
Use Pandera for schema enforcement. Fail at ingestion, not at training time.
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
import pandera as pa
|
|
58
|
+
from pandera import Column, DataFrameSchema, Check
|
|
59
|
+
|
|
60
|
+
schema = DataFrameSchema({
|
|
61
|
+
"age": Column(float, Check.between(0, 120), nullable=False),
|
|
62
|
+
"cholesterol": Column(float, Check.greater_than(0), nullable=True),
|
|
63
|
+
"label": Column(int, Check.isin([0, 1]), nullable=False),
|
|
64
|
+
})
|
|
65
|
+
|
|
66
|
+
def validate(df: pd.DataFrame) -> pd.DataFrame:
    """Validate *df* against the ingestion schema and return the validated frame.

    ``lazy=True`` collects every violation instead of stopping at the first,
    so the error report covers all bad columns/rows at once.

    Raises:
        ValueError: summarizing all failing cases, with the original
            pandera error chained as the cause.
    """
    try:
        return schema.validate(df, lazy=True)
    except pa.errors.SchemaErrors as e:
        # Chain the cause so the full pandera failure detail survives
        # into tracebacks instead of being flattened into a string.
        raise ValueError(f"Schema validation failed:\n{e.failure_cases}") from e
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Validation failures are hard errors. Never train on unvalidated data.
|
|
74
|
+
|
|
75
|
+
## Stage 2: Feature Pipeline
|
|
76
|
+
|
|
77
|
+
Transforms must be:
|
|
78
|
+
- Deterministic given the same input
|
|
79
|
+
- Serializable (fit statistics saved alongside model)
|
|
80
|
+
- Versioned in the feature registry
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from sklearn.pipeline import Pipeline
|
|
84
|
+
from sklearn.preprocessing import StandardScaler
|
|
85
|
+
import joblib
|
|
86
|
+
|
|
87
|
+
def build_feature_pipeline(feature_registry: dict) -> Pipeline:
|
|
88
|
+
steps = []
|
|
89
|
+
for feature, spec in feature_registry.items():
|
|
90
|
+
if spec["type"] == "numeric":
|
|
91
|
+
steps.append((f"scale_{feature}", StandardScaler()))
|
|
92
|
+
return Pipeline(steps)
|
|
93
|
+
|
|
94
|
+
def fit_and_save(pipeline, X_train, path):
|
|
95
|
+
pipeline.fit(X_train)
|
|
96
|
+
joblib.dump(pipeline, path)
|
|
97
|
+
return pipeline
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Save fit statistics (mean, std, bin edges) alongside model artifacts. Inference must use the same fitted pipeline as training.
|
|
101
|
+
|
|
102
|
+
## Stage 3: Training
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
import xgboost as xgb
|
|
106
|
+
from sklearn.model_selection import cross_val_score
|
|
107
|
+
import numpy as np
|
|
108
|
+
|
|
109
|
+
def train(X_train, y_train, config: dict, X_val=None, y_val=None):
    """Train an XGBoost classifier, with early stopping when a validation split is given.

    Args:
        X_train, y_train: training split.
        config: hyperparameters — expects ``max_depth``, ``n_estimators``,
            ``learning_rate``, ``subsample``, ``colsample_bytree``, ``seed``.
        X_val, y_val: optional validation split used for early stopping.
            (The original snippet referenced ``X_val``/``y_val`` without
            accepting them as parameters — a NameError at call time.)

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    use_early_stopping = X_val is not None and y_val is not None
    model = xgb.XGBClassifier(
        max_depth=config["max_depth"],
        n_estimators=config["n_estimators"],
        learning_rate=config["learning_rate"],
        subsample=config["subsample"],
        colsample_bytree=config["colsample_bytree"],
        random_state=config["seed"],
        eval_metric="auc",
        # early_stopping_rounds requires an eval_set, so only enable it
        # when a validation split was actually supplied.
        early_stopping_rounds=20 if use_early_stopping else None,
    )
    fit_kwargs = {}
    if use_early_stopping:
        fit_kwargs["eval_set"] = [(X_val, y_val)]
        fit_kwargs["verbose"] = 50
    model.fit(X_train, y_train, **fit_kwargs)
    return model
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Always use early stopping. Log training curves to artifacts.
|
|
129
|
+
|
|
130
|
+
## Stage 4: Hyperparameter Optimization
|
|
131
|
+
|
|
132
|
+
Use Optuna for HPO. Never tune manually.
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
import optuna
|
|
136
|
+
|
|
137
|
+
def objective(trial, X_train, y_train, X_val, y_val):
|
|
138
|
+
params = {
|
|
139
|
+
"max_depth": trial.suggest_int("max_depth", 3, 8),
|
|
140
|
+
"n_estimators": trial.suggest_int("n_estimators", 100, 500),
|
|
141
|
+
"learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
|
|
142
|
+
"subsample": trial.suggest_float("subsample", 0.6, 1.0),
|
|
143
|
+
"colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
|
|
144
|
+
}
|
|
145
|
+
model = xgb.XGBClassifier(**params, random_state=42)
|
|
146
|
+
model.fit(X_train, y_train)
|
|
147
|
+
return roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
|
|
148
|
+
|
|
149
|
+
study = optuna.create_study(direction="maximize")
|
|
150
|
+
study.optimize(objective, n_trials=100, timeout=3600)
|
|
151
|
+
best_params = study.best_params
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Save study results to `artifacts/hpo_study.pkl` for reproducibility.
|
|
155
|
+
|
|
156
|
+
## Stage 5: Evaluation
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
from sklearn.metrics import roc_auc_score, precision_score, recall_score
|
|
160
|
+
from sklearn.calibration import calibration_curve
|
|
161
|
+
import matplotlib.pyplot as plt
|
|
162
|
+
|
|
163
|
+
def evaluate(model, X_test, y_test, threshold=0.5) -> dict:
|
|
164
|
+
proba = model.predict_proba(X_test)[:, 1]
|
|
165
|
+
pred = (proba >= threshold).astype(int)
|
|
166
|
+
|
|
167
|
+
metrics = {
|
|
168
|
+
"auc": roc_auc_score(y_test, proba),
|
|
169
|
+
"precision": precision_score(y_test, pred),
|
|
170
|
+
"recall": recall_score(y_test, pred),
|
|
171
|
+
"threshold": threshold,
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
# Calibration check
|
|
175
|
+
fraction_pos, mean_pred = calibration_curve(y_test, proba, n_bins=10)
|
|
176
|
+
metrics["calibration_error"] = float(np.mean(np.abs(fraction_pos - mean_pred)))
|
|
177
|
+
|
|
178
|
+
return metrics
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Write metrics to `RESULTS.md` and update `models/registry.yaml` if promotion criteria are met.
|
|
182
|
+
|
|
183
|
+
## Stage 6: Model Registry
|
|
184
|
+
|
|
185
|
+
Registry enforces promotion criteria before writing a new production version.
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
import yaml
|
|
189
|
+
from pathlib import Path
|
|
190
|
+
|
|
191
|
+
def promote_model(name: str, version: str, metrics: dict, experiment_id: str):
    """Promote a model version to production if it beats the current one.

    Compares the candidate ``metrics`` against the current production
    version's metrics using the model's declared promotion criteria, then
    rewrites the registry in a single write on success.

    Args:
        name: model name as keyed in the registry.
        version: candidate version identifier to promote.
        metrics: candidate evaluation metrics.
        experiment_id: experiment whose artifacts back this version.

    Raises:
        ValueError: if the primary or guard criterion is not met, or the
            criterion uses an unsupported comparison operator.
    """
    registry_path = Path(".ctx/ml/models/registry.yaml")
    registry = yaml.safe_load(registry_path.read_text())

    model = registry["models"][name]
    current_version = model["current"]
    current_metrics = model["versions"][current_version]["metrics"]
    criteria = model["promotion_criteria"]

    # `parse_criterion` and `date` are expected from module scope
    # (defined/imported elsewhere in this file) — TODO confirm.
    primary_field, primary_op, primary_threshold = parse_criterion(criteria["primary"])
    guard_field, guard_op, guard_threshold = parse_criterion(criteria["guard"])

    primary_delta = metrics[primary_field] - current_metrics[primary_field]
    guard_delta = abs(metrics[guard_field] - current_metrics[guard_field])

    # Explicit comparator table instead of eval(): eval on registry-supplied
    # strings is an injection risk and fails opaquely on malformed criteria.
    comparators = {
        ">": lambda a, b: a > b,
        ">=": lambda a, b: a >= b,
        "<": lambda a, b: a < b,
        "<=": lambda a, b: a <= b,
        "==": lambda a, b: a == b,
    }

    def _passes(delta, op, threshold):
        # Purpose: apply one parsed criterion; reject unknown operators loudly.
        if op not in comparators:
            raise ValueError(f"Unsupported comparison operator in criterion: {op!r}")
        return comparators[op](delta, float(threshold))

    if not _passes(primary_delta, primary_op, primary_threshold):
        # Report the actual operator, not a hardcoded '<' as before.
        raise ValueError(
            f"Promotion rejected: {primary_field} delta={primary_delta:.4f} "
            f"failed '{primary_op} {primary_threshold}'"
        )
    if not _passes(guard_delta, guard_op, guard_threshold):
        raise ValueError(
            f"Promotion rejected: {guard_field} regression={guard_delta:.4f} "
            f"failed '{guard_op} {guard_threshold}'"
        )

    model["versions"][version] = {
        "metrics": metrics,
        "experiment": experiment_id,
        "date": date.today().isoformat(),
        "status": "production",
        "artifacts": f".ctx/ml/experiments/{experiment_id}/artifacts/",
    }
    model["versions"][current_version]["status"] = "retired"
    model["current"] = version

    registry_path.write_text(yaml.dump(registry, default_flow_style=False))
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
Never promote by hand-editing registry.yaml. Always go through this gate.
|
|
225
|
+
|
|
226
|
+
## Stage 7: Inference Envelope
|
|
227
|
+
|
|
228
|
+
Every prediction must return a full envelope. Callers receive lineage and uncertainty, not just a score.
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
from datetime import datetime, timezone
|
|
232
|
+
from dataclasses import dataclass, asdict
|
|
233
|
+
from typing import List, Optional
|
|
234
|
+
|
|
235
|
+
@dataclass
|
|
236
|
+
class PredictionEnvelope:
|
|
237
|
+
prediction: float
|
|
238
|
+
confidence: float
|
|
239
|
+
prediction_set: List[str] # Conformal prediction set
|
|
240
|
+
lineage: dict
|
|
241
|
+
|
|
242
|
+
def predict(model, pipeline, mapie, X, model_meta: dict) -> PredictionEnvelope:
|
|
243
|
+
X_transformed = pipeline.transform(X)
|
|
244
|
+
proba = model.predict_proba(X_transformed)[:, 1]
|
|
245
|
+
|
|
246
|
+
# Conformal prediction set at 90% coverage
|
|
247
|
+
_, y_set = mapie.predict(X_transformed, alpha=0.1)
|
|
248
|
+
|
|
249
|
+
return PredictionEnvelope(
|
|
250
|
+
prediction=float(proba[0]),
|
|
251
|
+
confidence=float(max(proba[0], 1 - proba[0])),
|
|
252
|
+
prediction_set=[str(c) for c in y_set[0]],
|
|
253
|
+
lineage={
|
|
254
|
+
"model_name": model_meta["name"],
|
|
255
|
+
"model_version": model_meta["version"],
|
|
256
|
+
"build_hash": model_meta["build_hash"],
|
|
257
|
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
258
|
+
}
|
|
259
|
+
)
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
Never return a bare float from inference. The envelope is non-negotiable.
|
|
263
|
+
|
|
264
|
+
## Stage 8: Circuit Breaker
|
|
265
|
+
|
|
266
|
+
Wrap all inference calls in a circuit breaker. When the model degrades, return safe defaults — not errors.
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
import time
|
|
270
|
+
from enum import Enum
|
|
271
|
+
from collections import deque
|
|
272
|
+
|
|
273
|
+
class CircuitState(Enum):
    """Circuit breaker lifecycle states."""
    CLOSED = "closed"        # Normal operation
    OPEN = "open"            # Failing, return defaults
    HALF_OPEN = "half_open"  # Testing recovery

class CircuitBreaker:
    """Shields callers from a degrading inference stack.

    Tracks a rolling window (last 100 calls) of errors and latencies.
    Opens — returning safe defaults instead of calling through — when the
    error rate or p95 latency exceeds the configured thresholds, and
    probes recovery (HALF_OPEN) after a cooldown.
    """

    def __init__(self, error_threshold=0.05, latency_p95_ms=500, cooldown_s=60):
        """
        Args:
            error_threshold: max tolerated error rate over the window (0-1).
            latency_p95_ms: max tolerated p95 latency in milliseconds.
            cooldown_s: seconds to stay OPEN before probing recovery.
        """
        self.state = CircuitState.CLOSED
        self.error_threshold = error_threshold
        self.latency_p95_ms = latency_p95_ms
        self.cooldown_s = cooldown_s
        self.errors = deque(maxlen=100)      # 1 = failed call, 0 = success
        self.latencies = deque(maxlen=100)   # ms; successful calls only
        self.opened_at = None                # epoch seconds when last opened

    def call(self, fn, *args, safe_default=None, **kwargs):
        """Invoke *fn* through the breaker; return *safe_default* while OPEN."""
        if self.state == CircuitState.OPEN:
            if time.time() - self.opened_at > self.cooldown_s:
                self.state = CircuitState.HALF_OPEN
            else:
                return safe_default

        start = time.time()
        try:
            result = fn(*args, **kwargs)
            self.errors.append(0)
            self.latencies.append((time.time() - start) * 1000)
            if self.state == CircuitState.HALF_OPEN:
                self.state = CircuitState.CLOSED
            self._check_thresholds()
            return result
        except Exception:
            self.errors.append(1)
            if self.state == CircuitState.HALF_OPEN:
                self._open()
            self._check_thresholds()
            raise

    def _check_thresholds(self):
        """Open the circuit if the rolling window breaches a threshold."""
        # Need a minimum sample before judging health.
        if len(self.errors) < 20:
            return
        error_rate = sum(self.errors) / len(self.errors)
        # BUG FIX: latencies records successes only, so it can be empty (or
        # much shorter than errors) when calls fail — the original indexed
        # into it unguarded and raised IndexError. Clamp the p95 index and
        # treat "no successful calls" as zero latency (error rate decides).
        if self.latencies:
            ordered = sorted(self.latencies)
            p95_lat = ordered[min(int(len(ordered) * 0.95), len(ordered) - 1)]
        else:
            p95_lat = 0.0
        if error_rate > self.error_threshold or p95_lat > self.latency_p95_ms:
            self._open()

    def _open(self):
        """Transition to OPEN and record when, for cooldown accounting."""
        self.state = CircuitState.OPEN
        self.opened_at = time.time()
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
```
|
|
325
|
+
State transitions:
|
|
326
|
+
CLOSED → (error_rate > 5% OR p95_latency > 500ms) → OPEN
|
|
327
|
+
OPEN → (cooldown 60s elapsed) → HALF_OPEN
|
|
328
|
+
HALF_OPEN → (success) → CLOSED
|
|
329
|
+
HALF_OPEN → (failure) → OPEN
|
|
330
|
+
|
|
331
|
+
When OPEN: return safe_default, log alert, never raise to caller
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
## Stage 9: Drift Detection
|
|
335
|
+
|
|
336
|
+
Run drift checks on a schedule or on each inference batch. Alert immediately, do not silently degrade.
|
|
337
|
+
|
|
338
|
+
```python
|
|
339
|
+
from scipy.stats import ks_2samp
|
|
340
|
+
import pandas as pd
|
|
341
|
+
|
|
342
|
+
def check_drift(train_df: pd.DataFrame, prod_df: pd.DataFrame, features: list, alpha=0.05) -> list:
    """Flag features whose production distribution drifted from training.

    Runs a two-sample Kolmogorov-Smirnov test per feature (NaNs dropped)
    and reports every feature whose p-value falls below ``alpha``.
    Features absent from either frame are silently skipped.

    Returns:
        A list of alert dicts: feature, ks_stat, pvalue, severity.
    """
    drifted = []
    shared = (f for f in features if f in train_df.columns and f in prod_df.columns)
    for column in shared:
        baseline = train_df[column].dropna()
        observed = prod_df[column].dropna()
        statistic, p_value = ks_2samp(baseline, observed)
        if p_value >= alpha:
            continue
        drifted.append({
            "feature": column,
            "ks_stat": round(statistic, 4),
            "pvalue": round(p_value, 6),
            "severity": "high" if p_value < 0.001 else "medium",
        })
    return drifted
|
|
359
|
+
|
|
360
|
+
def log_drift_alerts(alerts: list, experiment_id: str):
    """Persist drift alerts for an experiment and announce them on stdout.

    No-op when there are no alerts. Writes the alert list as JSON under the
    experiment's artifacts directory.
    """
    if not alerts:
        return
    path = Path(f".ctx/ml/experiments/{experiment_id}/artifacts/drift_alerts.json")
    # The artifacts directory may not exist yet on a fresh experiment —
    # the original write_text failed with FileNotFoundError in that case.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(alerts, indent=2))
    print(f"[DRIFT] {len(alerts)} feature(s) drifted — see {path}")
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
## Stage 10: Conformal Prediction
|
|
369
|
+
|
|
370
|
+
All classifiers must support conformal prediction sets. Use MAPIE.
|
|
371
|
+
|
|
372
|
+
```python
|
|
373
|
+
from mapie.classification import MapieClassifier
|
|
374
|
+
|
|
375
|
+
def fit_conformal(model, X_train, y_train) -> MapieClassifier:
|
|
376
|
+
mapie = MapieClassifier(estimator=model, method="score", cv=5)
|
|
377
|
+
mapie.fit(X_train, y_train)
|
|
378
|
+
return mapie
|
|
379
|
+
|
|
380
|
+
def predict_with_coverage(mapie, X, alpha=0.1):
|
|
381
|
+
# alpha=0.1 → 90% marginal coverage guaranteed
|
|
382
|
+
y_pred, y_set = mapie.predict(X, alpha=alpha)
|
|
383
|
+
return y_pred, y_set
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
Coverage is a contract, not a suggestion. If the prediction set is too wide, the model needs retraining — not a tighter alpha.
|
|
387
|
+
|
|
388
|
+
## CI/CT/CD for ML
|
|
389
|
+
|
|
390
|
+
```
|
|
391
|
+
On pull request:
|
|
392
|
+
1. Validate schema (pandera)
|
|
393
|
+
2. Run feature pipeline (determinism check)
|
|
394
|
+
3. Train on sample data (smoke test)
|
|
395
|
+
4. Evaluate on holdout (metric regression gate)
|
|
396
|
+
5. Drift check on validation set
|
|
397
|
+
|
|
398
|
+
On merge to main:
|
|
399
|
+
1. Full training run
|
|
400
|
+
2. Full evaluation
|
|
401
|
+
3. Registry promotion check
|
|
402
|
+
4. Deploy if criteria met
|
|
403
|
+
|
|
404
|
+
On scheduled cadence (weekly):
|
|
405
|
+
1. Drift check on production traffic
|
|
406
|
+
2. Calibration check
|
|
407
|
+
3. Trigger retraining if drift thresholds exceeded
|
|
408
|
+
```
|
|
409
|
+
|
|
410
|
+
## Reproducibility Requirements
|
|
411
|
+
|
|
412
|
+
Every artifact directory must contain:
|
|
413
|
+
|
|
414
|
+
```
|
|
415
|
+
artifacts/
|
|
416
|
+
├── model.pkl # Serialized model
|
|
417
|
+
├── pipeline.pkl # Fitted feature pipeline
|
|
418
|
+
├── mapie.pkl # Fitted conformal wrapper
|
|
419
|
+
├── config.yaml # Exact config used for this run
|
|
420
|
+
├── metrics.json # All evaluation metrics
|
|
421
|
+
├── train.log # Full training log
|
|
422
|
+
├── plots/
|
|
423
|
+
│ ├── roc_curve.png
|
|
424
|
+
│ ├── calibration.png
|
|
425
|
+
│ └── feature_importance.png
|
|
426
|
+
└── hpo_study.pkl # If HPO was run
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
No artifact is complete without all of these files.
|
|
430
|
+
|
|
431
|
+
## Pipeline Wiring Rules
|
|
432
|
+
|
|
433
|
+
1. Feature pipeline is always fit on training data only. Validation and test data are transformed, never used to fit.
|
|
434
|
+
2. Conformal calibration uses a dedicated calibration split, not val or test.
|
|
435
|
+
3. Circuit breaker wraps the full inference stack, not just the model call.
|
|
436
|
+
4. Drift detection compares production distribution against training distribution. Never compare to previous week.
|
|
437
|
+
5. Promotion criteria are code, not conversation. No manual overrides.
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ctx-orchestrator
|
|
3
|
+
description: |
|
|
4
|
+
WHEN: User wants to run a multi-step workflow like plan→execute→verify, asks for "ctx pipeline", "ctx next", "ctx auto", or wants autonomous story execution.
|
|
5
|
+
WHEN NOT: Single agent invocation, simple questions, or non-CTX work.
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# CTX Orchestrator — Pipeline & Lifecycle Execution
|
|
9
|
+
|
|
10
|
+
You orchestrate multi-agent workflows by spawning CTX agents via the Agent tool.
|
|
11
|
+
|
|
12
|
+
## Phase Lifecycle
|
|
13
|
+
|
|
14
|
+
CTX follows a strict phase progression:
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
init → plan → execute → verify → complete
|
|
18
|
+
↑ ↓
|
|
19
|
+
←── (fix failures)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## How to Advance Phases
|
|
23
|
+
|
|
24
|
+
Read `.ctx/STATE.json` to determine current phase, then spawn the right agent:
|
|
25
|
+
|
|
26
|
+
| Current Phase | Action | Agent to Spawn |
|
|
27
|
+
|---------------|--------|----------------|
|
|
28
|
+
| init | Plan the story | `subagent_type: "ctx-planner"` |
|
|
29
|
+
| plan | Execute the plan | `subagent_type: "ctx-executor"` |
|
|
30
|
+
| execute | Verify acceptance criteria | `subagent_type: "ctx-verifier"` |
|
|
31
|
+
| verify (pass) | Mark complete | Update STATE.json + PRD.json |
|
|
32
|
+
| verify (fail) | Fix and retry | `subagent_type: "ctx-executor"` with failure context |
|
|
33
|
+
|
|
34
|
+
## Pipeline Execution
|
|
35
|
+
|
|
36
|
+
When the user requests a pipeline (e.g., "plan, execute, verify"):
|
|
37
|
+
|
|
38
|
+
1. Read `.ctx/STATE.json` for current state
|
|
39
|
+
2. For each step in the pipeline:
|
|
40
|
+
a. Spawn the agent using the Agent tool with the appropriate `subagent_type`
|
|
41
|
+
b. Wait for completion
|
|
42
|
+
c. Update `.ctx/STATE.json` with the result
|
|
43
|
+
d. Pass output summary as context to the next agent
|
|
44
|
+
3. If any step fails, halt and report
|
|
45
|
+
|
|
46
|
+
Example agent spawn:
|
|
47
|
+
```
|
|
48
|
+
Agent({
|
|
49
|
+
subagent_type: "ctx-planner",
|
|
50
|
+
prompt: "Plan story S001: <title>. Acceptance criteria: <list>. Write plan to .ctx/phases/S001/PLAN.md",
|
|
51
|
+
description: "Plan story S001"
|
|
52
|
+
})
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Autonomous Mode
|
|
56
|
+
|
|
57
|
+
When user requests "ctx auto" or autonomous execution:
|
|
58
|
+
|
|
59
|
+
1. Read `.ctx/PRD.json` for pending stories (sorted by priority)
|
|
60
|
+
2. For each story:
|
|
61
|
+
a. Set as active in STATE.json
|
|
62
|
+
b. Run pipeline: plan → execute → verify
|
|
63
|
+
c. If verify passes: mark story passed, commit, continue to next
|
|
64
|
+
d. If verify fails: retry up to 3 times, then skip and log failure
|
|
65
|
+
3. Check for `.ctx/STOP` file before each story (graceful halt)
|
|
66
|
+
4. Write summary to `.ctx/AUTO-LOG.md`
|
|
67
|
+
|
|
68
|
+
## State Management
|
|
69
|
+
|
|
70
|
+
Always update `.ctx/STATE.json` after phase transitions:
|
|
71
|
+
|
|
72
|
+
```json
|
|
73
|
+
{
|
|
74
|
+
"version": "4.0",
|
|
75
|
+
"phase": "<current phase>",
|
|
76
|
+
"activeStory": "<story ID>",
|
|
77
|
+
"storyTitle": "<title>",
|
|
78
|
+
"completedTasks": [],
|
|
79
|
+
"agentHistory": [
|
|
80
|
+
{ "agent": "<name>", "invokedAt": "<ISO>", "completedAt": "<ISO>", "taskSummary": "<text>" }
|
|
81
|
+
]
|
|
82
|
+
}
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Rules
|
|
86
|
+
|
|
87
|
+
- NEVER skip phases. init→plan→execute→verify is mandatory.
|
|
88
|
+
- ALWAYS spawn agents via `Agent` tool with `subagent_type` — never shell out to `claude` CLI.
|
|
89
|
+
- ALWAYS update STATE.json after each phase transition.
|
|
90
|
+
- Max 3 retries per story before skipping.
|
|
91
|
+
- Check `.ctx/STOP` file before starting each story in auto mode.
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ctx-review-gate
|
|
3
|
+
description: |
|
|
4
|
+
WHEN: Code has been implemented and needs quality verification before marking a story complete. Runs three-stage review: spec compliance, code quality, and optional cross-model adversarial review via OpenAI Codex.
|
|
5
|
+
WHEN NOT: During planning, research, or when review gate is disabled in config.
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# CTX Three-Stage Review Gate
|
|
9
|
+
|
|
10
|
+
Automated quality gate that runs after execution and before verification.
|
|
11
|
+
|
|
12
|
+
## Three Stages
|
|
13
|
+
|
|
14
|
+
### Stage 1: Spec Compliance (ctx-reviewer)
|
|
15
|
+
Checks whether the code satisfies the story's acceptance criteria.
|
|
16
|
+
|
|
17
|
+
Spawn:
|
|
18
|
+
```
|
|
19
|
+
Agent({
|
|
20
|
+
subagent_type: "ctx-reviewer",
|
|
21
|
+
prompt: "Review recent changes for SPEC COMPLIANCE against story <ID> acceptance criteria: <list>. Output VERDICT: PASS or FAIL with ISSUES list.",
|
|
22
|
+
description: "Spec compliance review"
|
|
23
|
+
})
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Stage 2: Code Quality (ctx-reviewer)
|
|
27
|
+
Reuses ctx-reviewer with a quality-focused prompt: security, performance, error handling, style. **Only runs if Stage 1 passes.**
|
|
28
|
+
|
|
29
|
+
(Note: earlier versions of this skill called `ctx-auditor` here. That was a miscast — `ctx-auditor` is an audit-trail/compliance agent, not a code-quality reviewer. `ctx-reviewer` already covers type checks, imports, security scans, and best-practice enforcement, so it handles both stages with different framings.)
|
|
30
|
+
|
|
31
|
+
Spawn:
|
|
32
|
+
```
|
|
33
|
+
Agent({
|
|
34
|
+
subagent_type: "ctx-reviewer",
|
|
35
|
+
prompt: "Review recent changes for CODE QUALITY. Check: security vulnerabilities, performance, error handling, style. Output VERDICT: PASS or FAIL with ISSUES list.",
|
|
36
|
+
description: "Code quality review"
|
|
37
|
+
})
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Stage 3: Cross-Model Review (ctx-codex-reviewer) — optional
|
|
41
|
+
Sends the diff to OpenAI Codex via MCP for a second-pair-of-eyes review with different model priors. **Only runs if Stage 2 passes AND `config.codexReview !== false`.**
|
|
42
|
+
|
|
43
|
+
Short-circuits on docs-only, test-only, or trivial (<20 LOC) diffs. Fails soft — if the Codex MCP is unavailable, rate-limited, or unauthenticated, returns `SKIP` rather than `FAIL` so infrastructure problems never block the gate.
|
|
44
|
+
|
|
45
|
+
Spawn:
|
|
46
|
+
```
|
|
47
|
+
Agent({
|
|
48
|
+
subagent_type: "ctx-codex-reviewer",
|
|
49
|
+
prompt: "Cross-model review story <ID>. Dispatch the current diff to Codex via mcp__codex__codex with sandbox=read-only. Acceptance criteria: <list>. Output VERDICT: PASS, FAIL, or SKIP.",
|
|
50
|
+
description: "Codex adversarial review"
|
|
51
|
+
})
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Prerequisites (user-side, not automated by CTX):
|
|
55
|
+
- Codex CLI installed (`npm i -g @openai/codex`)
|
|
56
|
+
- Signed in via ChatGPT subscription (`codex login` — no `--api-key` flag)
|
|
57
|
+
- MCP registered (`claude mcp add codex -- codex mcp-server`)
|
|
58
|
+
|
|
59
|
+
## Flow
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
executor completes
|
|
63
|
+
│
|
|
64
|
+
▼
|
|
65
|
+
Stage 1: ctx-reviewer (spec compliance)
|
|
66
|
+
│
|
|
67
|
+
├── FAIL → Feed issues back to executor, increment cycle
|
|
68
|
+
│
|
|
69
|
+
▼ PASS
|
|
70
|
+
Stage 2: ctx-reviewer (code quality)
|
|
71
|
+
│
|
|
72
|
+
├── FAIL → Feed issues back to executor, increment cycle
|
|
73
|
+
│
|
|
74
|
+
▼ PASS
|
|
75
|
+
Stage 3: ctx-codex-reviewer (cross-model, if enabled)
|
|
76
|
+
│
|
|
77
|
+
├── FAIL → Feed issues back to executor, increment cycle
|
|
78
|
+
├── SKIP → Treat as pass (infra problem, not code problem)
|
|
79
|
+
│
|
|
80
|
+
▼ PASS
|
|
81
|
+
Mark story for verification
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Retry Logic
|
|
85
|
+
|
|
86
|
+
- Max 3 review cycles per story
|
|
87
|
+
- Each cycle: execute → spec review → quality review → (optional) codex review
|
|
88
|
+
- If cycle 3 fails: **ESCALATE** — halt and ask human for review
|
|
89
|
+
- Record each cycle in `.ctx/STATE.json` under `reviewGate.history`
|
|
90
|
+
|
|
91
|
+
## Review Result Format
|
|
92
|
+
|
|
93
|
+
Expect agents to output:
|
|
94
|
+
```
|
|
95
|
+
VERDICT: PASS
|
|
96
|
+
```
|
|
97
|
+
or:
|
|
98
|
+
```
|
|
99
|
+
VERDICT: FAIL
|
|
100
|
+
ISSUES:
|
|
101
|
+
- Missing error handling in auth middleware
|
|
102
|
+
- No test for edge case X
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Parse the VERDICT line to determine pass/fail. Extract ISSUES for feedback.
|
|
106
|
+
|
|
107
|
+
## State Tracking
|
|
108
|
+
|
|
109
|
+
Update `.ctx/STATE.json`:
|
|
110
|
+
```json
|
|
111
|
+
{
|
|
112
|
+
"reviewGate": {
|
|
113
|
+
"cycle": 2,
|
|
114
|
+
"history": [
|
|
115
|
+
{ "cycle": 1, "timestamp": "ISO", "stage1": { "passed": true }, "stage2": { "passed": false, "issues": "..." }, "stage3": null, "result": "fail" },
|
|
116
|
+
{ "cycle": 2, "timestamp": "ISO", "stage1": { "passed": true }, "stage2": { "passed": true }, "stage3": { "passed": true, "threadId": "thr_...", "skipped": false }, "result": "pass" }
|
|
117
|
+
]
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
`stage3` is `null` when Stage 2 fails (not reached) or when `codexReview` is disabled. When Stage 3 runs, record `threadId` so follow-ups reuse the same Codex session.
|
|
123
|
+
|
|
124
|
+
## Save Review Artifacts
|
|
125
|
+
|
|
126
|
+
Write review results to `.ctx/reviews/<story-id>-<timestamp>.json`.
|
|
127
|
+
|
|
128
|
+
## Configuration
|
|
129
|
+
|
|
130
|
+
Review gate can be disabled entirely:
|
|
131
|
+
- Check `.ctx/config.json` for `"reviewGate": false`
|
|
132
|
+
- If disabled, skip directly to verification
|
|
133
|
+
|
|
134
|
+
Stage 3 (Codex cross-review) can be disabled independently:
|
|
135
|
+
- Check `.ctx/config.json` for `"codexReview": false`
|
|
136
|
+
- Useful when offline, when the ChatGPT rate-limit budget is depleted, or when the change is trivial
|
|
137
|
+
- Stages 1 and 2 continue to run normally
|
|
138
|
+
|
|
139
|
+
## Rules
|
|
140
|
+
|
|
141
|
+
- ALWAYS run Stage 1 before Stage 2, Stage 2 before Stage 3 (fail-fast ordering)
|
|
142
|
+
- NEVER run Stage 2 if Stage 1 fails
|
|
143
|
+
- NEVER run Stage 3 if Stage 2 fails, or if `codexReview === false`
|
|
144
|
+
- Stage 3 SKIP (infrastructure failure) is NOT a gate failure — treat as pass
|
|
145
|
+
- ALWAYS feed review issues back to executor as context on retry
|
|
146
|
+
- Max 3 cycles — then escalate to human
|
|
147
|
+
- Record every cycle in state, including `stage3: null` when not reached
|