martin-loop 0.1.5 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CODE_OF_CONDUCT.md +32 -0
- package/LICENSE +21 -21
- package/README.md +307 -398
- package/demo/seeded-workspace/README.md +35 -35
- package/demo/seeded-workspace/TASKS.md +29 -29
- package/demo/seeded-workspace/martin.config.yaml +11 -11
- package/demo/seeded-workspace/package.json +8 -8
- package/demo/seeded-workspace/src/invoice-summary.js +11 -11
- package/demo/seeded-workspace/test/invoice-summary.test.js +20 -20
- package/dist/bin/martin-loop.js +0 -0
- package/dist/vendor/adapters/counter.d.ts +1 -0
- package/dist/vendor/adapters/counter.js +4 -0
- package/dist/vendor/adapters/git-baseline.d.ts +50 -0
- package/dist/vendor/adapters/git-baseline.js +233 -0
- package/dist/vendor/adapters/openrouter-adapter.d.ts +15 -0
- package/dist/vendor/adapters/openrouter-adapter.js +302 -0
- package/dist/vendor/adapters/usage.d.ts +48 -0
- package/dist/vendor/adapters/usage.js +66 -0
- package/dist/vendor/cli/bin/exit.d.ts +12 -0
- package/dist/vendor/cli/bin/exit.js +28 -0
- package/dist/vendor/cli/commands/analyze.d.ts +5 -0
- package/dist/vendor/cli/commands/analyze.js +58 -0
- package/dist/vendor/cli/commands/audit-log-verify.d.ts +34 -0
- package/dist/vendor/cli/commands/audit-log-verify.js +99 -0
- package/dist/vendor/cli/commands/audit.d.ts +8 -0
- package/dist/vendor/cli/commands/audit.js +199 -0
- package/dist/vendor/cli/commands/corpus.d.ts +5 -0
- package/dist/vendor/cli/commands/corpus.js +60 -0
- package/dist/vendor/cli/commands/doctor.d.ts +8 -0
- package/dist/vendor/cli/commands/doctor.js +219 -0
- package/dist/vendor/cli/commands/explain.d.ts +17 -0
- package/dist/vendor/cli/commands/explain.js +176 -0
- package/dist/vendor/cli/commands/export.d.ts +5 -0
- package/dist/vendor/cli/commands/export.js +60 -0
- package/dist/vendor/cli/commands/governance.d.ts +8 -0
- package/dist/vendor/cli/commands/governance.js +95 -0
- package/dist/vendor/cli/commands/improve.d.ts +18 -0
- package/dist/vendor/cli/commands/improve.js +396 -0
- package/dist/vendor/cli/commands/init.d.ts +8 -0
- package/dist/vendor/cli/commands/init.js +281 -0
- package/dist/vendor/cli/commands/migration.d.ts +8 -0
- package/dist/vendor/cli/commands/migration.js +67 -0
- package/dist/vendor/cli/commands/prior.d.ts +23 -0
- package/dist/vendor/cli/commands/prior.js +145 -0
- package/dist/vendor/cli/commands/resume.d.ts +21 -0
- package/dist/vendor/cli/commands/resume.js +73 -0
- package/dist/vendor/cli/commands/verify.d.ts +6 -0
- package/dist/vendor/cli/commands/verify.js +43 -0
- package/dist/vendor/cli/research/public-corpus.d.ts +43 -0
- package/dist/vendor/cli/research/public-corpus.js +151 -0
- package/dist/vendor/cli/ui/error-card.d.ts +38 -0
- package/dist/vendor/cli/ui/error-card.js +103 -0
- package/dist/vendor/cli/ui/mission-brief.d.ts +41 -0
- package/dist/vendor/cli/ui/mission-brief.js +173 -0
- package/dist/vendor/cli/ui/summary-card.d.ts +34 -0
- package/dist/vendor/cli/ui/summary-card.js +102 -0
- package/dist/vendor/contracts/audit.d.ts +46 -0
- package/dist/vendor/contracts/audit.js +360 -0
- package/dist/vendor/contracts/post-phase15.d.ts +240 -0
- package/dist/vendor/contracts/post-phase15.js +166 -0
- package/dist/vendor/core/agent/mandates.d.ts +46 -0
- package/dist/vendor/core/agent/mandates.js +178 -0
- package/dist/vendor/core/agent/receipts.d.ts +38 -0
- package/dist/vendor/core/agent/receipts.js +131 -0
- package/dist/vendor/core/agent/signing.d.ts +17 -0
- package/dist/vendor/core/agent/signing.js +91 -0
- package/dist/vendor/core/attestation/sign.d.ts +25 -0
- package/dist/vendor/core/attestation/sign.js +216 -0
- package/dist/vendor/core/autonomy/autonomous-promotion.d.ts +120 -0
- package/dist/vendor/core/autonomy/autonomous-promotion.js +346 -0
- package/dist/vendor/core/autonomy/envelope-v2.d.ts +29 -0
- package/dist/vendor/core/autonomy/envelope-v2.js +60 -0
- package/dist/vendor/core/autonomy/envelope.d.ts +17 -0
- package/dist/vendor/core/autonomy/envelope.js +27 -0
- package/dist/vendor/core/autonomy/escalation-ledger.d.ts +20 -0
- package/dist/vendor/core/autonomy/escalation-ledger.js +18 -0
- package/dist/vendor/core/autonomy/resume.d.ts +15 -0
- package/dist/vendor/core/autonomy/resume.js +23 -0
- package/dist/vendor/core/circuit/circuit-breaker.d.ts +60 -0
- package/dist/vendor/core/circuit/circuit-breaker.js +143 -0
- package/dist/vendor/core/context-distillation.d.ts +3 -0
- package/dist/vendor/core/context-distillation.js +44 -0
- package/dist/vendor/core/context-flow/compile-context.d.ts +8 -0
- package/dist/vendor/core/context-flow/compile-context.js +111 -0
- package/dist/vendor/core/context-flow/entities.d.ts +2 -0
- package/dist/vendor/core/context-flow/entities.js +44 -0
- package/dist/vendor/core/context-flow/evaluate-policy.d.ts +2 -0
- package/dist/vendor/core/context-flow/evaluate-policy.js +42 -0
- package/dist/vendor/core/context-flow/index.d.ts +11 -0
- package/dist/vendor/core/context-flow/index.js +24 -0
- package/dist/vendor/core/context-flow/labels.d.ts +3 -0
- package/dist/vendor/core/context-flow/labels.js +17 -0
- package/dist/vendor/core/context-flow/normalizer.d.ts +9 -0
- package/dist/vendor/core/context-flow/normalizer.js +69 -0
- package/dist/vendor/core/context-flow/profiles.d.ts +33 -0
- package/dist/vendor/core/context-flow/profiles.js +36 -0
- package/dist/vendor/core/context-flow/redaction.d.ts +1 -0
- package/dist/vendor/core/context-flow/redaction.js +6 -0
- package/dist/vendor/core/context-flow/sensitivity.d.ts +2 -0
- package/dist/vendor/core/context-flow/sensitivity.js +27 -0
- package/dist/vendor/core/context-flow/sync-preview.d.ts +2 -0
- package/dist/vendor/core/context-flow/sync-preview.js +22 -0
- package/dist/vendor/core/context-flow/token-estimator.d.ts +3 -0
- package/dist/vendor/core/context-flow/token-estimator.js +13 -0
- package/dist/vendor/core/context-flow/types.d.ts +91 -0
- package/dist/vendor/core/context-flow/types.js +2 -0
- package/dist/vendor/core/context-utility.d.ts +47 -0
- package/dist/vendor/core/context-utility.js +405 -0
- package/dist/vendor/core/cost/pipeline.d.ts +92 -0
- package/dist/vendor/core/cost/pipeline.js +141 -0
- package/dist/vendor/core/cost/tagged-cost.d.ts +27 -0
- package/dist/vendor/core/cost/tagged-cost.js +55 -0
- package/dist/vendor/core/cost-governor.d.ts +2 -0
- package/dist/vendor/core/cost-governor.js +50 -0
- package/dist/vendor/core/cve/cve-check.d.ts +80 -0
- package/dist/vendor/core/cve/cve-check.js +172 -0
- package/dist/vendor/core/digital-twin/index.d.ts +27 -0
- package/dist/vendor/core/digital-twin/index.js +90 -0
- package/dist/vendor/core/drift/drift-graph.d.ts +47 -0
- package/dist/vendor/core/drift/drift-graph.js +100 -0
- package/dist/vendor/core/drift/objective-lock.d.ts +69 -0
- package/dist/vendor/core/drift/objective-lock.js +88 -0
- package/dist/vendor/core/drift/scope.d.ts +46 -0
- package/dist/vendor/core/drift/scope.js +102 -0
- package/dist/vendor/core/drift/signature-lock.d.ts +48 -0
- package/dist/vendor/core/drift/signature-lock.js +202 -0
- package/dist/vendor/core/drift/stale-proof-gate.d.ts +21 -0
- package/dist/vendor/core/drift/stale-proof-gate.js +19 -0
- package/dist/vendor/core/eval/known-bad-world-runner.d.ts +24 -0
- package/dist/vendor/core/eval/known-bad-world-runner.js +256 -0
- package/dist/vendor/core/evidence/claim-audit.d.ts +18 -0
- package/dist/vendor/core/evidence/claim-audit.js +89 -0
- package/dist/vendor/core/exit-intelligence.d.ts +2 -0
- package/dist/vendor/core/exit-intelligence.js +58 -0
- package/dist/vendor/core/explain/formatter.d.ts +42 -0
- package/dist/vendor/core/explain/formatter.js +171 -0
- package/dist/vendor/core/explain/timeline.d.ts +29 -0
- package/dist/vendor/core/explain/timeline.js +213 -0
- package/dist/vendor/core/failure-taxonomy.d.ts +2 -0
- package/dist/vendor/core/failure-taxonomy.js +76 -0
- package/dist/vendor/core/gateway/index.d.ts +10 -0
- package/dist/vendor/core/gateway/index.js +12 -0
- package/dist/vendor/core/gateway/registry.d.ts +40 -0
- package/dist/vendor/core/gateway/registry.js +97 -0
- package/dist/vendor/core/gateway/transport.d.ts +31 -0
- package/dist/vendor/core/gateway/transport.js +82 -0
- package/dist/vendor/core/gateway/vault.d.ts +19 -0
- package/dist/vendor/core/gateway/vault.js +29 -0
- package/dist/vendor/core/graph/adapters.d.ts +43 -0
- package/dist/vendor/core/graph/adapters.js +91 -0
- package/dist/vendor/core/graph/hotspots.d.ts +22 -0
- package/dist/vendor/core/graph/hotspots.js +30 -0
- package/dist/vendor/core/graph/index.d.ts +1 -0
- package/dist/vendor/core/graph/index.js +2 -0
- package/dist/vendor/core/honey/honey-tokens.d.ts +32 -0
- package/dist/vendor/core/honey/honey-tokens.js +44 -0
- package/dist/vendor/core/index.d.ts +2 -2
- package/dist/vendor/core/index.js +38 -12
- package/dist/vendor/core/learning/bayesian-update.d.ts +31 -0
- package/dist/vendor/core/learning/bayesian-update.js +60 -0
- package/dist/vendor/core/learning/prior-sets.d.ts +42 -0
- package/dist/vendor/core/learning/prior-sets.js +111 -0
- package/dist/vendor/core/learning/promotion-gate.d.ts +17 -0
- package/dist/vendor/core/learning/promotion-gate.js +23 -0
- package/dist/vendor/core/leash/blast-radius.d.ts +42 -0
- package/dist/vendor/core/leash/blast-radius.js +156 -0
- package/dist/vendor/core/leash/policy-leash.d.ts +31 -0
- package/dist/vendor/core/leash/policy-leash.js +117 -0
- package/dist/vendor/core/memo/memo.d.ts +63 -0
- package/dist/vendor/core/memo/memo.js +97 -0
- package/dist/vendor/core/memory/learning-pipeline.d.ts +154 -0
- package/dist/vendor/core/memory/learning-pipeline.js +391 -0
- package/dist/vendor/core/memory/palace.d.ts +84 -0
- package/dist/vendor/core/memory/palace.js +379 -0
- package/dist/vendor/core/merge/ast-merge.d.ts +22 -0
- package/dist/vendor/core/merge/ast-merge.js +350 -0
- package/dist/vendor/core/merge/text-merge.d.ts +12 -0
- package/dist/vendor/core/merge/text-merge.js +182 -0
- package/dist/vendor/core/otel/tracer.d.ts +45 -0
- package/dist/vendor/core/otel/tracer.js +116 -0
- package/dist/vendor/core/parallel/parallel-attempts.d.ts +28 -0
- package/dist/vendor/core/parallel/parallel-attempts.js +41 -0
- package/dist/vendor/core/parallel/scorer.d.ts +24 -0
- package/dist/vendor/core/parallel/scorer.js +65 -0
- package/dist/vendor/core/pattern-detection.d.ts +64 -0
- package/dist/vendor/core/pattern-detection.js +108 -0
- package/dist/vendor/core/persistence/checkpoint.d.ts +44 -0
- package/dist/vendor/core/persistence/checkpoint.js +156 -0
- package/dist/vendor/core/persistence/cleanup.d.ts +22 -0
- package/dist/vendor/core/persistence/cleanup.js +131 -0
- package/dist/vendor/core/persistence/index.d.ts +2 -0
- package/dist/vendor/core/persistence/index.js +1 -0
- package/dist/vendor/core/persistence/runs-reader.d.ts +52 -0
- package/dist/vendor/core/persistence/runs-reader.js +84 -0
- package/dist/vendor/core/persistence/store.d.ts +6 -1
- package/dist/vendor/core/persistence/store.js +5 -0
- package/dist/vendor/core/policy/file-touch-quota.d.ts +60 -0
- package/dist/vendor/core/policy/file-touch-quota.js +105 -0
- package/dist/vendor/core/policy/policy-loader.d.ts +30 -0
- package/dist/vendor/core/policy/policy-loader.js +170 -0
- package/dist/vendor/core/policy/policy-schema.d.ts +55 -0
- package/dist/vendor/core/policy/policy-schema.js +78 -0
- package/dist/vendor/core/probe/probe.d.ts +49 -0
- package/dist/vendor/core/probe/probe.js +115 -0
- package/dist/vendor/core/proof/patch-proof.d.ts +58 -0
- package/dist/vendor/core/proof/patch-proof.js +84 -0
- package/dist/vendor/core/proof/semantic-probe.d.ts +25 -0
- package/dist/vendor/core/proof/semantic-probe.js +82 -0
- package/dist/vendor/core/recovery/failure-mode-runner.d.ts +29 -0
- package/dist/vendor/core/recovery/failure-mode-runner.js +39 -0
- package/dist/vendor/core/red-blue/red-phase.d.ts +64 -0
- package/dist/vendor/core/red-blue/red-phase.js +141 -0
- package/dist/vendor/core/red-blue/risk-tiers.d.ts +22 -0
- package/dist/vendor/core/red-blue/risk-tiers.js +33 -0
- package/dist/vendor/core/replay/replay.d.ts +85 -0
- package/dist/vendor/core/replay/replay.js +109 -0
- package/dist/vendor/core/router/engine.d.ts +54 -0
- package/dist/vendor/core/router/engine.js +131 -0
- package/dist/vendor/core/router/index.d.ts +1 -0
- package/dist/vendor/core/router/index.js +2 -0
- package/dist/vendor/core/router/trust-calibration.d.ts +57 -0
- package/dist/vendor/core/router/trust-calibration.js +127 -0
- package/dist/vendor/core/run-martin.d.ts +2 -0
- package/dist/vendor/core/run-martin.js +287 -0
- package/dist/vendor/core/security/cve-scanner.d.ts +62 -0
- package/dist/vendor/core/security/cve-scanner.js +178 -0
- package/dist/vendor/core/sentinel/efficiency-sentinel.d.ts +29 -0
- package/dist/vendor/core/sentinel/efficiency-sentinel.js +30 -0
- package/dist/vendor/core/sentinel/progress-guard.d.ts +35 -0
- package/dist/vendor/core/sentinel/progress-guard.js +46 -0
- package/dist/vendor/core/siem/siem-emitter.d.ts +49 -0
- package/dist/vendor/core/siem/siem-emitter.js +157 -0
- package/dist/vendor/core/strategy/attempt-brief.d.ts +22 -0
- package/dist/vendor/core/strategy/attempt-brief.js +89 -0
- package/dist/vendor/core/summarize/diff-summary.d.ts +35 -0
- package/dist/vendor/core/summarize/diff-summary.js +204 -0
- package/dist/vendor/core/surface-signals.d.ts +21 -0
- package/dist/vendor/core/surface-signals.js +139 -0
- package/dist/vendor/core/truth/truth-wall.d.ts +51 -0
- package/dist/vendor/core/truth/truth-wall.js +69 -0
- package/dist/vendor/core/truth-spine.d.ts +26 -0
- package/dist/vendor/core/truth-spine.js +62 -0
- package/dist/vendor/core/types.d.ts +115 -0
- package/dist/vendor/core/types.js +2 -0
- package/dist/vendor/core/verification/tiered-verify.d.ts +17 -0
- package/dist/vendor/core/verification/tiered-verify.js +29 -0
- package/dist/vendor/core/verifier-pyramid.d.ts +32 -0
- package/dist/vendor/core/verifier-pyramid.js +111 -0
- package/dist/vendor/core/workflow-artifacts.d.ts +99 -0
- package/dist/vendor/core/workflow-artifacts.js +668 -0
- package/dist/vendor/core/wrap/supervised-run.d.ts +96 -0
- package/dist/vendor/core/wrap/supervised-run.js +178 -0
- package/docs/assets/cli-animated.svg +139 -0
- package/docs/assets/cli-static.svg +34 -0
- package/docs/assets/github-hero-v2.svg +23 -0
- package/docs/assets/martin-raplph.png.jpg +0 -0
- package/docs/assets/martinloop-logo.png +0 -0
- package/docs/assets/nvidia-inception-program-light.png +0 -0
- package/docs/assets/nvidia-inception-program.png +0 -0
- package/docs/assets/phase3c-sidesidebyside-demo.html +228 -0
- package/docs/assets/side-by-side.svg +134 -0
- package/docs/oss/CLAUDE-CODE-WALKTHROUGH.md +142 -142
- package/docs/oss/EXAMPLES.md +134 -134
- package/docs/oss/OSS-BOUNDARY-REPORT.json +1 -1
- package/docs/oss/OSS-BOUNDARY-REPORT.md +1 -1
- package/docs/oss/QUICKSTART.md +170 -165
- package/docs/oss/RALPH-LOOP-SAFETY.md +113 -113
- package/docs/oss/README.md +96 -96
- package/docs/oss/RELEASE-SURFACE-REPORT.json +2 -1
- package/docs/oss/RELEASE-SURFACE-REPORT.md +2 -1
- package/package.json +130 -58
- package/docs/distribution/DIRECTORY-SUBMISSIONS.md +0 -89
- package/docs/distribution/INTEGRATION-OUTREACH.md +0 -61
- package/docs/distribution/UNDER-3-CHALLENGE.md +0 -65
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import { PROBE_COUNTS, RED_PHASE_MODEL, resolveRedBudgetPolicy } from "./risk-tiers.js";
|
|
2
|
+
// ─── Programmatic probes ──────────────────────────────────────────────────────
|
|
3
|
+
/**
 * Deterministic red-team scan of a patch; never calls a model.
 *
 * Four regex checks run against `patch.diff` at every tier. When `paranoid`
 * is true, two extra checks run: a manifest-change check that also reads
 * `patch.changedFiles`, and an exported-symbol-removal check.
 *
 * @param patch    Object exposing `diff` (string) and `changedFiles` (string array).
 * @param paranoid Enables the two additional paranoid-only checks.
 * @returns Findings (`trapId`, `severity`, `description`) in probe order.
 */
function runProgrammaticProbes(patch, paranoid) {
    const diffText = patch.diff;
    // Probes 1–4: each one is a single regex over the whole diff text.
    const standardProbes = [
        {
            // Probe 1: assertion deletion
            matcher: /^\-.*expect\(.*\)\.to/m,
            trapId: "T01",
            severity: "warn",
            description: "Patch removes test assertions — possible weak-test false green."
        },
        {
            // Probe 2: grounding evasion pragma
            matcher: /@martin-ignore|@ts-nocheck|eslint-disable/i,
            trapId: "T11",
            severity: "block",
            description: "Grounding evasion pragma detected in patch."
        },
        {
            // Probe 3: context poisoning
            matcher: /\.martin\/|\.claude\//,
            trapId: "T07",
            severity: "block",
            description: "Patch writes to agent context directory (.martin/ or .claude/)."
        },
        {
            // Probe 4: budget self-report
            matcher: /budgetSource.*model_reported|model_reported.*budget/i,
            trapId: "T10",
            severity: "block",
            description: "Patch introduces model-reported budget sourcing."
        }
    ];
    const findings = standardProbes
        .filter((probe) => probe.matcher.test(diffText))
        .map(({ trapId, severity, description }) => ({ trapId, severity, description }));
    if (!paranoid) {
        return findings;
    }
    // Probe 5: scope creep — an added `"key": "value"` line while a manifest
    // file is among the changed files. The file list is only read when the
    // diff regex has already matched.
    const addsQuotedPair = /^\+.*"[^"]+"\s*:\s*"[^"]+".*$/m.test(diffText);
    if (addsQuotedPair && /package\.json|Cargo\.toml|go\.mod/i.test(patch.changedFiles.join(","))) {
        findings.push({
            trapId: "T03",
            severity: "warn",
            description: "Paranoid scan: substantive manifest change detected."
        });
    }
    // Probe 6: silent revert — a removed line that declares an export.
    const removesExport = /^\-.*export\s+(function|const|class)\s+\w+/m.test(diffText);
    if (removesExport) {
        findings.push({
            trapId: "T02",
            severity: "warn",
            description: "Paranoid scan: exported symbol removed — potential silent revert."
        });
    }
    return findings;
}
|
|
63
|
+
// ─── Red phase runner ─────────────────────────────────────────────────────────
|
|
64
|
+
/**
 * Runs the Red phase for a given patch and risk tier.
 *
 * - baseline: programmatic probes only, no model call
 * - high_risk: paranoid programmatic scan, no model call
 * - release_critical: paranoid scan + one model call, made only when
 *   `options.modelClient` is supplied
 *
 * The model client's reply is treated as untrusted: a missing or non-array
 * `findings` contributes no findings, and a missing or non-finite `costUsd`
 * contributes 0, so a malformed reply cannot throw here or turn
 * `budgetUsedUsd` into NaN.
 *
 * `probesRun` is taken from `PROBE_COUNTS[tier]`; it is not a count of the
 * checks this call executed. `policy.redBudgetCapUsd` is computed by the
 * policy helper but is not consulted in this function.
 *
 * @param patch         Object exposing `diff` and `changedFiles`.
 * @param tier          Risk tier: "baseline", "high_risk" or "release_critical".
 * @param blueBudgetUsd Blue phase budget forwarded to `resolveRedBudgetPolicy`.
 * @param options       Optional `modelClient` (must expose `complete(prompt)`)
 *                      and optional `onLedgerEvent` callback.
 * @returns The Red findings record; a rejection from `modelClient.complete`
 *          propagates to the caller.
 */
export async function runRedPhase(patch, tier, blueBudgetUsd, options = {}) {
    const policy = resolveRedBudgetPolicy(tier, blueBudgetUsd);
    const paranoid = tier !== "baseline";
    let findings = runProgrammaticProbes(patch, paranoid);
    let modelCallMade = false;
    let modelUsed;
    let budgetUsedUsd = 0;
    const probesRun = PROBE_COUNTS[tier];
    if (policy.modelCallAllowed && options.modelClient) {
        const prompt = buildRedPhasePrompt(patch, findings);
        const modelReply = await options.modelClient.complete(prompt);
        // Normalize the reply before merging: spreading a non-array would throw.
        const modelFindings = Array.isArray(modelReply?.findings) ? modelReply.findings : [];
        findings = [...findings, ...modelFindings];
        modelCallMade = true;
        modelUsed = RED_PHASE_MODEL;
        // Only add a real, finite number so the running total stays numeric.
        const reportedCost = modelReply?.costUsd;
        if (typeof reportedCost === "number" && Number.isFinite(reportedCost)) {
            budgetUsedUsd += reportedCost;
        }
    }
    const result = {
        riskTier: tier,
        probesRun,
        findingsCount: findings.length,
        findings,
        modelCallMade,
        // `modelUsed` is present only when a model call was actually made.
        ...(modelUsed !== undefined ? { modelUsed } : {}),
        budgetUsedUsd
    };
    // Emit ledger event (summary only; the findings themselves are not included)
    options.onLedgerEvent?.({
        type: "red_phase_findings",
        riskTier: tier,
        probesRun,
        findingsCount: findings.length,
        modelCallMade,
        timestamp: new Date().toISOString()
    });
    return result;
}
|
|
107
|
+
// ─── Policy helpers ───────────────────────────────────────────────────────────
|
|
108
|
+
/**
 * Acceptance rule for a Red findings record.
 *
 * The patch is acceptable when every entry in `findings.findings` has a
 * severity other than "block"; "warn" entries never reject on their own.
 *
 * @param findings Red findings record exposing a `findings` array.
 * @returns `false` as soon as any block-severity entry exists, else `true`.
 */
export function shouldAcceptPatch(findings) {
    const entries = findings.findings;
    return entries.every((entry) => entry.severity !== "block");
}
|
|
115
|
+
/**
 * Assembles a Red findings record from partial input (handy for tests and
 * the policy engine).
 *
 * Defaults: `probesRun` falls back to `PROBE_COUNTS[input.riskTier]`,
 * `modelCallMade` to `false`, `budgetUsedUsd` to `0`. `findingsCount` is
 * always derived from `input.findings.length`, and `modelUsed` appears on
 * the result only when the input defines it.
 *
 * @param input Object with `riskTier` and `findings`, plus optional
 *              `probesRun`, `modelCallMade`, `modelUsed`, `budgetUsedUsd`.
 * @returns A new findings record.
 */
export function buildRedFindings(input) {
    const { riskTier, findings, modelUsed } = input;
    const probesRun = input.probesRun ?? PROBE_COUNTS[riskTier];
    const findingsCount = findings.length;
    const modelCallMade = input.modelCallMade ?? false;
    // Omit the key entirely (rather than set it to undefined) when no model is named.
    const modelField = modelUsed === undefined ? {} : { modelUsed };
    const budgetUsedUsd = input.budgetUsedUsd ?? 0;
    return {
        riskTier,
        probesRun,
        findingsCount,
        findings,
        modelCallMade,
        ...modelField,
        budgetUsedUsd
    };
}
|
|
129
|
+
// ─── Internal helpers ─────────────────────────────────────────────────────────
|
|
130
|
+
/**
 * Builds the newline-separated prompt sent to the Red phase model.
 *
 * The prompt lists the trap IDs already raised by the programmatic probes,
 * the changed file names, and at most the first 2000 characters of the diff.
 *
 * @param patch            Object exposing `diff` and `changedFiles`.
 * @param existingFindings Findings already detected; only `trapId` is used.
 * @returns The prompt text.
 */
function buildRedPhasePrompt(patch, existingFindings) {
    const knownTrapIds = JSON.stringify(existingFindings.map((finding) => finding.trapId));
    const fileList = patch.changedFiles.join(", ");
    // Anything beyond the first 2000 characters of the diff is not sent.
    const diffExcerpt = patch.diff.slice(0, 2000);
    return "You are a security-focused code reviewer running adversarial analysis." + "\n" +
        "Analyze this patch for adversarial patterns not caught by programmatic probes." + "\n" +
        "Return JSON array of findings with: trapId, severity (warn|block), description." + "\n" +
        `Already detected: ${knownTrapIds}` + "\n" +
        `Changed files: ${fileList}` + "\n" +
        "Diff (truncated to 2000 chars):" + "\n" +
        diffExcerpt;
}
|
|
141
|
+
//# sourceMappingURL=red-phase.js.map
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
export type RiskTier = "baseline" | "high_risk" | "release_critical";
|
|
2
|
+
/**
 * Red phase spending and model-call policy for one risk tier, as returned by
 * `resolveRedBudgetPolicy`.
 */
export interface RedBudgetPolicy {
    /** The tier this policy was resolved for. */
    riskTier: RiskTier;
    /** The Blue phase budget (USD) the cap is derived from. */
    blueBudgetUsd: number;
    /** Cap on Red phase spend: 30% / 100% / 150% of Blue */
    redBudgetCapUsd: number;
    /** Only release_critical permits a Haiku model call */
    modelCallAllowed: boolean;
}
|
|
10
|
+
/**
|
|
11
|
+
* Returns the Red phase budget policy for a given risk tier and Blue budget.
|
|
12
|
+
*/
|
|
13
|
+
export declare function resolveRedBudgetPolicy(tier: RiskTier, blueBudgetUsd: number): RedBudgetPolicy;
|
|
14
|
+
/**
|
|
15
|
+
* Probe counts per tier.
|
|
16
|
+
* baseline = standard 6-probe sweep
|
|
17
|
+
* high_risk = paranoid 12-probe sweep
|
|
18
|
+
* release_critical = paranoid 12-probe sweep + model
|
|
19
|
+
*/
|
|
20
|
+
export declare const PROBE_COUNTS: Record<RiskTier, number>;
|
|
21
|
+
/** The only model ever permitted in the Red phase. */
|
|
22
|
+
export declare const RED_PHASE_MODEL: "claude-haiku-4-5-20251001";
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
// ─── Risk Tier Definitions ────────────────────────────────────────────────────
|
|
2
|
+
// Governs how aggressively Red phase probes a patch and whether a model call
|
|
3
|
+
// is permitted. Budget caps are expressed as fractions of the Blue phase budget.
|
|
4
|
+
// Red budget cap per tier, expressed as a fraction of the Blue phase budget.
const BUDGET_MULTIPLIERS = { baseline: 0.30, high_risk: 1.00, release_critical: 1.50 };
/**
 * Resolves the Red phase policy for a tier.
 *
 * The spend cap is the Blue budget scaled by the tier's multiplier, and a
 * model call is permitted only for the "release_critical" tier.
 *
 * @param tier          "baseline", "high_risk" or "release_critical".
 * @param blueBudgetUsd Blue phase budget in USD.
 * @returns `{ riskTier, blueBudgetUsd, redBudgetCapUsd, modelCallAllowed }`.
 */
export function resolveRedBudgetPolicy(tier, blueBudgetUsd) {
    const capFraction = BUDGET_MULTIPLIERS[tier];
    const redBudgetCapUsd = blueBudgetUsd * capFraction;
    const modelCallAllowed = tier === "release_critical";
    return { riskTier: tier, blueBudgetUsd, redBudgetCapUsd, modelCallAllowed };
}
|
|
20
|
+
/**
 * Probe counts per tier.
 * baseline = standard 6-probe sweep
 * high_risk = paranoid 12-probe sweep
 * release_critical = paranoid 12-probe sweep + model
 *
 * These are fixed, declared numbers keyed by risk tier; nothing in this
 * module counts probes at run time.
 * NOTE(review): runProgrammaticProbes in red-phase.js appears to implement
 * 4 always-on checks plus 2 paranoid-only checks — confirm that 6 / 12 is
 * the figure intended for reporting.
 */
export const PROBE_COUNTS = {
    baseline: 6,
    high_risk: 12,
    release_critical: 12
};
|
|
31
|
+
/** The only model ever permitted in the Red phase. */
|
|
32
|
+
export const RED_PHASE_MODEL = "claude-haiku-4-5-20251001";
|
|
33
|
+
//# sourceMappingURL=risk-tiers.js.map
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* replay.ts — SLICE-10
|
|
3
|
+
*
|
|
4
|
+
* Reproducibility seal: re-runs the decision pipeline over stored attempt
|
|
5
|
+
* artifacts (no model calls) and confirms every gate decision still matches.
|
|
6
|
+
*
|
|
7
|
+
* Any deterministic mismatch is a P0 defect in the decision pipeline.
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Stored record of one attempt: the inputs that were evaluated plus the gate
 * decisions recorded at the time. This is the input to `replayAttempt`.
 */
export interface StoredAttemptArtifact {
    /** Identifier of the attempt; `replayLoop` uses it to look up the verifier result. */
    attemptId: string;
    /** Identifier of the loop the attempt belongs to. */
    loopId: string;
    /** The diff that was evaluated */
    diff: string;
    /** The objective at time of evaluation */
    objective: string;
    /** Decisions recorded during the original run */
    decisions: {
        /** Leash gate outcome; `matchedPattern` names the pattern that blocked, if any. */
        leash: {
            blocked: boolean;
            matchedPattern?: string;
        };
        /** Grounding gate outcome. NOTE(review): replay.js does not appear to re-check this gate — confirm. */
        grounding: {
            contradictions: number;
        };
        /** Proof gate outcome: letter grade plus pass/fail flag. */
        proof: {
            grade: "A" | "B" | "C";
            passed: boolean;
        };
        /** Overall accept/reject decision of the original run. */
        finalVerdict: "ACCEPTED" | "REJECTED";
    };
    /** ISO timestamp of original run */
    recordedAt: string;
}
|
|
34
|
+
/**
 * Per-gate decisions for one attempt. The shape is the same as
 * `StoredAttemptArtifact["decisions"]`.
 */
export interface ReplayDecisions {
    /** Leash gate outcome; `matchedPattern` is set only when a pattern matched. */
    leash: {
        blocked: boolean;
        matchedPattern?: string;
    };
    /** Grounding gate outcome. */
    grounding: {
        contradictions: number;
    };
    /** Proof gate outcome: letter grade plus pass/fail flag. */
    proof: {
        grade: "A" | "B" | "C";
        passed: boolean;
    };
    /** Overall accept/reject decision. */
    finalVerdict: "ACCEPTED" | "REJECTED";
}
|
|
48
|
+
/**
 * One disagreement between a stored decision and its replayed counterpart.
 */
export interface ReplayMismatch {
    /** Which gate disagreed. */
    gate: "leash" | "grounding" | "proof" | "finalVerdict";
    /** The decision stored with the artifact. */
    original: unknown;
    /** The decision produced by the replay. */
    replayed: unknown;
    /** Defect severity. NOTE(review): replayAttempt in replay.js appears to emit only "P0" — confirm where "P1" is used. */
    severity: "P0" | "P1";
}
|
|
54
|
+
/**
 * Result of replaying a single stored attempt.
 */
export interface ReplayReport {
    /** Loop identifier copied from the artifact. */
    loopId: string;
    /** Attempt identifier copied from the artifact. */
    attemptId: string;
    /** True when the replay produced no mismatches. */
    match: boolean;
    /** Every gate whose replayed decision differed from the stored one. */
    mismatches: ReplayMismatch[];
    /** ISO timestamp taken when the replay ran. */
    replayedAt: string;
    /** Hex SHA-256 over the colon-joined string `loopId:attemptId:replayedAt:match` */
    replayHash: string;
}
|
|
63
|
+
/**
|
|
64
|
+
* Re-runs the leash check over a stored diff.
|
|
65
|
+
* Uses the same BLOCKED_PATTERNS from leash.ts — but as a lightweight
|
|
66
|
+
* re-implementation to avoid circular deps in the replay layer.
|
|
67
|
+
*/
|
|
68
|
+
export declare function replayLeashCheck(diff: string): {
|
|
69
|
+
blocked: boolean;
|
|
70
|
+
matchedPattern?: string;
|
|
71
|
+
};
|
|
72
|
+
/**
|
|
73
|
+
* Re-evaluates the proof grade from stored inputs.
|
|
74
|
+
* Grade is deterministic from the verifier result string.
|
|
75
|
+
*/
|
|
76
|
+
export declare function replayProofGrade(verifierResult: string, objective: string): {
|
|
77
|
+
grade: "A" | "B" | "C";
|
|
78
|
+
passed: boolean;
|
|
79
|
+
};
|
|
80
|
+
export declare function replayAttempt(artifact: StoredAttemptArtifact, verifierResult?: string): ReplayReport;
|
|
81
|
+
export declare function replayLoop(artifacts: StoredAttemptArtifact[], verifierResults?: Map<string, string>): {
|
|
82
|
+
reports: ReplayReport[];
|
|
83
|
+
allMatch: boolean;
|
|
84
|
+
p0Count: number;
|
|
85
|
+
};
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* replay.ts — SLICE-10
|
|
3
|
+
*
|
|
4
|
+
* Reproducibility seal: re-runs the decision pipeline over stored attempt
|
|
5
|
+
* artifacts (no model calls) and confirms every gate decision still matches.
|
|
6
|
+
*
|
|
7
|
+
* Any deterministic mismatch is a P0 defect in the decision pipeline.
|
|
8
|
+
*/
|
|
9
|
+
import { createHash } from "node:crypto";
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Pipeline re-runners (deterministic, no model calls)
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
/**
|
|
14
|
+
* Re-runs the leash check over a stored diff.
|
|
15
|
+
* Uses the same BLOCKED_PATTERNS from leash.ts — but as a lightweight
|
|
16
|
+
* re-implementation to avoid circular deps in the replay layer.
|
|
17
|
+
*/
|
|
18
|
+
export function replayLeashCheck(diff) {
|
|
19
|
+
const DANGEROUS_PATTERNS = [
|
|
20
|
+
{ name: "SHELL_RM_RF", re: /rm\s+-rf?\b/ },
|
|
21
|
+
{ name: "SHELL_PIPE_EVAL", re: /curl.*\|\s*(?:ba)?sh|wget.*\|\s*(?:ba)?sh/ },
|
|
22
|
+
{ name: "GIT_FORCE_PUSH", re: /git\s+push\s+.*--force/ },
|
|
23
|
+
{ name: "GIT_RESET_HARD", re: /git\s+reset\s+--hard/ },
|
|
24
|
+
{ name: "FORK_BOMB", re: /:\s*\(\s*\)\s*\{.*:.*\|.*:.*\}/ },
|
|
25
|
+
{ name: "SHELL_CHMOD_777", re: /chmod\s+(?:a\+rwx|777)/ },
|
|
26
|
+
{ name: "NODE_EXEC_EVAL", re: /eval\s*\(/ },
|
|
27
|
+
{ name: "SUDO_ESCALATION", re: /sudo\s+/ },
|
|
28
|
+
];
|
|
29
|
+
for (const { name, re } of DANGEROUS_PATTERNS) {
|
|
30
|
+
if (re.test(diff))
|
|
31
|
+
return { blocked: true, matchedPattern: name };
|
|
32
|
+
}
|
|
33
|
+
return { blocked: false };
|
|
34
|
+
}
|
|
35
|
+
/**
 * Recomputes the proof grade from a stored verifier result string.
 *
 * The lower-cased text is scanned for pass markers ("pass", "ok", "✓") and
 * fail markers ("fail", "error", "✗"):
 *   - pass marker and no fail marker -> grade "A", passed
 *   - both kinds of marker           -> grade "B", not passed
 *   - no pass marker                 -> grade "C", not passed
 *
 * NOTE(review): markers are plain substring tests, so "broken" counts as a
 * pass marker (it contains "ok") and "0 failed" counts as a fail marker.
 * Kept exactly as-is so replay stays deterministic — confirm this mirrors
 * the original grader.
 *
 * @param verifierResult Raw verifier output.
 * @param objective      Accepted for interface compatibility; not read here.
 * @returns `{ grade, passed }`.
 */
export function replayProofGrade(verifierResult, objective) {
    const text = verifierResult.toLowerCase();
    const sawPass = ["pass", "ok", "✓"].some((marker) => text.includes(marker));
    const sawFail = ["fail", "error", "✗"].some((marker) => text.includes(marker));
    if (!sawPass) {
        return { grade: "C", passed: false };
    }
    return sawFail ? { grade: "B", passed: false } : { grade: "A", passed: true };
}
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// Core replay function
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
/**
 * Replays the deterministic gates for one stored attempt and reports every
 * decision that no longer matches what was recorded.
 *
 * Gates replayed:
 *   - leash: `replayLeashCheck(artifact.diff)`, compared on `blocked`.
 *   - proof: `replayProofGrade(...)`, compared on `grade`; only run when
 *     `verifierResult` is supplied.
 *   - finalVerdict: ACCEPTED when the replayed leash did not block and the
 *     replayed proof passed. With no `verifierResult` the proof gate is
 *     skipped and only the leash decides.
 *
 * The verdict is derived from the replayed proof outcome, not from the
 * stored `decisions.proof.passed`; reading the stored flag would compare
 * the record against itself and hide a changed proof result.
 *
 * @param artifact       Stored attempt (diff, objective, recorded decisions).
 * @param verifierResult Optional raw verifier output for the proof gate.
 * @returns Report with `match`, the list of mismatches (all "P0"), the
 *          replay timestamp, and a SHA-256 hex digest over
 *          `loopId:attemptId:replayedAt:match`.
 */
export function replayAttempt(artifact, verifierResult) {
    const replayedAt = new Date().toISOString();
    const mismatches = [];
    // Re-run leash
    const replayLeash = replayLeashCheck(artifact.diff);
    if (replayLeash.blocked !== artifact.decisions.leash.blocked) {
        mismatches.push({
            gate: "leash",
            original: artifact.decisions.leash,
            replayed: replayLeash,
            severity: "P0"
        });
    }
    // Re-run proof grade (if verifier result provided)
    const replayProof = verifierResult !== undefined
        ? replayProofGrade(verifierResult, artifact.objective)
        : undefined;
    if (replayProof !== undefined && replayProof.grade !== artifact.decisions.proof.grade) {
        mismatches.push({
            gate: "proof",
            original: artifact.decisions.proof,
            replayed: replayProof,
            severity: "P0"
        });
    }
    // Final verdict: derive from replayed decisions only
    const proofGateOpen = replayProof === undefined || replayProof.passed;
    const replayedVerdict = (!replayLeash.blocked && proofGateOpen) ? "ACCEPTED" : "REJECTED";
    if (replayedVerdict !== artifact.decisions.finalVerdict) {
        mismatches.push({
            gate: "finalVerdict",
            original: artifact.decisions.finalVerdict,
            replayed: replayedVerdict,
            severity: "P0"
        });
    }
    const match = mismatches.length === 0;
    // The digest covers identity, timestamp and outcome; it does not cover the mismatch details.
    const replayHash = createHash("sha256")
        .update(`${artifact.loopId}:${artifact.attemptId}:${replayedAt}:${match}`)
        .digest("hex");
    return {
        loopId: artifact.loopId,
        attemptId: artifact.attemptId,
        match,
        mismatches,
        replayedAt,
        replayHash
    };
}
|
|
100
|
+
// ---------------------------------------------------------------------------
|
|
101
|
+
// Replay runner: multiple attempts
|
|
102
|
+
// ---------------------------------------------------------------------------
|
|
103
|
+
/**
 * Replays every attempt artifact of a loop and aggregates the outcome.
 *
 * @param artifacts - Recorded attempts to replay, in order.
 * @param verifierResults - Optional lookup (anything with `get(attemptId)`)
 *   providing the verifier result for an attempt; a missing lookup or entry
 *   means the proof gate is skipped for that attempt.
 * @returns `reports` (one per artifact), `allMatch` (true when every report
 *   matched) and `p0Count` (total number of "P0" mismatches across reports).
 */
export function replayLoop(artifacts, verifierResults) {
    const reports = artifacts.map((artifact) => {
        const verifierResult = verifierResults?.get(artifact.attemptId);
        return replayAttempt(artifact, verifierResult);
    });

    let allMatch = true;
    let p0Count = 0;
    for (const report of reports) {
        if (!report.match) {
            allMatch = false;
        }
        for (const mismatch of report.mismatches) {
            if (mismatch.severity === "P0") {
                p0Count += 1;
            }
        }
    }

    return { reports, allMatch, p0Count };
}
|
|
109
|
+
//# sourceMappingURL=replay.js.map
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import type { LoopTask, FailureClass } from "../../contracts/index.js";
|
|
2
|
+
import type { CostGovernorState } from "../policy.js";
|
|
3
|
+
import { type ModelTrustProfile } from "./trust-calibration.js";
|
|
4
|
+
/**
 * The only part of an adapter the router looks at: its metadata.
 * Declared structurally so that any MartinAdapter from @martin/core can be
 * passed in without the router depending on the full adapter type.
 */
export interface RouterAdapterRef {
    metadata: {
        /** Model identifier; reported in routing rationales and matched against trust profiles. */
        model: string;
        /** Optional provider label. */
        provider?: string;
    };
}
|
|
14
|
+
/** One selectable route: an adapter plus the cost/latency/trust facts the router weighs. */
export interface RouteConfig {
    /** Adapter that will run the attempt if this route is chosen. */
    adapter: RouterAdapterRef;
    /** Base cost in USD per 1k units; the router compares costs and budget-filters on this value. */
    baseCostUsdPer1kTcs: number;
    /** Optional latency bound in milliseconds; used to pick among medium-tier routes. */
    maxLatencyMs?: number;
    /** Trust tier assigned to this route. */
    trustTier: "high" | "medium" | "low";
}
|
|
20
|
+
/** Everything the router is given when choosing the route for the next attempt. */
export interface RouteEvaluationContext {
    /** The task being worked on. */
    task: LoopTask;
    /** Current budget state (pressure level and remaining budget). */
    costState: CostGovernorState;
    /** Failure class of the previous attempt, if there was one. */
    currentFailure?: FailureClass;
    /** Task complexity estimate used to choose between escalation and economical routes. */
    complexityScore: number;
    /**
     * Estimated blast radius of the planned action on a 0–100 scale.
     * Exposed in run summary and OTel span.
     * When > 70, the router forces a high-trust route if one is available
     * among the candidate routes.
     */
    blastRadius?: number;
    /**
     * Trust profiles derived from historical run data by the Trust Calibration Engine.
     * When supplied, the router drops routes whose model has been deprioritized and
     * may auto-select a cheaper model with a proven track record.
     */
    trustProfiles?: ModelTrustProfile[];
}
|
|
38
|
+
/** Result of a routing evaluation: the chosen adapter and why it was chosen. */
export interface RouteDecision {
    /** Adapter of the selected route. */
    adapter: RouterAdapterRef;
    /** Human-readable explanation of which heuristic selected the route. */
    rationale: string;
    /** Estimated cost per 1k tokens for the selected route — exposed in run summary and OTel span. */
    selectedCostPer1kTcs: number;
    /** Trust tier of the selected route. */
    selectedTrustTier: "high" | "medium" | "low";
}
|
|
46
|
+
/** Router that picks one of a fixed set of configured routes for each attempt. */
export declare class MartinRouter {
    private readonly availableRoutes;
    /**
     * @param availableRoutes - Routes the router may choose from.
     */
    constructor(availableRoutes: RouteConfig[]);
    /**
     * Picks the provider/model adapter to use for the next attempt, trading
     * the need for capability (trust tier, complexity) against the budget.
     *
     * @param context - Task, budget state and failure/complexity signals.
     * @returns The selected adapter with the rationale, cost and trust tier.
     */
    evaluateRoute(context: RouteEvaluationContext): RouteDecision;
}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import { getTracer } from "../otel/tracer.js";
|
|
2
|
+
import { shouldDeprioritize } from "./trust-calibration.js";
|
|
3
|
+
/**
 * Selects which configured route (adapter + cost + trust tier) serves the next
 * attempt. Heuristics run in a fixed priority order and the first one that
 * produces a route wins.
 */
export class MartinRouter {
    availableRoutes;
    constructor(availableRoutes) {
        this.availableRoutes = availableRoutes;
    }
    /**
     * Picks the provider/model adapter for the next attempt, trading the need
     * for capability (trust tier, complexity) against budget constraints.
     *
     * Priority order:
     *   1. blast radius > 70 → cheapest high-trust route (if any exists)
     *   2. trust calibration → cheapest proven route, if cheaper than the first candidate
     *   3. "hard_limit" budget pressure → cheapest remaining route
     *   4. verification/grounding failure or complexity > 0.7 → cheapest high-trust route
     *   5. syntax/grounding failure or complexity < 0.3 → cheapest non-high-trust route
     *   6. medium-tier route with the lowest maxLatencyMs (5000 assumed when unset)
     *   7. first remaining route
     *
     * The decision is recorded on a "martin.router_decision" span.
     *
     * @param context - Routing inputs (cost state, failure class, complexity,
     *   optional blast radius and trust profiles).
     * @returns The selected adapter, a rationale string, and the route's cost and trust tier.
     * @throws Error when the router was constructed with no routes.
     */
    evaluateRoute(context) {
        const configured = this.availableRoutes;
        if (configured.length === 0) {
            throw new Error("MartinRouter has no available routes configured.");
        }
        const { costState, currentFailure, complexityScore, blastRadius, trustProfiles } = context;

        // Optional span attributes are only attached when the signal is present.
        const failureAttr = currentFailure ? { "router.failure_class": currentFailure } : {};
        const blastAttr = blastRadius === undefined ? {} : { "router.blast_radius": blastRadius };
        const tracer = getTracer();
        const span = tracer.startSpan("martin.router_decision", {
            "router.pressure": costState.pressure,
            "router.remaining_budget_usd": costState.remainingBudgetUsd,
            "router.complexity_score": complexityScore,
            ...failureAttr,
            ...blastAttr
        });

        // Records the choice on the span, closes it, and builds the decision object.
        const decide = (chosen, rationale) => {
            Object.assign(span.attributes, {
                "router.selected_model": chosen.adapter.metadata.model,
                "router.selected_trust_tier": chosen.trustTier,
                "router.selected_cost_per_1k_tcs": chosen.baseCostUsdPer1kTcs,
                "router.rationale": rationale
            });
            tracer.endSpan(span, "OK");
            return {
                adapter: chosen.adapter,
                rationale,
                selectedCostPer1kTcs: chosen.baseCostUsdPer1kTcs,
                selectedTrustTier: chosen.trustTier
            };
        };
        // First route with the lowest base cost (ties keep the earlier route).
        const cheapestOf = (candidates) => candidates.reduce((best, candidate) => candidate.baseCostUsdPer1kTcs < best.baseCostUsdPer1kTcs ? candidate : best);
        const highTrustOf = (candidates) => candidates.filter((candidate) => candidate.trustTier === "high");

        // Budget filter: drop routes whose estimated cost (base cost x 15) exceeds the
        // remaining budget; if nothing fits, keep the full list rather than fail.
        const affordable = configured.filter((route) => route.baseCostUsdPer1kTcs * 15 <= costState.remainingBudgetUsd);
        const routes = affordable.length > 0 ? affordable : configured;

        // 1. High blast radius: force a high-trust route to reduce regression risk.
        if (blastRadius !== undefined && blastRadius > 70) {
            const highTrust = highTrustOf(routes);
            if (highTrust.length > 0) {
                const safest = cheapestOf(highTrust);
                return decide(safest, `Forced high-trust route ${safest.adapter.metadata.model} due to blast radius ${blastRadius}/100.`);
            }
        }

        // 2. Trust calibration: drop models with a poor record, and prefer a cheaper
        //    model whose history shows high efficiency.
        let calibratedRoutes = routes;
        if (trustProfiles && trustProfiles.length > 0) {
            const profileFor = (route) => trustProfiles.find((profile) => profileMatchesRoute(profile, route));
            const evidenceBacked = routes.filter((route) => {
                const profile = profileFor(route);
                return !(profile !== undefined && shouldDeprioritize(profile));
            });
            // Only narrow the candidates if at least one route survives.
            if (evidenceBacked.length > 0) {
                calibratedRoutes = evidenceBacked;
            }
            const proven = [];
            for (const route of calibratedRoutes) {
                const profile = profileFor(route);
                if (profile !== undefined && profile.efficiencyScore > 0.85 && profile.runsObserved >= 3) {
                    proven.push({ route, profile });
                }
            }
            proven.sort((a, b) => a.route.baseCostUsdPer1kTcs - b.route.baseCostUsdPer1kTcs);
            const [trustedRoute] = proven;
            // The baseline for "cheaper" is the first budget-filtered route.
            const defaultRoute = routes[0];
            if (trustedRoute &&
                defaultRoute &&
                trustedRoute.route.baseCostUsdPer1kTcs < defaultRoute.baseCostUsdPer1kTcs) {
                return decide(trustedRoute.route, `Auto-selected ${trustedRoute.route.adapter.metadata.model} based on ${String(trustedRoute.profile.runsObserved)} historical runs (efficiency: ${String(Math.round(trustedRoute.profile.efficiencyScore * 100))}%, completion: ${String(Math.round(trustedRoute.profile.completionRate * 100))}%).`);
            }
        }
        const effectiveRoutes = calibratedRoutes;

        // 3. Budget at the hard ceiling: cheapest remaining route.
        if (costState.pressure === "hard_limit") {
            const cheapest = cheapestOf(effectiveRoutes);
            return decide(cheapest, `Selected ${cheapest.adapter.metadata.model} due to severe budget pressure (hard_limit).`);
        }

        // 4. Reasoning/grounding failures or a complex task: escalate to high trust.
        const needsEscalation = currentFailure === "verification_failure" ||
            currentFailure === "repo_grounding_failure" ||
            complexityScore > 0.7;
        if (needsEscalation) {
            const highTrust = highTrustOf(effectiveRoutes);
            if (highTrust.length > 0) {
                const best = cheapestOf(highTrust);
                return decide(best, `Escalated to ${best.adapter.metadata.model} (high-trust) due to failure profile '${currentFailure ?? "complex_task"}'.`);
            }
        }

        // 5. Simple errors or a low-complexity task: prefer an economical route.
        // NOTE(review): "repo_grounding_failure" is also listed in step 4, so it only
        // reaches this branch when no high-trust route was available — confirm intended.
        const prefersEconomy = currentFailure === "syntax_error" ||
            currentFailure === "repo_grounding_failure" ||
            complexityScore < 0.3;
        if (prefersEconomy) {
            const economical = effectiveRoutes.filter((route) => route.trustTier !== "high");
            if (economical.length > 0) {
                const best = cheapestOf(economical);
                return decide(best, `Selected ${best.adapter.metadata.model} to preserve budget on low-complexity task recovery.`);
            }
        }

        // 6. No strict heuristic applies: balanced medium-tier route with the lowest latency bound.
        const mediumTrust = effectiveRoutes.filter((route) => route.trustTier === "medium");
        if (mediumTrust.length > 0) {
            const latencyOf = (route) => Math.abs(route.maxLatencyMs ?? 5000);
            const best = mediumTrust.reduce((quickest, candidate) => latencyOf(candidate) < latencyOf(quickest) ? candidate : quickest);
            return decide(best, `Selected balanced route ${best.adapter.metadata.model} for nominal execution.`);
        }

        // 7. Ultimate fallback — effectiveRoutes is non-empty because the route list
        //    was checked at the top and every narrowing step keeps at least one route.
        const bestRoute = effectiveRoutes[0];
        return decide(bestRoute, `Fallback route ${bestRoute.adapter.metadata.model} selected as default.`);
    }
}
|
|
121
|
+
function profileMatchesRoute(profile, route) {
|
|
122
|
+
const model = normalizeModelKey(route.adapter.metadata.model);
|
|
123
|
+
const profileModel = normalizeModelKey(profile.model);
|
|
124
|
+
return (model === profileModel ||
|
|
125
|
+
model.includes(profileModel) ||
|
|
126
|
+
profileModel.includes(model));
|
|
127
|
+
}
|
|
128
|
+
function normalizeModelKey(value) {
|
|
129
|
+
return value.toLowerCase().replace(/[^a-z0-9]+/g, "-");
|
|
130
|
+
}
|
|
131
|
+
//# sourceMappingURL=engine.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from "./engine.js";
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Trust Calibration Engine — the self-improvement loop.
|
|
3
|
+
*
|
|
4
|
+
* Reads historical run records from ~/.martin/runs/ and computes a reliability
|
|
5
|
+
* profile for each model that has been used. The router uses these profiles to
|
|
6
|
+
* automatically downgrade to cheaper models when evidence shows they perform
|
|
7
|
+
* as well as more expensive ones, and to deprioritize models with poor track records.
|
|
8
|
+
*
|
|
9
|
+
* This closes the feedback loop that was missing: every completed run writes
|
|
10
|
+
* evidence to disk; this module reads it back into routing decisions.
|
|
11
|
+
*/
|
|
12
|
+
/** Reliability statistics for a single model, aggregated from historical runs. */
export interface ModelTrustProfile {
    /** Model identifier as recorded in attempt records (e.g. "claude-sonnet-4-6"). */
    model: string;
    /** Number of runs in which this model served at least one attempt. */
    runsObserved: number;
    /** Share of the observed runs that completed successfully, in the range 0–1. */
    completionRate: number;
    /** Mean cost in USD of one iteration (attempt). */
    avgCostPerIteration: number;
    /** Mean ratio of iterations used to budget.maxIterations (lower = more efficient). */
    avgIterationEfficiency: number;
    /**
     * Composite score in 0–1, defined as completionRate * (1 - avgIterationEfficiency).
     * A high score means the model both completes reliably and needs fewer
     * iterations than the budget allows.
     */
    efficiencyScore: number;
    /** ISO timestamp of the most recent run that contributed to this profile. */
    lastUpdated: string;
}
|
|
31
|
+
/** Output of a trust calibration pass over the historical run records. */
export interface TrustCalibrationResult {
    /** One reliability profile per model, ordered by efficiencyScore descending. */
    profiles: ModelTrustProfile[];
    /**
     * Model with the highest efficiencyScore among those meeting the minRuns
     * threshold, or null when there is not yet enough data.
     */
    recommendedModel: string | null;
    /** Total number of runs analyzed to produce this result. */
    calibrationBasis: number;
}
|
|
42
|
+
/**
 * Builds a trust profile for every model found in the historical loop records.
 *
 * @param runsDir - Directory to read run records from; overrides the default
 *                  ~/.martin/runs path (useful for testing).
 * @param minRuns - Minimum number of observations before a profile is treated
 *                  as reliable enough to influence routing. Default: 3.
 * @param efficiencyThreshold - Minimum efficiencyScore a model needs to be
 *                  eligible for auto-recommendation. Default: 0.75.
 * @returns A promise resolving to the per-model profiles, the recommended
 *          model (or null) and the number of runs analyzed.
 */
export declare function calibrateTrust(runsDir?: string, minRuns?: number, efficiencyThreshold?: number): Promise<TrustCalibrationResult>;
|
|
52
|
+
/**
 * Tells whether a model should be deprioritized given its trust profile.
 * That is the case once the profile has enough observations to be confident
 * the model performs poorly (low completion rate).
 *
 * @param profile - Trust profile of the model under consideration.
 * @param minRuns - Optional override for the number of observations required.
 * @param minCompletionRate - Optional override for the completion rate below
 *                            which the model counts as performing poorly.
 * @returns true when the model should be deprioritized.
 */
export declare function shouldDeprioritize(profile: ModelTrustProfile, minRuns?: number, minCompletionRate?: number): boolean;
|