martin-loop 0.1.4 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CODE_OF_CONDUCT.md +32 -0
- package/README.md +172 -227
- package/demo/seeded-workspace/README.md +35 -0
- package/demo/seeded-workspace/TASKS.md +29 -0
- package/demo/seeded-workspace/martin.config.yaml +11 -0
- package/demo/seeded-workspace/package.json +8 -0
- package/demo/seeded-workspace/src/invoice-summary.js +11 -0
- package/demo/seeded-workspace/test/invoice-summary.test.js +20 -0
- package/dist/bin/martin-loop.js +0 -0
- package/dist/vendor/adapters/claude-cli.d.ts +19 -4
- package/dist/vendor/adapters/claude-cli.js +55 -24
- package/dist/vendor/adapters/cli-bridge.d.ts +1 -0
- package/dist/vendor/adapters/cli-bridge.js +154 -28
- package/dist/vendor/adapters/counter.d.ts +1 -0
- package/dist/vendor/adapters/counter.js +4 -0
- package/dist/vendor/adapters/git-baseline.d.ts +50 -0
- package/dist/vendor/adapters/git-baseline.js +233 -0
- package/dist/vendor/adapters/index.d.ts +1 -0
- package/dist/vendor/adapters/index.js +1 -0
- package/dist/vendor/adapters/openrouter-adapter.d.ts +15 -0
- package/dist/vendor/adapters/openrouter-adapter.js +302 -0
- package/dist/vendor/adapters/usage.d.ts +48 -0
- package/dist/vendor/adapters/usage.js +66 -0
- package/dist/vendor/adapters/verifier-only.d.ts +7 -0
- package/dist/vendor/adapters/verifier-only.js +57 -0
- package/dist/vendor/cli/bin/exit.d.ts +12 -0
- package/dist/vendor/cli/bin/exit.js +28 -0
- package/dist/vendor/cli/commands/analyze.d.ts +5 -0
- package/dist/vendor/cli/commands/analyze.js +58 -0
- package/dist/vendor/cli/commands/audit-log-verify.d.ts +34 -0
- package/dist/vendor/cli/commands/audit-log-verify.js +99 -0
- package/dist/vendor/cli/commands/audit.d.ts +8 -0
- package/dist/vendor/cli/commands/audit.js +199 -0
- package/dist/vendor/cli/commands/corpus.d.ts +5 -0
- package/dist/vendor/cli/commands/corpus.js +60 -0
- package/dist/vendor/cli/commands/doctor.d.ts +8 -0
- package/dist/vendor/cli/commands/doctor.js +219 -0
- package/dist/vendor/cli/commands/explain.d.ts +17 -0
- package/dist/vendor/cli/commands/explain.js +176 -0
- package/dist/vendor/cli/commands/export.d.ts +5 -0
- package/dist/vendor/cli/commands/export.js +60 -0
- package/dist/vendor/cli/commands/governance.d.ts +8 -0
- package/dist/vendor/cli/commands/governance.js +95 -0
- package/dist/vendor/cli/commands/improve.d.ts +18 -0
- package/dist/vendor/cli/commands/improve.js +396 -0
- package/dist/vendor/cli/commands/init.d.ts +8 -0
- package/dist/vendor/cli/commands/init.js +281 -0
- package/dist/vendor/cli/commands/migration.d.ts +8 -0
- package/dist/vendor/cli/commands/migration.js +67 -0
- package/dist/vendor/cli/commands/prior.d.ts +23 -0
- package/dist/vendor/cli/commands/prior.js +145 -0
- package/dist/vendor/cli/commands/resume.d.ts +21 -0
- package/dist/vendor/cli/commands/resume.js +73 -0
- package/dist/vendor/cli/commands/verify.d.ts +6 -0
- package/dist/vendor/cli/commands/verify.js +43 -0
- package/dist/vendor/cli/index.d.ts +6 -1
- package/dist/vendor/cli/index.js +124 -7
- package/dist/vendor/cli/research/public-corpus.d.ts +43 -0
- package/dist/vendor/cli/research/public-corpus.js +151 -0
- package/dist/vendor/cli/ui/error-card.d.ts +38 -0
- package/dist/vendor/cli/ui/error-card.js +103 -0
- package/dist/vendor/cli/ui/mission-brief.d.ts +41 -0
- package/dist/vendor/cli/ui/mission-brief.js +173 -0
- package/dist/vendor/cli/ui/summary-card.d.ts +34 -0
- package/dist/vendor/cli/ui/summary-card.js +102 -0
- package/dist/vendor/contracts/audit.d.ts +46 -0
- package/dist/vendor/contracts/audit.js +360 -0
- package/dist/vendor/contracts/index.d.ts +3 -1
- package/dist/vendor/contracts/post-phase15.d.ts +240 -0
- package/dist/vendor/contracts/post-phase15.js +166 -0
- package/dist/vendor/core/agent/mandates.d.ts +46 -0
- package/dist/vendor/core/agent/mandates.js +178 -0
- package/dist/vendor/core/agent/receipts.d.ts +38 -0
- package/dist/vendor/core/agent/receipts.js +131 -0
- package/dist/vendor/core/agent/signing.d.ts +17 -0
- package/dist/vendor/core/agent/signing.js +91 -0
- package/dist/vendor/core/attestation/sign.d.ts +25 -0
- package/dist/vendor/core/attestation/sign.js +216 -0
- package/dist/vendor/core/autonomy/autonomous-promotion.d.ts +120 -0
- package/dist/vendor/core/autonomy/autonomous-promotion.js +346 -0
- package/dist/vendor/core/autonomy/envelope-v2.d.ts +29 -0
- package/dist/vendor/core/autonomy/envelope-v2.js +60 -0
- package/dist/vendor/core/autonomy/envelope.d.ts +17 -0
- package/dist/vendor/core/autonomy/envelope.js +27 -0
- package/dist/vendor/core/autonomy/escalation-ledger.d.ts +20 -0
- package/dist/vendor/core/autonomy/escalation-ledger.js +18 -0
- package/dist/vendor/core/autonomy/resume.d.ts +15 -0
- package/dist/vendor/core/autonomy/resume.js +23 -0
- package/dist/vendor/core/circuit/circuit-breaker.d.ts +60 -0
- package/dist/vendor/core/circuit/circuit-breaker.js +143 -0
- package/dist/vendor/core/compiler.d.ts +2 -0
- package/dist/vendor/core/compiler.js +10 -4
- package/dist/vendor/core/context-distillation.d.ts +3 -0
- package/dist/vendor/core/context-distillation.js +44 -0
- package/dist/vendor/core/context-flow/compile-context.d.ts +8 -0
- package/dist/vendor/core/context-flow/compile-context.js +111 -0
- package/dist/vendor/core/context-flow/entities.d.ts +2 -0
- package/dist/vendor/core/context-flow/entities.js +44 -0
- package/dist/vendor/core/context-flow/evaluate-policy.d.ts +2 -0
- package/dist/vendor/core/context-flow/evaluate-policy.js +42 -0
- package/dist/vendor/core/context-flow/index.d.ts +11 -0
- package/dist/vendor/core/context-flow/index.js +24 -0
- package/dist/vendor/core/context-flow/labels.d.ts +3 -0
- package/dist/vendor/core/context-flow/labels.js +17 -0
- package/dist/vendor/core/context-flow/normalizer.d.ts +9 -0
- package/dist/vendor/core/context-flow/normalizer.js +69 -0
- package/dist/vendor/core/context-flow/profiles.d.ts +33 -0
- package/dist/vendor/core/context-flow/profiles.js +36 -0
- package/dist/vendor/core/context-flow/redaction.d.ts +1 -0
- package/dist/vendor/core/context-flow/redaction.js +6 -0
- package/dist/vendor/core/context-flow/sensitivity.d.ts +2 -0
- package/dist/vendor/core/context-flow/sensitivity.js +27 -0
- package/dist/vendor/core/context-flow/sync-preview.d.ts +2 -0
- package/dist/vendor/core/context-flow/sync-preview.js +22 -0
- package/dist/vendor/core/context-flow/token-estimator.d.ts +3 -0
- package/dist/vendor/core/context-flow/token-estimator.js +13 -0
- package/dist/vendor/core/context-flow/types.d.ts +91 -0
- package/dist/vendor/core/context-flow/types.js +2 -0
- package/dist/vendor/core/context-integrity.d.ts +26 -0
- package/dist/vendor/core/context-integrity.js +56 -0
- package/dist/vendor/core/context-utility.d.ts +47 -0
- package/dist/vendor/core/context-utility.js +405 -0
- package/dist/vendor/core/cost/pipeline.d.ts +92 -0
- package/dist/vendor/core/cost/pipeline.js +141 -0
- package/dist/vendor/core/cost/tagged-cost.d.ts +27 -0
- package/dist/vendor/core/cost/tagged-cost.js +55 -0
- package/dist/vendor/core/cost-governor.d.ts +2 -0
- package/dist/vendor/core/cost-governor.js +50 -0
- package/dist/vendor/core/cve/cve-check.d.ts +80 -0
- package/dist/vendor/core/cve/cve-check.js +172 -0
- package/dist/vendor/core/digital-twin/index.d.ts +27 -0
- package/dist/vendor/core/digital-twin/index.js +90 -0
- package/dist/vendor/core/drift/drift-graph.d.ts +47 -0
- package/dist/vendor/core/drift/drift-graph.js +100 -0
- package/dist/vendor/core/drift/objective-lock.d.ts +69 -0
- package/dist/vendor/core/drift/objective-lock.js +88 -0
- package/dist/vendor/core/drift/scope.d.ts +46 -0
- package/dist/vendor/core/drift/scope.js +102 -0
- package/dist/vendor/core/drift/signature-lock.d.ts +48 -0
- package/dist/vendor/core/drift/signature-lock.js +202 -0
- package/dist/vendor/core/drift/stale-proof-gate.d.ts +21 -0
- package/dist/vendor/core/drift/stale-proof-gate.js +19 -0
- package/dist/vendor/core/eval/known-bad-world-runner.d.ts +24 -0
- package/dist/vendor/core/eval/known-bad-world-runner.js +256 -0
- package/dist/vendor/core/evidence/claim-audit.d.ts +18 -0
- package/dist/vendor/core/evidence/claim-audit.js +89 -0
- package/dist/vendor/core/exit-intelligence.d.ts +2 -0
- package/dist/vendor/core/exit-intelligence.js +58 -0
- package/dist/vendor/core/explain/formatter.d.ts +42 -0
- package/dist/vendor/core/explain/formatter.js +171 -0
- package/dist/vendor/core/explain/timeline.d.ts +29 -0
- package/dist/vendor/core/explain/timeline.js +213 -0
- package/dist/vendor/core/failure-taxonomy.d.ts +2 -0
- package/dist/vendor/core/failure-taxonomy.js +76 -0
- package/dist/vendor/core/gateway/index.d.ts +10 -0
- package/dist/vendor/core/gateway/index.js +12 -0
- package/dist/vendor/core/gateway/registry.d.ts +40 -0
- package/dist/vendor/core/gateway/registry.js +97 -0
- package/dist/vendor/core/gateway/transport.d.ts +31 -0
- package/dist/vendor/core/gateway/transport.js +82 -0
- package/dist/vendor/core/gateway/vault.d.ts +19 -0
- package/dist/vendor/core/gateway/vault.js +29 -0
- package/dist/vendor/core/graph/adapters.d.ts +43 -0
- package/dist/vendor/core/graph/adapters.js +91 -0
- package/dist/vendor/core/graph/hotspots.d.ts +22 -0
- package/dist/vendor/core/graph/hotspots.js +30 -0
- package/dist/vendor/core/graph/index.d.ts +1 -0
- package/dist/vendor/core/graph/index.js +2 -0
- package/dist/vendor/core/honey/honey-tokens.d.ts +32 -0
- package/dist/vendor/core/honey/honey-tokens.js +44 -0
- package/dist/vendor/core/index.d.ts +7 -4
- package/dist/vendor/core/index.js +222 -64
- package/dist/vendor/core/learning/bayesian-update.d.ts +31 -0
- package/dist/vendor/core/learning/bayesian-update.js +60 -0
- package/dist/vendor/core/learning/prior-sets.d.ts +42 -0
- package/dist/vendor/core/learning/prior-sets.js +111 -0
- package/dist/vendor/core/learning/promotion-gate.d.ts +17 -0
- package/dist/vendor/core/learning/promotion-gate.js +23 -0
- package/dist/vendor/core/leash/blast-radius.d.ts +42 -0
- package/dist/vendor/core/leash/blast-radius.js +156 -0
- package/dist/vendor/core/leash/policy-leash.d.ts +31 -0
- package/dist/vendor/core/leash/policy-leash.js +117 -0
- package/dist/vendor/core/memo/memo.d.ts +63 -0
- package/dist/vendor/core/memo/memo.js +97 -0
- package/dist/vendor/core/memory/learning-pipeline.d.ts +154 -0
- package/dist/vendor/core/memory/learning-pipeline.js +391 -0
- package/dist/vendor/core/memory/palace.d.ts +84 -0
- package/dist/vendor/core/memory/palace.js +379 -0
- package/dist/vendor/core/merge/ast-merge.d.ts +22 -0
- package/dist/vendor/core/merge/ast-merge.js +350 -0
- package/dist/vendor/core/merge/text-merge.d.ts +12 -0
- package/dist/vendor/core/merge/text-merge.js +182 -0
- package/dist/vendor/core/otel/tracer.d.ts +45 -0
- package/dist/vendor/core/otel/tracer.js +116 -0
- package/dist/vendor/core/parallel/parallel-attempts.d.ts +28 -0
- package/dist/vendor/core/parallel/parallel-attempts.js +41 -0
- package/dist/vendor/core/parallel/scorer.d.ts +24 -0
- package/dist/vendor/core/parallel/scorer.js +65 -0
- package/dist/vendor/core/pattern-detection.d.ts +64 -0
- package/dist/vendor/core/pattern-detection.js +108 -0
- package/dist/vendor/core/persistence/checkpoint.d.ts +44 -0
- package/dist/vendor/core/persistence/checkpoint.js +156 -0
- package/dist/vendor/core/persistence/cleanup.d.ts +22 -0
- package/dist/vendor/core/persistence/cleanup.js +131 -0
- package/dist/vendor/core/persistence/index.d.ts +2 -0
- package/dist/vendor/core/persistence/index.js +1 -0
- package/dist/vendor/core/persistence/runs-reader.d.ts +52 -0
- package/dist/vendor/core/persistence/runs-reader.js +84 -0
- package/dist/vendor/core/persistence/store.d.ts +6 -1
- package/dist/vendor/core/persistence/store.js +5 -0
- package/dist/vendor/core/policy/file-touch-quota.d.ts +60 -0
- package/dist/vendor/core/policy/file-touch-quota.js +105 -0
- package/dist/vendor/core/policy/policy-loader.d.ts +30 -0
- package/dist/vendor/core/policy/policy-loader.js +170 -0
- package/dist/vendor/core/policy/policy-schema.d.ts +55 -0
- package/dist/vendor/core/policy/policy-schema.js +78 -0
- package/dist/vendor/core/policy.d.ts +6 -0
- package/dist/vendor/core/probe/probe.d.ts +49 -0
- package/dist/vendor/core/probe/probe.js +115 -0
- package/dist/vendor/core/proof/patch-proof.d.ts +58 -0
- package/dist/vendor/core/proof/patch-proof.js +84 -0
- package/dist/vendor/core/proof/semantic-probe.d.ts +25 -0
- package/dist/vendor/core/proof/semantic-probe.js +82 -0
- package/dist/vendor/core/recovery/failure-mode-runner.d.ts +29 -0
- package/dist/vendor/core/recovery/failure-mode-runner.js +39 -0
- package/dist/vendor/core/red-blue/red-phase.d.ts +64 -0
- package/dist/vendor/core/red-blue/red-phase.js +141 -0
- package/dist/vendor/core/red-blue/risk-tiers.d.ts +22 -0
- package/dist/vendor/core/red-blue/risk-tiers.js +33 -0
- package/dist/vendor/core/replay/replay.d.ts +85 -0
- package/dist/vendor/core/replay/replay.js +109 -0
- package/dist/vendor/core/router/engine.d.ts +54 -0
- package/dist/vendor/core/router/engine.js +131 -0
- package/dist/vendor/core/router/index.d.ts +1 -0
- package/dist/vendor/core/router/index.js +2 -0
- package/dist/vendor/core/router/trust-calibration.d.ts +57 -0
- package/dist/vendor/core/router/trust-calibration.js +127 -0
- package/dist/vendor/core/run-martin.d.ts +2 -0
- package/dist/vendor/core/run-martin.js +287 -0
- package/dist/vendor/core/security/cve-scanner.d.ts +62 -0
- package/dist/vendor/core/security/cve-scanner.js +178 -0
- package/dist/vendor/core/sentinel/efficiency-sentinel.d.ts +29 -0
- package/dist/vendor/core/sentinel/efficiency-sentinel.js +30 -0
- package/dist/vendor/core/sentinel/progress-guard.d.ts +35 -0
- package/dist/vendor/core/sentinel/progress-guard.js +46 -0
- package/dist/vendor/core/siem/siem-emitter.d.ts +49 -0
- package/dist/vendor/core/siem/siem-emitter.js +157 -0
- package/dist/vendor/core/strategy/attempt-brief.d.ts +22 -0
- package/dist/vendor/core/strategy/attempt-brief.js +89 -0
- package/dist/vendor/core/summarize/diff-summary.d.ts +35 -0
- package/dist/vendor/core/summarize/diff-summary.js +204 -0
- package/dist/vendor/core/surface-signals.d.ts +21 -0
- package/dist/vendor/core/surface-signals.js +139 -0
- package/dist/vendor/core/truth/truth-wall.d.ts +51 -0
- package/dist/vendor/core/truth/truth-wall.js +69 -0
- package/dist/vendor/core/truth-spine.d.ts +26 -0
- package/dist/vendor/core/truth-spine.js +62 -0
- package/dist/vendor/core/types.d.ts +115 -0
- package/dist/vendor/core/types.js +2 -0
- package/dist/vendor/core/verification/tiered-verify.d.ts +17 -0
- package/dist/vendor/core/verification/tiered-verify.js +29 -0
- package/dist/vendor/core/verifier-pyramid.d.ts +32 -0
- package/dist/vendor/core/verifier-pyramid.js +111 -0
- package/dist/vendor/core/workflow-artifacts.d.ts +99 -0
- package/dist/vendor/core/workflow-artifacts.js +668 -0
- package/dist/vendor/core/wrap/supervised-run.d.ts +96 -0
- package/dist/vendor/core/wrap/supervised-run.js +178 -0
- package/docs/assets/cli-animated.svg +139 -0
- package/docs/assets/cli-static.svg +34 -0
- package/docs/assets/github-hero-v2.svg +23 -0
- package/docs/assets/martin-raplph.png.jpg +0 -0
- package/docs/assets/martinloop-logo.png +0 -0
- package/docs/assets/nvidia-inception-program-light.png +0 -0
- package/docs/assets/nvidia-inception-program.png +0 -0
- package/docs/assets/phase3c-sidesidebyside-demo.html +228 -0
- package/docs/assets/side-by-side.svg +134 -0
- package/docs/oss/CLAUDE-CODE-WALKTHROUGH.md +142 -0
- package/docs/oss/EXAMPLES.md +9 -1
- package/docs/oss/OSS-BOUNDARY-REPORT.json +109 -113
- package/docs/oss/OSS-BOUNDARY-REPORT.md +48 -48
- package/docs/oss/QUICKSTART.md +39 -4
- package/docs/oss/RALPH-LOOP-SAFETY.md +113 -0
- package/docs/oss/README.md +7 -4
- package/docs/oss/RELEASE-SURFACE-REPORT.json +46 -45
- package/docs/oss/RELEASE-SURFACE-REPORT.md +36 -35
- package/package.json +129 -49
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { type RiskTier } from "./risk-tiers.js";
|
|
2
|
+
/** A single adversarial finding reported by the Red phase. */
export interface RedFinding {
    /** Identifier of the trap that fired (the programmatic probes emit ids such as "T01" or "T11"). */
    trapId: string;
    /** "block" makes shouldAcceptPatch reject the patch; "warn" does not. */
    severity: "warn" | "block";
    /** Human-readable explanation of what was detected. */
    description: string;
    /** Optional resolution marker — presumably an ISO timestamp; TODO confirm (the programmatic probes never set it). */
    resolvedAt?: string;
}
/** Aggregate result of one Red phase run. */
export interface RedFindings {
    /** Tier the phase was run at. */
    riskTier: RiskTier;
    /** Probe count reported for the tier (runRedPhase reads it from PROBE_COUNTS). */
    probesRun: number;
    /** Number of entries in `findings`. */
    findingsCount: number;
    /** Programmatic findings followed by any findings returned by the model client. */
    findings: RedFinding[];
    /** True when the model client was actually invoked. */
    modelCallMade: boolean;
    /** Model identifier; only present when a model call was made. */
    modelUsed?: string;
    /** Cost reported by the model client, 0 when no model call was made. */
    budgetUsedUsd: number;
}
/** Minimal interface for the Anthropic model client (mockable in tests). */
export interface MockModelClient {
    /** Sends the Red phase prompt and resolves with extra findings plus usage figures. */
    complete(prompt: string): Promise<{
        findings: RedFinding[];
        tokensUsed: number;
        costUsd: number;
    }>;
}
/** Optional collaborators for runRedPhase. */
export interface RunRedPhaseOptions {
    /** Inject a mock or real Anthropic client. Required for release_critical tier. */
    modelClient?: MockModelClient;
    /** Callback fired with each ledger event produced by the phase. */
    onLedgerEvent?: (event: RedLedgerEvent) => void;
}
/** Ledger record summarising one Red phase run. */
export interface RedLedgerEvent {
    type: "red_phase_findings";
    /** Optional run identifier — NOTE(review): runRedPhase as shipped does not populate it; confirm who does. */
    runId?: string;
    riskTier: RiskTier;
    probesRun: number;
    findingsCount: number;
    modelCallMade: boolean;
    /** ISO-8601 timestamp taken when the event is emitted. */
    timestamp: string;
}
/** The patch handed to the Red phase. */
export interface PatchInput {
    patchId: string;
    /** Diff text scanned by the programmatic probes (and truncated into the model prompt). */
    diff: string;
    /** Paths touched by the patch. */
    changedFiles: string[];
}
/**
 * Runs the Red phase for a given patch and risk tier.
 *
 * - baseline:         programmatic probes only, no model call
 * - high_risk:        paranoid programmatic scan, no model call
 * - release_critical: paranoid scan + one Haiku model call
 */
export declare function runRedPhase(patch: PatchInput, tier: RiskTier, blueBudgetUsd: number, options?: RunRedPhaseOptions): Promise<RedFindings>;
/**
 * Returns true only if the findings contain zero block-severity entries.
 * A single block finding rejects the patch regardless of warn count.
 */
export declare function shouldAcceptPatch(findings: RedFindings): boolean;
/**
 * Convenience builder for RedFindings — useful in tests and policy engine.
 * `findingsCount` is always derived from `findings.length`; other omitted
 * fields receive defaults.
 */
export declare function buildRedFindings(input: Partial<RedFindings> & {
    riskTier: RiskTier;
    findings: RedFinding[];
}): RedFindings;
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import { PROBE_COUNTS, RED_PHASE_MODEL, resolveRedBudgetPolicy } from "./risk-tiers.js";
|
|
2
|
+
// ─── Programmatic probes ──────────────────────────────────────────────────────
|
|
3
|
+
/**
 * Runs programmatic adversarial probes against a patch diff.
 * These are deterministic checks — no model call.
 *
 * Findings are always emitted in the same order: T01, T11, T07, T10 and,
 * in paranoid mode, T03 then T02.
 *
 * @param {{ diff: string, changedFiles: string[] }} patch - Patch under review.
 *   A missing `diff` is treated as empty text and a missing or non-array
 *   `changedFiles` as an empty list, so a partially populated patch cannot
 *   crash the scan.
 * @param {boolean} paranoid - When true, the two extra "paranoid" probes run.
 * @returns {Array<{ trapId: string, severity: "warn" | "block", description: string }>}
 *   One finding per probe that fired (possibly empty).
 */
function runProgrammaticProbes(patch, paranoid) {
    // Normalise inputs once; String() keeps the coercion RegExp#test would apply anyway.
    const diff = String(patch.diff ?? "");
    const changedFiles = Array.isArray(patch.changedFiles) ? patch.changedFiles : [];
    const findings = [];
    // Probe 1: assertion deletion
    if (/^\-.*expect\(.*\)\.to/m.test(diff)) {
        findings.push({
            trapId: "T01",
            severity: "warn",
            description: "Patch removes test assertions — possible weak-test false green."
        });
    }
    // Probe 2: grounding evasion pragma
    // NOTE(review): this scans the whole diff text, so removed and context
    // lines containing a pragma also trigger it — confirm that is intended.
    if (/@martin-ignore|@ts-nocheck|eslint-disable/i.test(diff)) {
        findings.push({
            trapId: "T11",
            severity: "block",
            description: "Grounding evasion pragma detected in patch."
        });
    }
    // Probe 3: context poisoning
    // The changed-file list is checked as well as the diff text, so a write
    // into an agent directory is caught even when the path is absent from the diff.
    const contextDirPattern = /\.martin\/|\.claude\//;
    const touchesContextDir = contextDirPattern.test(diff) ||
        changedFiles.some((file) => contextDirPattern.test(String(file)));
    if (touchesContextDir) {
        findings.push({
            trapId: "T07",
            severity: "block",
            description: "Patch writes to agent context directory (.martin/ or .claude/)."
        });
    }
    // Probe 4: budget self-report
    if (/budgetSource.*model_reported|model_reported.*budget/i.test(diff)) {
        findings.push({
            trapId: "T10",
            severity: "block",
            description: "Patch introduces model-reported budget sourcing."
        });
    }
    if (paranoid) {
        // Probe 5: scope creep — manifest changes
        // Fires when an added line looks like a quoted key/value pair AND a
        // manifest file is among the changed files.
        if (/^\+.*"[^"]+"\s*:\s*"[^"]+".*$/m.test(diff) &&
            /package\.json|Cargo\.toml|go\.mod/i.test(changedFiles.join(","))) {
            findings.push({
                trapId: "T03",
                severity: "warn",
                description: "Paranoid scan: substantive manifest change detected."
            });
        }
        // Probe 6: silent revert — removal of recently added symbols
        const removedExportPattern = /^\-.*export\s+(function|const|class)\s+\w+/m;
        if (removedExportPattern.test(diff)) {
            findings.push({
                trapId: "T02",
                severity: "warn",
                description: "Paranoid scan: exported symbol removed — potential silent revert."
            });
        }
    }
    return findings;
}
|
|
63
|
+
// ─── Red phase runner ─────────────────────────────────────────────────────────
|
|
64
|
+
/**
 * Runs the Red phase for a given patch and risk tier.
 *
 * - baseline:         programmatic probes only, no model call
 * - high_risk:        paranoid programmatic scan, no model call
 * - release_critical: paranoid scan + one Haiku model call
 *
 * The model is consulted only when the tier's policy allows it AND a
 * `modelClient` was supplied; its findings are appended after the
 * programmatic ones and its reported cost becomes `budgetUsedUsd`.
 * A single ledger event is emitted through `options.onLedgerEvent`
 * (when provided) just before returning.
 *
 * NOTE(review): `probesRun` is read from PROBE_COUNTS rather than counted,
 * and the policy's `redBudgetCapUsd` is not consulted here — confirm both
 * are intentional.
 *
 * @param {{ patchId: string, diff: string, changedFiles: string[] }} patch
 * @param {"baseline" | "high_risk" | "release_critical"} tier
 * @param {number} blueBudgetUsd - Blue phase budget used to resolve the policy.
 * @param {{ modelClient?: object, onLedgerEvent?: Function }} [options]
 * @returns {Promise<object>} The RedFindings summary for this run.
 */
export async function runRedPhase(patch, tier, blueBudgetUsd, options = {}) {
    const policy = resolveRedBudgetPolicy(tier, blueBudgetUsd);
    const isParanoid = tier !== "baseline";
    const probeFindings = runProgrammaticProbes(patch, isParanoid);
    const probesRun = PROBE_COUNTS[tier];

    let findings = probeFindings;
    let modelCallMade = false;
    let modelUsed;
    let budgetUsedUsd = 0;

    if (policy.modelCallAllowed && options.modelClient) {
        const modelReply = await options.modelClient.complete(buildRedPhasePrompt(patch, probeFindings));
        findings = [...probeFindings, ...modelReply.findings];
        modelCallMade = true;
        modelUsed = RED_PHASE_MODEL;
        budgetUsedUsd += modelReply.costUsd;
    }

    // Assemble the summary in a fixed key order; `modelUsed` is only present
    // when a model call was actually made.
    const summary = {
        riskTier: tier,
        probesRun,
        findingsCount: findings.length,
        findings,
        modelCallMade
    };
    if (modelUsed !== undefined) {
        summary.modelUsed = modelUsed;
    }
    summary.budgetUsedUsd = budgetUsedUsd;

    // Emit ledger event
    if (options.onLedgerEvent != null) {
        options.onLedgerEvent({
            type: "red_phase_findings",
            riskTier: tier,
            probesRun,
            findingsCount: findings.length,
            modelCallMade,
            timestamp: new Date().toISOString()
        });
    }
    return summary;
}
|
|
107
|
+
// ─── Policy helpers ───────────────────────────────────────────────────────────
|
|
108
|
+
/**
 * Decides whether a patch survives the Red phase.
 *
 * Warn-severity findings never reject a patch; one block-severity finding
 * is enough to reject it.
 *
 * @param {{ findings: Array<{ severity: string }> }} findings - Red phase result.
 * @returns {boolean} True when no finding has severity "block".
 */
export function shouldAcceptPatch(findings) {
    const isNotBlocking = (entry) => entry.severity !== "block";
    return findings.findings.every(isNotBlocking);
}
|
|
115
|
+
/**
 * Builds a complete RedFindings object from a partial one — handy in tests
 * and in the policy engine.
 *
 * Defaults: `probesRun` falls back to the tier's PROBE_COUNTS entry,
 * `modelCallMade` to false and `budgetUsedUsd` to 0. `findingsCount` is
 * always recomputed from `findings.length`, and `modelUsed` is copied only
 * when it is defined.
 *
 * @param {object} input - Partial RedFindings; `riskTier` and `findings` are required.
 * @returns {object} A fully populated RedFindings object.
 */
export function buildRedFindings(input) {
    const { riskTier, findings, modelUsed } = input;
    const built = {
        riskTier,
        // The tier lookup is only evaluated when no explicit count was given.
        probesRun: input.probesRun ?? PROBE_COUNTS[riskTier],
        findingsCount: findings.length,
        findings,
        modelCallMade: input.modelCallMade ?? false
    };
    if (modelUsed !== undefined) {
        built.modelUsed = modelUsed;
    }
    built.budgetUsedUsd = input.budgetUsedUsd ?? 0;
    return built;
}
|
|
129
|
+
// ─── Internal helpers ─────────────────────────────────────────────────────────
|
|
130
|
+
/**
 * Assembles the newline-separated prompt sent to the Red phase model.
 *
 * The prompt carries fixed instructions, the trap ids already raised by the
 * programmatic probes, the changed-file list and at most the first 2000
 * characters of the diff.
 *
 * @param {{ diff: string, changedFiles: string[] }} patch - Patch under review.
 * @param {Array<{ trapId: string }>} existingFindings - Findings detected so far.
 * @returns {string} The prompt text.
 */
function buildRedPhasePrompt(patch, existingFindings) {
    const detectedTrapIds = JSON.stringify(existingFindings.map((finding) => finding.trapId));
    const fileList = patch.changedFiles.join(", ");
    const diffExcerpt = patch.diff.slice(0, 2000);

    const promptLines = [];
    promptLines.push("You are a security-focused code reviewer running adversarial analysis.");
    promptLines.push("Analyze this patch for adversarial patterns not caught by programmatic probes.");
    promptLines.push("Return JSON array of findings with: trapId, severity (warn|block), description.");
    promptLines.push(`Already detected: ${detectedTrapIds}`);
    promptLines.push(`Changed files: ${fileList}`);
    promptLines.push("Diff (truncated to 2000 chars):");
    promptLines.push(diffExcerpt);
    return promptLines.join("\n");
}
|
|
141
|
+
//# sourceMappingURL=red-phase.js.map
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/** Risk classification that selects how aggressively the Red phase probes a patch. */
export type RiskTier = "baseline" | "high_risk" | "release_critical";
/** Budget and model-call policy resolved for one tier and Blue budget. */
export interface RedBudgetPolicy {
    /** Tier the policy was resolved for. */
    riskTier: RiskTier;
    /** Blue phase budget the cap was derived from. */
    blueBudgetUsd: number;
    /** Cap on Red phase spend: 30% / 100% / 150% of Blue */
    redBudgetCapUsd: number;
    /** Only release_critical permits a Haiku model call */
    modelCallAllowed: boolean;
}
/**
 * Returns the Red phase budget policy for a given risk tier and Blue budget.
 */
export declare function resolveRedBudgetPolicy(tier: RiskTier, blueBudgetUsd: number): RedBudgetPolicy;
/**
 * Probe counts per tier.
 *   baseline         = standard 6-probe sweep
 *   high_risk        = paranoid 12-probe sweep
 *   release_critical = paranoid 12-probe sweep + model
 */
export declare const PROBE_COUNTS: Record<RiskTier, number>;
/** The only model ever permitted in the Red phase. */
export declare const RED_PHASE_MODEL: "claude-haiku-4-5-20251001";
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
// ─── Risk Tier Definitions ────────────────────────────────────────────────────
|
|
2
|
+
// Governs how aggressively Red phase probes a patch and whether a model call
|
|
3
|
+
// is permitted. Budget caps are expressed as fractions of the Blue phase budget.
|
|
4
|
+
// Fraction of the Blue phase budget that the Red phase may spend, per tier.
const BUDGET_MULTIPLIERS = {
    baseline: 0.3,
    high_risk: 1,
    release_critical: 1.5
};
/**
 * Resolves the Red phase budget policy for a risk tier.
 *
 * The spend cap is the Blue budget scaled by the tier's multiplier, and a
 * model call is permitted for the "release_critical" tier only.
 *
 * @param {"baseline" | "high_risk" | "release_critical"} tier - Risk tier.
 * @param {number} blueBudgetUsd - Blue phase budget in USD.
 * @returns {{ riskTier: string, blueBudgetUsd: number, redBudgetCapUsd: number, modelCallAllowed: boolean }}
 */
export function resolveRedBudgetPolicy(tier, blueBudgetUsd) {
    const multiplier = BUDGET_MULTIPLIERS[tier];
    const redBudgetCapUsd = blueBudgetUsd * multiplier;
    const modelCallAllowed = tier === "release_critical";
    return { riskTier: tier, blueBudgetUsd, redBudgetCapUsd, modelCallAllowed };
}
|
|
20
|
+
/**
 * Probe counts per tier.
 *   baseline         = standard 6-probe sweep
 *   high_risk        = paranoid 12-probe sweep
 *   release_critical = paranoid 12-probe sweep + model
 *
 * NOTE(review): runProgrammaticProbes in red-phase.js contains four
 * always-on probes plus two paranoid-only probes, so these figures do not
 * obviously match the number of checks executed — confirm what they count.
 */
export const PROBE_COUNTS = {
    baseline: 6,
    high_risk: 12,
    release_critical: 12
};
/** The only model ever permitted in the Red phase. */
export const RED_PHASE_MODEL = "claude-haiku-4-5-20251001";
|
|
33
|
+
//# sourceMappingURL=risk-tiers.js.map
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* replay.ts — SLICE-10
|
|
3
|
+
*
|
|
4
|
+
* Reproducibility seal: re-runs the decision pipeline over stored attempt
|
|
5
|
+
* artifacts (no model calls) and confirms every gate decision still matches.
|
|
6
|
+
*
|
|
7
|
+
* Any deterministic mismatch is a P0 defect in the decision pipeline.
|
|
8
|
+
*/
|
|
9
|
+
/** Everything recorded about one attempt that the replay layer needs to re-check it. */
export interface StoredAttemptArtifact {
    attemptId: string;
    loopId: string;
    /** The diff that was evaluated */
    diff: string;
    /** The objective at time of evaluation */
    objective: string;
    /** Decisions recorded during the original run */
    decisions: {
        /** Leash gate outcome; same shape as the value returned by replayLeashCheck. */
        leash: {
            blocked: boolean;
            matchedPattern?: string;
        };
        /** Grounding gate outcome. */
        grounding: {
            contradictions: number;
        };
        /** Proof gate outcome; same shape as the value returned by replayProofGrade. */
        proof: {
            grade: "A" | "B" | "C";
            passed: boolean;
        };
        finalVerdict: "ACCEPTED" | "REJECTED";
    };
    /** ISO timestamp of original run */
    recordedAt: string;
}
/** Gate decisions in the same shape as StoredAttemptArtifact["decisions"]. */
export interface ReplayDecisions {
    leash: {
        blocked: boolean;
        matchedPattern?: string;
    };
    grounding: {
        contradictions: number;
    };
    proof: {
        grade: "A" | "B" | "C";
        passed: boolean;
    };
    finalVerdict: "ACCEPTED" | "REJECTED";
}
/** One difference between a recorded decision and its replayed counterpart. */
export interface ReplayMismatch {
    /** Which gate disagreed. */
    gate: "leash" | "grounding" | "proof" | "finalVerdict";
    /** Value recorded during the original run. */
    original: unknown;
    /** Value produced by the replay. */
    replayed: unknown;
    severity: "P0" | "P1";
}
/** Outcome of replaying a single attempt. */
export interface ReplayReport {
    loopId: string;
    attemptId: string;
    /** Presumably true when `mismatches` is empty — TODO confirm against replayAttempt. */
    match: boolean;
    mismatches: ReplayMismatch[];
    /** ISO timestamp of the replay. */
    replayedAt: string;
    /** SHA-256 over (loopId + attemptId + replayedAt + match) */
    replayHash: string;
}
/**
 * Re-runs the leash check over a stored diff.
 * Uses the same BLOCKED_PATTERNS from leash.ts — but as a lightweight
 * re-implementation to avoid circular deps in the replay layer.
 */
export declare function replayLeashCheck(diff: string): {
    blocked: boolean;
    matchedPattern?: string;
};
/**
 * Re-evaluates the proof grade from stored inputs.
 * Grade is deterministic from the verifier result string.
 */
export declare function replayProofGrade(verifierResult: string, objective: string): {
    grade: "A" | "B" | "C";
    passed: boolean;
};
/**
 * Replays one stored attempt and reports whether its recorded decisions are
 * reproduced. `verifierResult` is optional.
 */
export declare function replayAttempt(artifact: StoredAttemptArtifact, verifierResult?: string): ReplayReport;
/**
 * Replays a list of stored attempts. `verifierResults` is an optional map of
 * verifier output — presumably keyed by attemptId; TODO confirm.
 */
export declare function replayLoop(artifacts: StoredAttemptArtifact[], verifierResults?: Map<string, string>): {
    reports: ReplayReport[];
    allMatch: boolean;
    p0Count: number;
};
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* replay.ts — SLICE-10
|
|
3
|
+
*
|
|
4
|
+
* Reproducibility seal: re-runs the decision pipeline over stored attempt
|
|
5
|
+
* artifacts (no model calls) and confirms every gate decision still matches.
|
|
6
|
+
*
|
|
7
|
+
* Any deterministic mismatch is a P0 defect in the decision pipeline.
|
|
8
|
+
*/
|
|
9
|
+
import { createHash } from "node:crypto";
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Pipeline re-runners (deterministic, no model calls)
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
/**
 * Re-runs the leash check over a stored diff.
 *
 * This is a lightweight local copy of the leash patterns (kept here to avoid
 * circular dependencies in the replay layer). Patterns are tried in order
 * and the first one that matches decides the result.
 *
 * @param {string} diff - Stored diff text to scan.
 * @returns {{ blocked: boolean, matchedPattern?: string }} `matchedPattern`
 *   is present only when the diff is blocked.
 */
export function replayLeashCheck(diff) {
    // [name, pattern] pairs; order matters because the first hit wins.
    const patternTable = [
        ["SHELL_RM_RF", /rm\s+-rf?\b/],
        ["SHELL_PIPE_EVAL", /curl.*\|\s*(?:ba)?sh|wget.*\|\s*(?:ba)?sh/],
        ["GIT_FORCE_PUSH", /git\s+push\s+.*--force/],
        ["GIT_RESET_HARD", /git\s+reset\s+--hard/],
        ["FORK_BOMB", /:\s*\(\s*\)\s*\{.*:.*\|.*:.*\}/],
        ["SHELL_CHMOD_777", /chmod\s+(?:a\+rwx|777)/],
        ["NODE_EXEC_EVAL", /eval\s*\(/],
        ["SUDO_ESCALATION", /sudo\s+/]
    ];
    const firstHit = patternTable.find(([, pattern]) => pattern.test(diff));
    if (firstHit === undefined) {
        return { blocked: false };
    }
    return { blocked: true, matchedPattern: firstHit[0] };
}
|
|
35
|
+
/**
 * Replays the proof gate from a stored verifier output.
 *
 * The grade depends only on which marker substrings occur in the lower-cased
 * verifier output (plain substring search, so e.g. "broken" contains "ok"):
 *   - a pass marker and no fail marker -> grade "A", passed
 *   - a pass marker and a fail marker  -> grade "B", not passed
 *   - no pass marker                   -> grade "C", not passed
 *
 * @param {string} verifierResult - Raw verifier output.
 * @param {string} objective - Not read by this function.
 * @returns {{ grade: "A" | "B" | "C", passed: boolean }}
 */
export function replayProofGrade(verifierResult, objective) {
    const text = verifierResult.toLowerCase();
    const mentionsAny = (markers) => markers.some((marker) => text.includes(marker));
    const sawPass = mentionsAny(["pass", "ok", "✓"]);
    const sawFail = mentionsAny(["fail", "error", "✗"]);
    if (!sawPass) {
        return { grade: "C", passed: false };
    }
    return sawFail
        ? { grade: "B", passed: false }
        : { grade: "A", passed: true };
}
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// Core replay function
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
/**
 * Replays the gate decisions for a single stored attempt and reports drift.
 *
 * The leash check is re-run over `artifact.diff`. When `verifierResult` is
 * supplied the proof grade is re-run as well. Each replayed decision is
 * compared with the one stored in `artifact.decisions`; every difference is
 * recorded as a mismatch with severity "P0".
 *
 * The replayed final verdict is derived purely from the replayed gates:
 * "ACCEPTED" when the replayed leash did not block and the replayed proof
 * passed (or the proof gate was not replayed), otherwise "REJECTED".
 *
 * @param {object} artifact - Stored attempt; reads `loopId`, `attemptId`,
 *   `diff`, `objective` and `decisions.leash`, `decisions.proof`,
 *   `decisions.finalVerdict`.
 * @param {string} [verifierResult] - Raw verifier output. When omitted the
 *   proof gate is not replayed and does not influence the replayed verdict.
 * @returns {object} Report with `loopId`, `attemptId`, `match`, `mismatches`,
 *   `replayedAt` (ISO timestamp) and `replayHash` (SHA-256 hex over
 *   `loopId:attemptId:replayedAt:match`, so it differs between replays).
 */
export function replayAttempt(artifact, verifierResult) {
    const replayedAt = new Date().toISOString();
    const mismatches = [];
    // Re-run leash
    const replayLeash = replayLeashCheck(artifact.diff);
    if (replayLeash.blocked !== artifact.decisions.leash.blocked) {
        mismatches.push({
            gate: "leash",
            original: artifact.decisions.leash,
            replayed: replayLeash,
            severity: "P0"
        });
    }
    // Re-run proof grade (if verifier result provided)
    const replayProof = verifierResult !== undefined
        ? replayProofGrade(verifierResult, artifact.objective)
        : undefined;
    if (replayProof !== undefined && replayProof.grade !== artifact.decisions.proof.grade) {
        mismatches.push({
            gate: "proof",
            original: artifact.decisions.proof,
            replayed: replayProof,
            severity: "P0"
        });
    }
    // Final verdict: derive from the *replayed* decisions. Using the stored
    // proof outcome here would let a drifted proof gate go unnoticed in the
    // verdict comparison.
    const proofAllows = replayProof === undefined || replayProof.passed;
    const replayedVerdict = !replayLeash.blocked && proofAllows ? "ACCEPTED" : "REJECTED";
    if (replayedVerdict !== artifact.decisions.finalVerdict) {
        mismatches.push({
            gate: "finalVerdict",
            original: artifact.decisions.finalVerdict,
            replayed: replayedVerdict,
            severity: "P0"
        });
    }
    const match = mismatches.length === 0;
    const replayHash = createHash("sha256")
        .update(`${artifact.loopId}:${artifact.attemptId}:${replayedAt}:${match}`)
        .digest("hex");
    return {
        loopId: artifact.loopId,
        attemptId: artifact.attemptId,
        match,
        mismatches,
        replayedAt,
        replayHash
    };
}
|
|
100
|
+
// ---------------------------------------------------------------------------
|
|
101
|
+
// Replay runner: multiple attempts
|
|
102
|
+
// ---------------------------------------------------------------------------
|
|
103
|
+
/**
 * Replays every stored attempt and summarises the outcome.
 *
 * @param {object[]} artifacts - Stored attempts, replayed in order.
 * @param {Map<string, string>} [verifierResults] - Optional verifier outputs
 *   looked up by each artifact's `attemptId`.
 * @returns {{ reports: object[], allMatch: boolean, p0Count: number }}
 *   The per-attempt reports, whether all of them matched, and the total
 *   number of mismatches with severity "P0".
 */
export function replayLoop(artifacts, verifierResults) {
    const reports = artifacts.map((artifact) => {
        const storedVerifierOutput = verifierResults?.get(artifact.attemptId);
        return replayAttempt(artifact, storedVerifierOutput);
    });
    let allMatch = true;
    let p0Count = 0;
    reports.forEach((report) => {
        if (!report.match) {
            allMatch = false;
        }
        report.mismatches.forEach((mismatch) => {
            if (mismatch.severity === "P0") {
                p0Count += 1;
            }
        });
    });
    return { reports, allMatch, p0Count };
}
|
|
109
|
+
//# sourceMappingURL=replay.js.map
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import type { LoopTask, FailureClass } from "../../contracts/index.js";
|
|
2
|
+
import type { CostGovernorState } from "../policy.js";
|
|
3
|
+
import { type ModelTrustProfile } from "./trust-calibration.js";
|
|
4
|
+
/**
 * Minimal structural interface for what the router requires from an adapter.
 * Any MartinAdapter from @martin/core is structurally compatible with this.
 */
export interface RouterAdapterRef {
    metadata: {
        /**
         * Model identifier. The router reports it in rationales and span
         * attributes and matches it against trust-profile model names.
         */
        model: string;
        /** Optional provider label. NOTE(review): not read by the router in engine.js. */
        provider?: string;
    };
}
|
|
14
|
+
/** One candidate route the router may select. */
export interface RouteConfig {
    /** Adapter that would execute the attempt if this route is chosen. */
    adapter: RouterAdapterRef;
    /**
     * Base cost in USD per 1k units. The router multiplies it by 15 to
     * estimate an attempt's cost against the remaining budget, and compares
     * it between routes to pick the cheapest one.
     */
    baseCostUsdPer1kTcs: number;
    /**
     * Latency figure used to choose among medium-tier routes (lowest wins);
     * treated as 5000 when omitted.
     */
    maxLatencyMs?: number;
    /** Trust tier used by the escalation / economy heuristics. */
    trustTier: "high" | "medium" | "low";
}
|
|
20
|
+
/** Inputs the router considers when choosing a route for the next attempt. */
export interface RouteEvaluationContext {
    /** Task being routed. NOTE(review): evaluateRoute in engine.js does not read this field. */
    task: LoopTask;
    /** Budget snapshot; the router reads `pressure` and `remainingBudgetUsd`. */
    costState: CostGovernorState;
    /** Failure class of the previous attempt, if any; feeds the escalation and economy heuristics. */
    currentFailure?: FailureClass;
    /**
     * Task complexity estimate. The router escalates to a high-trust route
     * above 0.7 and prefers an economical route below 0.3.
     * Presumably a 0–1 score — TODO confirm against the producer.
     */
    complexityScore: number;
    /**
     * Estimated blast radius of the planned action on a 0–100 scale.
     * Exposed in run summary and OTel span.
     * When > 70, forces high-trust route regardless of other heuristics.
     */
    blastRadius?: number;
    /**
     * Trust profiles derived from historical run data by the Trust Calibration Engine.
     * When present, the router uses these to auto-downgrade to cheaper models that
     * have proven reliability and to deprioritize models with poor track records.
     */
    trustProfiles?: ModelTrustProfile[];
}
|
|
38
|
+
/** Result of a routing decision. */
export interface RouteDecision {
    /** Adapter of the selected route. */
    adapter: RouterAdapterRef;
    /** Human-readable explanation of why this route was selected. */
    rationale: string;
    /** Estimated cost per 1k tokens for the selected route — exposed in run summary and OTel span */
    selectedCostPer1kTcs: number;
    /** Trust tier of the selected route */
    selectedTrustTier: "high" | "medium" | "low";
}
|
|
46
|
+
/** Chooses which configured route (adapter/model) should run the next attempt. */
export declare class MartinRouter {
    private readonly availableRoutes;
    /**
     * @param availableRoutes Candidate routes. The list is stored as given;
     *   an empty list is only rejected when evaluateRoute is called.
     */
    constructor(availableRoutes: RouteConfig[]);
    /**
     * Dynamically selects the optimal provider/model adapter for the next attempt.
     * Balances the necessity for intelligence (trustTier, complexity) against budget constraints.
     *
     * @throws Error when the router has no routes configured.
     */
    evaluateRoute(context: RouteEvaluationContext): RouteDecision;
}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import { getTracer } from "../otel/tracer.js";
|
|
2
|
+
import { shouldDeprioritize } from "./trust-calibration.js";
|
|
3
|
+
/**
 * Chooses which configured route (adapter/model) should run the next attempt.
 */
export class MartinRouter {
    availableRoutes;
    /**
     * @param {object[]} availableRoutes - Candidate routes, stored as given.
     */
    constructor(availableRoutes) {
        this.availableRoutes = availableRoutes;
    }
    /**
     * Picks a route for the next attempt and records the decision on a
     * "martin.router_decision" span.
     *
     * Heuristics are tried in this order; the first one that yields a route wins:
     *   1. blast radius above 70      -> cheapest high-trust route
     *   2. trust calibration          -> cheapest proven route, if cheaper than the first candidate
     *   3. hard budget limit          -> cheapest route
     *   4. hard failure / complex     -> cheapest high-trust route
     *   5. trivial failure / simple   -> cheapest non-high-trust route
     *   6. otherwise                  -> lowest-latency medium-trust route
     *   7. fallback                   -> first remaining route
     *
     * @param {object} context - Routing inputs (cost state, failure class,
     *   complexity score, optional blast radius and trust profiles).
     * @returns {object} The selected adapter, a rationale string, and the
     *   selected route's cost and trust tier.
     * @throws {Error} When no routes are configured.
     */
    evaluateRoute(context) {
        const allRoutes = this.availableRoutes;
        if (allRoutes.length === 0) {
            throw new Error("MartinRouter has no available routes configured.");
        }
        const { costState, currentFailure, complexityScore, blastRadius, trustProfiles } = context;
        const tracer = getTracer();
        const spanAttributes = {
            "router.pressure": costState.pressure,
            "router.remaining_budget_usd": costState.remainingBudgetUsd,
            "router.complexity_score": complexityScore
        };
        if (currentFailure) {
            spanAttributes["router.failure_class"] = currentFailure;
        }
        if (blastRadius !== undefined) {
            spanAttributes["router.blast_radius"] = blastRadius;
        }
        const span = tracer.startSpan("martin.router_decision", spanAttributes);

        // Records the choice on the span, closes it, and builds the decision.
        const finish = (chosen, rationale) => {
            Object.assign(span.attributes, {
                "router.selected_model": chosen.adapter.metadata.model,
                "router.selected_trust_tier": chosen.trustTier,
                "router.selected_cost_per_1k_tcs": chosen.baseCostUsdPer1kTcs,
                "router.rationale": rationale
            });
            tracer.endSpan(span, "OK");
            return {
                adapter: chosen.adapter,
                rationale,
                selectedCostPer1kTcs: chosen.baseCostUsdPer1kTcs,
                selectedTrustTier: chosen.trustTier
            };
        };
        // First-wins minimum by base cost; only ever called with a non-empty list.
        const cheapestOf = (candidates) => {
            let winner = candidates[0];
            for (const candidate of candidates.slice(1)) {
                if (candidate.baseCostUsdPer1kTcs < winner.baseCostUsdPer1kTcs) {
                    winner = candidate;
                }
            }
            return winner;
        };
        const withTier = (candidates, tier) => candidates.filter((candidate) => candidate.trustTier === tier);

        // Drop routes whose estimated cost would exceed the remaining budget;
        // if that leaves nothing, keep every route rather than fail.
        const affordable = allRoutes.filter(({ baseCostUsdPer1kTcs }) => baseCostUsdPer1kTcs * 15 <= costState.remainingBudgetUsd);
        const routes = affordable.length === 0 ? allRoutes : affordable;

        // 1. High blast radius forces a high-trust route when one exists.
        if (blastRadius !== undefined && blastRadius > 70) {
            const highTier = withTier(routes, "high");
            if (highTier.length > 0) {
                const safest = cheapestOf(highTier);
                return finish(safest, `Forced high-trust route ${safest.adapter.metadata.model} due to blast radius ${blastRadius}/100.`);
            }
        }

        // 2. Trust calibration from historical profiles.
        let calibratedRoutes = routes;
        if (trustProfiles && trustProfiles.length > 0) {
            const profileFor = (route) => trustProfiles.find((profile) => profileMatchesRoute(profile, route));
            // Keep routes with no profile or with a profile that is not deprioritized.
            const notDeprioritized = routes.filter((route) => {
                const profile = profileFor(route);
                return profile === undefined || !shouldDeprioritize(profile);
            });
            if (notDeprioritized.length > 0) {
                calibratedRoutes = notDeprioritized;
            }
            // Routes with a strong, sufficiently observed track record, cheapest first.
            const proven = [];
            for (const route of calibratedRoutes) {
                const profile = profileFor(route);
                if (profile !== undefined && profile.efficiencyScore > 0.85 && profile.runsObserved >= 3) {
                    proven.push({ route, profile });
                }
            }
            proven.sort((a, b) => a.route.baseCostUsdPer1kTcs - b.route.baseCostUsdPer1kTcs);
            const cheapestProven = proven[0];
            const firstCandidate = routes[0];
            if (cheapestProven &&
                firstCandidate &&
                cheapestProven.route.baseCostUsdPer1kTcs < firstCandidate.baseCostUsdPer1kTcs) {
                const { route, profile } = cheapestProven;
                return finish(route, `Auto-selected ${route.adapter.metadata.model} based on ${String(profile.runsObserved)} historical runs (efficiency: ${String(Math.round(profile.efficiencyScore * 100))}%, completion: ${String(Math.round(profile.completionRate * 100))}%).`);
            }
        }
        const effectiveRoutes = calibratedRoutes;

        // 3. At the hard budget ceiling, always take the cheapest route.
        if (costState.pressure === "hard_limit") {
            const cheapest = cheapestOf(effectiveRoutes);
            return finish(cheapest, `Selected ${cheapest.adapter.metadata.model} due to severe budget pressure (hard_limit).`);
        }

        // 4. Reasoning/grounding failures or complex tasks escalate to high trust.
        const needsEscalation = ["verification_failure", "repo_grounding_failure"].includes(currentFailure) ||
            complexityScore > 0.7;
        if (needsEscalation) {
            const highTier = withTier(effectiveRoutes, "high");
            if (highTier.length > 0) {
                const escalated = cheapestOf(highTier);
                return finish(escalated, `Escalated to ${escalated.adapter.metadata.model} (high-trust) due to failure profile '${currentFailure ?? "complex_task"}'.`);
            }
        }

        // 5. Simple or trivially-classified errors prefer an economical route.
        // NOTE(review): "repo_grounding_failure" is also listed in step 4, so it
        // only reaches this branch when no high-trust route exists — confirm intent.
        const canEconomize = ["syntax_error", "repo_grounding_failure"].includes(currentFailure) ||
            complexityScore < 0.3;
        if (canEconomize) {
            const economical = effectiveRoutes.filter((route) => route.trustTier !== "high");
            if (economical.length > 0) {
                const thrifty = cheapestOf(economical);
                return finish(thrifty, `Selected ${thrifty.adapter.metadata.model} to preserve budget on low-complexity task recovery.`);
            }
        }

        // 6. Nominal case: medium-trust route with the lowest latency figure.
        const mediumTier = withTier(effectiveRoutes, "medium");
        if (mediumTier.length > 0) {
            const latencyOf = (route) => Math.abs(route.maxLatencyMs ?? 5000);
            let balanced = mediumTier[0];
            for (const candidate of mediumTier.slice(1)) {
                if (latencyOf(candidate) < latencyOf(balanced)) {
                    balanced = candidate;
                }
            }
            return finish(balanced, `Selected balanced route ${balanced.adapter.metadata.model} for nominal execution.`);
        }

        // 7. Last resort: effectiveRoutes is non-empty because of the guard at the top.
        const fallback = effectiveRoutes[0];
        return finish(fallback, `Fallback route ${fallback.adapter.metadata.model} selected as default.`);
    }
}
|
|
121
|
+
/**
 * Decides whether a trust profile refers to the model behind a route.
 *
 * Both model names are normalised first; they match when the keys are equal
 * or when either key contains the other as a substring.
 *
 * @param {object} profile - Trust profile; reads `profile.model`.
 * @param {object} route - Route; reads `route.adapter.metadata.model`.
 * @returns {boolean} True when the profile applies to the route.
 */
function profileMatchesRoute(profile, route) {
    const routeKey = normalizeModelKey(route.adapter.metadata.model);
    const profileKey = normalizeModelKey(profile.model);
    if (routeKey === profileKey) {
        return true;
    }
    if (routeKey.includes(profileKey)) {
        return true;
    }
    return profileKey.includes(routeKey);
}
|
|
128
|
+
/**
 * Canonicalises a model name for comparison: lower-cases it and collapses
 * every run of characters outside a-z / 0-9 into a single "-".
 *
 * @param {string} value - Model name.
 * @returns {string} Normalised key.
 */
function normalizeModelKey(value) {
    const lowered = value.toLowerCase();
    const nonAlphanumericRun = /[^a-z0-9]+/g;
    return lowered.replace(nonAlphanumericRun, "-");
}
|
|
131
|
+
//# sourceMappingURL=engine.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from "./engine.js";
|