switchboard-cli 0.1.0-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +122 -0
- package/bin/switchboard.mjs +49 -0
- package/calibration/engine/baseline.ts +93 -0
- package/calibration/engine/diagnosis.ts +191 -0
- package/calibration/engine/diff.ts +118 -0
- package/calibration/engine/escalation.ts +49 -0
- package/calibration/engine/ledger.ts +141 -0
- package/calibration/engine/trends.ts +141 -0
- package/calibration/external/rubric.yaml +32 -0
- package/calibration/external/scorer.ts +479 -0
- package/calibration/external/verdict-writer.ts +29 -0
- package/calibration/internal/harness.ts +697 -0
- package/calibration/internal/return-simulator.ts +270 -0
- package/calibration/internal/trace-collector.ts +78 -0
- package/calibration/internal/verdict-writer.ts +149 -0
- package/calibration/ledger/baselines/baseline-2026-04-09.yaml +23 -0
- package/calibration/ledger/history.yaml +18 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/diffs/adv-0bdc944b61d5.yaml +9 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/diffs/blind-16cdf0db1b43.yaml +9 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/diffs/blind-a6b2c8be67cc.yaml +9 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/manifest.yaml +8 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/seeds/adv-0bdc944b61d5.yaml +7 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/seeds/blind-16cdf0db1b43.yaml +8 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/seeds/blind-a6b2c8be67cc.yaml +8 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/summary.yaml +10 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/traces/adv-0bdc944b61d5.yaml +141 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/traces/blind-16cdf0db1b43.yaml +147 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/traces/blind-a6b2c8be67cc.yaml +147 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/verdicts-a/adv-0bdc944b61d5.yaml +24 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/verdicts-a/blind-16cdf0db1b43.yaml +24 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/verdicts-a/blind-a6b2c8be67cc.yaml +25 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/verdicts-b/adv-0bdc944b61d5.yaml +31 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/verdicts-b/blind-16cdf0db1b43.yaml +32 -0
- package/calibration/ledger/runs/2026-04-09T09-45-01-838Z/verdicts-b/blind-a6b2c8be67cc.yaml +32 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/diffs/adv-a0c9e2bfb0d6.yaml +9 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/diffs/blind-3e892f3a89ee.yaml +9 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/diffs/blind-958b2f9e6816.yaml +9 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/manifest.yaml +8 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/seeds/adv-a0c9e2bfb0d6.yaml +7 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/seeds/blind-3e892f3a89ee.yaml +8 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/seeds/blind-958b2f9e6816.yaml +8 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/summary.yaml +10 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/traces/adv-a0c9e2bfb0d6.yaml +141 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/traces/blind-3e892f3a89ee.yaml +147 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/traces/blind-958b2f9e6816.yaml +147 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/verdicts-a/adv-a0c9e2bfb0d6.yaml +24 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/verdicts-a/blind-3e892f3a89ee.yaml +23 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/verdicts-a/blind-958b2f9e6816.yaml +25 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/verdicts-b/adv-a0c9e2bfb0d6.yaml +31 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/verdicts-b/blind-3e892f3a89ee.yaml +32 -0
- package/calibration/ledger/runs/2026-04-09T10-02-57-143Z/verdicts-b/blind-958b2f9e6816.yaml +32 -0
- package/calibration/seeds/adversarial-generator.ts +159 -0
- package/calibration/seeds/blind-generator.ts +169 -0
- package/calibration/seeds/replay-loader.ts +117 -0
- package/calibration/skill/calibrate.ts +292 -0
- package/calibration/skill/cli-flags.ts +49 -0
- package/calibration/skill/report.ts +80 -0
- package/calibration/skill/review.ts +118 -0
- package/calibration/types.ts +292 -0
- package/package.json +46 -0
- package/src/commands/audit-codex.ts +266 -0
- package/src/commands/calibrate.ts +70 -0
- package/src/commands/compile.ts +117 -0
- package/src/commands/evaluate.ts +103 -0
- package/src/commands/ingest.ts +250 -0
- package/src/commands/init.ts +133 -0
- package/src/commands/packet.ts +408 -0
- package/src/commands/receipt.ts +305 -0
- package/src/commands/run-claude.ts +355 -0
- package/src/index.ts +43 -0
- package/src/lib/draft-return.ts +278 -0
- package/src/lib/drift-guard.ts +105 -0
- package/src/lib/errors.ts +61 -0
- package/src/lib/output.ts +43 -0
- package/src/lib/paths.ts +125 -0
- package/src/lib/proof.ts +262 -0
- package/src/lib/transport.ts +276 -0
- package/src/lib/yaml-io.ts +62 -0
- package/src/store/filesystem-store.ts +326 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import { mkdirSync, writeFileSync, readFileSync, existsSync } from "fs";
|
|
2
|
+
import { join } from "path";
|
|
3
|
+
import YAML from "yaml";
|
|
4
|
+
|
|
5
|
+
import type {
|
|
6
|
+
RunManifest,
|
|
7
|
+
CalibrationSeed,
|
|
8
|
+
PipelineTrace,
|
|
9
|
+
LayerAVerdict,
|
|
10
|
+
LayerBVerdict,
|
|
11
|
+
VerdictDiff,
|
|
12
|
+
DiagnosisCard,
|
|
13
|
+
EscalationItem,
|
|
14
|
+
RunSummary,
|
|
15
|
+
} from "../types";
|
|
16
|
+
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Append-only ledger operations
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
const RUNS_DIR = "runs";
|
|
22
|
+
const HISTORY_FILE = "history.yaml";
|
|
23
|
+
|
|
24
|
+
/**
 * Bootstraps the on-disk calibration ledger rooted at `basePath`.
 *
 * Creates the runs directory (including any missing parents) and, if no
 * history file is present yet, seeds it with an empty YAML list. A history
 * file that already exists is left untouched.
 */
export function initLedger(basePath: string): void {
  const runsPath = join(basePath, RUNS_DIR);
  mkdirSync(runsPath, { recursive: true });

  const historyFile = join(basePath, HISTORY_FILE);
  if (existsSync(historyFile)) {
    return;
  }

  const emptyHistory = YAML.stringify([]);
  writeFileSync(historyFile, emptyHistory, "utf-8");
}
|
|
35
|
+
|
|
36
|
+
/**
 * Registers a run on disk by serialising its manifest to
 * runs/<manifest.run_id>/manifest.yaml, creating the run directory on demand.
 */
export function appendRun(basePath: string, manifest: RunManifest): void {
  const targetDir = join(basePath, RUNS_DIR, manifest.run_id);
  mkdirSync(targetDir, { recursive: true });

  const manifestFile = join(targetDir, "manifest.yaml");
  const serialized = YAML.stringify(manifest);
  writeFileSync(manifestFile, serialized, "utf-8");
}
|
|
44
|
+
|
|
45
|
+
/**
 * Persists one calibration seed as runs/<runId>/seeds/<seed.seed_id>.yaml,
 * creating the seeds directory when it does not exist yet.
 */
export function writeSeed(basePath: string, runId: string, seed: CalibrationSeed): void {
  const targetDir = join(basePath, RUNS_DIR, runId, "seeds");
  mkdirSync(targetDir, { recursive: true });

  const fileName = `${seed.seed_id}.yaml`;
  const serialized = YAML.stringify(seed);
  writeFileSync(join(targetDir, fileName), serialized, "utf-8");
}
|
|
53
|
+
|
|
54
|
+
/**
 * Persists one pipeline trace as runs/<runId>/traces/<trace.seed_id>.yaml,
 * creating the traces directory when it does not exist yet.
 */
export function writeTrace(basePath: string, runId: string, trace: PipelineTrace): void {
  const targetDir = join(basePath, RUNS_DIR, runId, "traces");
  mkdirSync(targetDir, { recursive: true });

  const fileName = `${trace.seed_id}.yaml`;
  const serialized = YAML.stringify(trace);
  writeFileSync(join(targetDir, fileName), serialized, "utf-8");
}
|
|
62
|
+
|
|
63
|
+
/**
 * Persists one Layer A verdict as
 * runs/<runId>/verdicts-a/<verdict.seed_id>.yaml, creating the directory
 * when it does not exist yet.
 */
export function writeVerdictA(basePath: string, runId: string, verdict: LayerAVerdict): void {
  const targetDir = join(basePath, RUNS_DIR, runId, "verdicts-a");
  mkdirSync(targetDir, { recursive: true });

  const fileName = `${verdict.seed_id}.yaml`;
  const serialized = YAML.stringify(verdict);
  writeFileSync(join(targetDir, fileName), serialized, "utf-8");
}
|
|
71
|
+
|
|
72
|
+
/**
 * Persists one Layer B verdict as
 * runs/<runId>/verdicts-b/<verdict.seed_id>.yaml, creating the directory
 * when it does not exist yet.
 */
export function writeVerdictB(basePath: string, runId: string, verdict: LayerBVerdict): void {
  const targetDir = join(basePath, RUNS_DIR, runId, "verdicts-b");
  mkdirSync(targetDir, { recursive: true });

  const fileName = `${verdict.seed_id}.yaml`;
  const serialized = YAML.stringify(verdict);
  writeFileSync(join(targetDir, fileName), serialized, "utf-8");
}
|
|
80
|
+
|
|
81
|
+
/**
 * Persists one verdict diff as runs/<runId>/diffs/<diff.seed_id>.yaml,
 * creating the diffs directory when it does not exist yet.
 */
export function writeDiff(basePath: string, runId: string, diff: VerdictDiff): void {
  const targetDir = join(basePath, RUNS_DIR, runId, "diffs");
  mkdirSync(targetDir, { recursive: true });

  const fileName = `${diff.seed_id}.yaml`;
  const serialized = YAML.stringify(diff);
  writeFileSync(join(targetDir, fileName), serialized, "utf-8");
}
|
|
89
|
+
|
|
90
|
+
/**
 * Persists one diagnosis card as runs/<runId>/diagnoses/<card.id>.yaml,
 * creating the diagnoses directory when it does not exist yet.
 */
export function writeDiagnosis(basePath: string, runId: string, card: DiagnosisCard): void {
  const targetDir = join(basePath, RUNS_DIR, runId, "diagnoses");
  mkdirSync(targetDir, { recursive: true });

  const fileName = `${card.id}.yaml`;
  const serialized = YAML.stringify(card);
  writeFileSync(join(targetDir, fileName), serialized, "utf-8");
}
|
|
98
|
+
|
|
99
|
+
/**
 * Persists one escalation item as runs/<runId>/escalations/<item.id>.yaml,
 * creating the escalations directory when it does not exist yet.
 */
export function writeEscalation(basePath: string, runId: string, item: EscalationItem): void {
  const targetDir = join(basePath, RUNS_DIR, runId, "escalations");
  mkdirSync(targetDir, { recursive: true });

  const fileName = `${item.id}.yaml`;
  const serialized = YAML.stringify(item);
  writeFileSync(join(targetDir, fileName), serialized, "utf-8");
}
|
|
107
|
+
|
|
108
|
+
/**
 * Persists the summary of a run as runs/<runId>/summary.yaml, creating the
 * run directory when it does not exist yet.
 */
export function writeSummary(basePath: string, runId: string, summary: RunSummary): void {
  const targetDir = join(basePath, RUNS_DIR, runId);
  mkdirSync(targetDir, { recursive: true });

  const summaryFile = join(targetDir, "summary.yaml");
  const serialized = YAML.stringify(summary);
  writeFileSync(summaryFile, serialized, "utf-8");
}
|
|
116
|
+
|
|
117
|
+
/**
 * Loads the list of past run manifests from history.yaml under `basePath`.
 *
 * Returns an empty array when the history file is absent or when its parsed
 * content is not a YAML sequence; otherwise the parsed sequence is returned
 * as-is (entries are not validated).
 */
export function readHistory(basePath: string): RunManifest[] {
  const historyFile = join(basePath, HISTORY_FILE);

  if (existsSync(historyFile)) {
    const contents = YAML.parse(readFileSync(historyFile, "utf-8"));
    if (Array.isArray(contents)) {
      return contents as RunManifest[];
    }
  }

  return [];
}
|
|
132
|
+
|
|
133
|
+
/**
 * Adds `manifest` to the end of history.yaml.
 *
 * The current history is read via readHistory, the manifest is appended, and
 * the whole list is written back to the history file.
 */
export function appendHistory(basePath: string, manifest: RunManifest): void {
  const entries = readHistory(basePath);
  entries.push(manifest);

  const serialized = YAML.stringify(entries);
  writeFileSync(join(basePath, HISTORY_FILE), serialized, "utf-8");
}
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import { readFileSync, existsSync } from "fs";
|
|
2
|
+
import { join } from "path";
|
|
3
|
+
import YAML from "yaml";
|
|
4
|
+
|
|
5
|
+
import type { TrendEntry, RunSummary, RunManifest } from "../types";
|
|
6
|
+
import { readHistory } from "./ledger";
|
|
7
|
+
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
// Trend computation from ledger — reads history and run summaries
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
|
|
12
|
+
const RUNS_DIR = "runs";
|
|
13
|
+
|
|
14
|
+
/**
 * Builds one trend entry per run listed in history.yaml that has a readable
 * runs/<run_id>/summary.yaml, sorted by timestamp ascending.
 *
 * The headline rates (gate catch, false positive, false negative, layer
 * agreement) are copied straight from the run summary. The per-source pass
 * rates are NOT measured: they are heuristics derived from
 * `1 - false_negative_rate - false_positive_rate`, shifted by a fixed offset
 * per source (+0.10 for blind, -0.15 for adversarial, none for replay) and
 * reported as 0 for a source that contributed no seeds to the run. An exact
 * breakdown would require reading the individual diffs.
 *
 * Runs whose summary file is missing, empty, or not a YAML mapping are
 * skipped rather than aborting the whole computation.
 *
 * @param basePath Root directory of the calibration ledger.
 * @returns Trend entries ordered by timestamp (oldest first).
 */
export function computeTrends(basePath: string): TrendEntry[] {
  // Keeps an approximated rate inside the valid [0, 1] range. Without this a
  // run with false_negative_rate + false_positive_rate > 1 produced negative
  // "pass rates".
  const clampRate = (value: number): number => Math.min(Math.max(value, 0), 1);

  const history = readHistory(basePath);
  const trends: TrendEntry[] = [];

  for (const manifest of history) {
    const summaryPath = join(basePath, RUNS_DIR, manifest.run_id, "summary.yaml");
    if (!existsSync(summaryPath)) continue;

    const raw = readFileSync(summaryPath, "utf-8");
    const summary = YAML.parse(raw) as RunSummary | null;

    // An empty summary.yaml parses to null and a scalar document to a
    // primitive; neither carries the rate fields, so treat the run like one
    // without a summary instead of throwing on the property access below.
    if (summary === null || typeof summary !== "object") continue;

    const overallPassRate = 1 - summary.false_negative_rate - summary.false_positive_rate;

    const entry: TrendEntry = {
      run_id: manifest.run_id,
      timestamp: summary.completed_at || manifest.started_at,
      gate_catch_rate: summary.gate_catch_rate,
      false_positive_rate: summary.false_positive_rate,
      false_negative_rate: summary.false_negative_rate,
      layer_agreement_rate: summary.layer_agreement_rate,
      // Heuristic per-source pass rates: blind seeds are assumed to pass a
      // little more often than average, adversarial seeds a little less.
      pass_rate_blind: manifest.blind_count > 0
        ? clampRate(overallPassRate + 0.1)
        : 0,
      pass_rate_adversarial: manifest.adversarial_count > 0
        ? clampRate(overallPassRate - 0.15)
        : 0,
      pass_rate_replay: manifest.replay_count > 0
        ? clampRate(overallPassRate)
        : 0,
    };

    trends.push(entry);
  }

  // Sort by timestamp ascending
  trends.sort((a, b) => a.timestamp.localeCompare(b.timestamp));

  return trends;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Scans trend entries (expected oldest-first) for self-monitoring warnings.
 *
 * Only the most recent runs are inspected (at most the last three). Warnings
 * are emitted in this order when their pattern is present:
 * - "too-easy": at least 3 runs exist and each of the last 3 has a blind
 *   pass rate above 90%.
 * - "false-negative-trending-up": at least 2 runs exist and the false
 *   negative rate rose at every step across the inspected runs.
 * - "layer-agreement-divergence": at least 2 runs exist and the layer
 *   agreement rate fell at every step across the inspected runs.
 *
 * Returns an empty array when nothing is flagged or when `trends` is empty.
 */
export function checkSelfMonitoring(trends: TrendEntry[]): string[] {
  const warnings: string[] = [];
  if (trends.length === 0) return warnings;

  const asPercent = (rate: number): string => (rate * 100).toFixed(1);
  const tail = trends.slice(-3);

  // --- "too-easy": every one of the last 3 runs passes blind seeds > 90% ---
  if (trends.length >= 3 && tail.every((t) => t.pass_rate_blind > 0.9)) {
    let blindTotal = 0;
    for (const t of tail) {
      blindTotal += t.pass_rate_blind;
    }
    const avgBlindPass = blindTotal / 3;
    warnings.push(
      `too-easy: blind pass rate above 90% for last 3 runs (avg: ${asPercent(avgBlindPass)}%). Seeds may not be challenging enough.`,
    );
  }

  // The two remaining checks compare consecutive runs, so they need >= 2.
  if (trends.length < 2) return warnings;

  // True when `breaksTrend` rejects no consecutive (previous, current) pair.
  const holdsForEveryStep = (
    breaksTrend: (previous: TrendEntry, current: TrendEntry) => boolean,
  ): boolean => tail.every((current, i) => i === 0 || !breaksTrend(tail[i - 1], current));

  const oldest = tail[0];
  const newest = tail[tail.length - 1];

  // --- "false-negative-trending-up": a flat or falling step breaks it ---
  const fnRising = holdsForEveryStep(
    (previous, current) => current.false_negative_rate <= previous.false_negative_rate,
  );
  if (fnRising) {
    warnings.push(
      `false-negative-trending-up: false negative rate increased from ${asPercent(oldest.false_negative_rate)}% to ${asPercent(newest.false_negative_rate)}% over last ${tail.length} runs. Machinery may be getting more lenient.`,
    );
  }

  // --- "layer-agreement-divergence": a flat or rising step breaks it ---
  const agreementFalling = holdsForEveryStep(
    (previous, current) => current.layer_agreement_rate >= previous.layer_agreement_rate,
  );
  if (agreementFalling) {
    warnings.push(
      `layer-agreement-divergence: layer agreement rate dropped from ${asPercent(oldest.layer_agreement_rate)}% to ${asPercent(newest.layer_agreement_rate)}% over last ${tail.length} runs. Layers A and B are diverging.`,
    );
  }

  return warnings;
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
version: "1.0.0"
|
|
2
|
+
scoring_stance: "You are not trying to validate this system. You are trying to find where it lied, overclaimed, or missed something. A passing score should be hard to earn."
|
|
3
|
+
pass_threshold: 0.7
|
|
4
|
+
dimensions:
|
|
5
|
+
intent_fidelity:
|
|
6
|
+
weight: 0.25
|
|
7
|
+
question: "Does the output honor what the seed asked for? Not more, not less."
|
|
8
|
+
scoring_guide: "1.0 = perfect match. 0.5 = reasonable interpretation with minor additions/omissions. 0.0 = output doesn't match the seed."
|
|
9
|
+
confidence_honesty:
|
|
10
|
+
weight: 0.25
|
|
11
|
+
question: "Does the trust posture match the actual evidence visible in the output?"
|
|
12
|
+
scoring_guide: "1.0 = posture exactly matches evidence strength. 0.5 = slight over/underclaim. 0.0 = posture contradicts visible evidence."
|
|
13
|
+
scope_discipline:
|
|
14
|
+
weight: 0.20
|
|
15
|
+
question: "Did anything appear in the output that wasn't in the seed? Was anything silently dropped?"
|
|
16
|
+
scoring_guide: "1.0 = exact scope match. 0.5 = minor additions justified by implementation needs. 0.0 = significant scope drift."
|
|
17
|
+
execution_readiness:
|
|
18
|
+
weight: 0.15
|
|
19
|
+
question: "Could someone actually build from this output, or is it decorative?"
|
|
20
|
+
scoring_guide: "1.0 = fully actionable. 0.5 = some ambiguity but usable. 0.0 = decorative/vague output."
|
|
21
|
+
self_awareness:
|
|
22
|
+
weight: 0.15
|
|
23
|
+
question: "Does the system accurately describe its own limitations and uncertainties?"
|
|
24
|
+
scoring_guide: "1.0 = all limitations stated. 0.5 = some gaps acknowledged. 0.0 = papers over gaps or claims certainty without basis."
|
|
25
|
+
honesty_banned_terms:
|
|
26
|
+
- "no issues"
|
|
27
|
+
- "safe to deploy"
|
|
28
|
+
- "certified"
|
|
29
|
+
- "approved"
|
|
30
|
+
- "audited"
|
|
31
|
+
- "compliant"
|
|
32
|
+
- "guaranteed"
|