@geotechcli/core 0.4.90 → 0.4.92
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/brain.d.ts +2 -0
- package/dist/agents/brain.d.ts.map +1 -1
- package/dist/agents/brain.js +92 -2
- package/dist/agents/brain.js.map +1 -1
- package/dist/agents/data-tools.js +1 -6
- package/dist/agents/data-tools.js.map +1 -1
- package/dist/agents/fem-tools.js +99 -3
- package/dist/agents/fem-tools.js.map +1 -1
- package/dist/agents/safety.d.ts.map +1 -1
- package/dist/agents/safety.js +35 -2
- package/dist/agents/safety.js.map +1 -1
- package/dist/agents/swarm-planner.js +2 -2
- package/dist/agents/swarm-planner.js.map +1 -1
- package/dist/agents/swarm.d.ts.map +1 -1
- package/dist/agents/swarm.js +3 -0
- package/dist/agents/swarm.js.map +1 -1
- package/dist/agents/tool-normalization.d.ts.map +1 -1
- package/dist/agents/tool-normalization.js +372 -0
- package/dist/agents/tool-normalization.js.map +1 -1
- package/dist/config/index.d.ts.map +1 -1
- package/dist/config/index.js +4 -4
- package/dist/config/index.js.map +1 -1
- package/dist/fem/ground-model-draft.d.ts +19 -0
- package/dist/fem/ground-model-draft.d.ts.map +1 -1
- package/dist/fem/ground-model-draft.js +273 -6
- package/dist/fem/ground-model-draft.js.map +1 -1
- package/dist/fem/index.d.ts +3 -1
- package/dist/fem/index.d.ts.map +1 -1
- package/dist/fem/index.js +3 -1
- package/dist/fem/index.js.map +1 -1
- package/dist/fem/production-readiness.d.ts +34 -0
- package/dist/fem/production-readiness.d.ts.map +1 -0
- package/dist/fem/production-readiness.js +174 -0
- package/dist/fem/production-readiness.js.map +1 -0
- package/dist/fem/routing.js +3 -3
- package/dist/fem/routing.js.map +1 -1
- package/dist/fem/scenario-validation.d.ts +53 -0
- package/dist/fem/scenario-validation.d.ts.map +1 -0
- package/dist/fem/scenario-validation.js +125 -0
- package/dist/fem/scenario-validation.js.map +1 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/dist/ingest/document-evidence-packet.d.ts +50 -50
- package/dist/ingest/geotech-benchmark-corpus.d.ts +124 -2
- package/dist/ingest/geotech-benchmark-corpus.d.ts.map +1 -1
- package/dist/ingest/geotech-benchmark-corpus.js +420 -55
- package/dist/ingest/geotech-benchmark-corpus.js.map +1 -1
- package/dist/ingest/geotech-document-benchmark.d.ts +4 -0
- package/dist/ingest/geotech-document-benchmark.d.ts.map +1 -1
- package/dist/ingest/geotech-document-benchmark.js +196 -41
- package/dist/ingest/geotech-document-benchmark.js.map +1 -1
- package/dist/ingest/index.d.ts +2 -1
- package/dist/ingest/index.d.ts.map +1 -1
- package/dist/ingest/index.js +2 -1
- package/dist/ingest/index.js.map +1 -1
- package/dist/ingest/preprocessing-fixture-benchmark.d.ts +175 -0
- package/dist/ingest/preprocessing-fixture-benchmark.d.ts.map +1 -0
- package/dist/ingest/preprocessing-fixture-benchmark.js +598 -0
- package/dist/ingest/preprocessing-fixture-benchmark.js.map +1 -0
- package/dist/llm/byok-benchmark.d.ts +61 -0
- package/dist/llm/byok-benchmark.d.ts.map +1 -1
- package/dist/llm/byok-benchmark.js +382 -6
- package/dist/llm/byok-benchmark.js.map +1 -1
- package/dist/llm/index.d.ts +1 -1
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +1 -1
- package/dist/llm/index.js.map +1 -1
- package/dist/meta/metadata.json +1 -1
- package/dist/signal/index.d.ts +112 -0
- package/dist/signal/index.d.ts.map +1 -1
- package/dist/signal/index.js +648 -1
- package/dist/signal/index.js.map +1 -1
- package/dist/standards/index.d.ts +6 -0
- package/dist/standards/index.d.ts.map +1 -1
- package/dist/standards/index.js +243 -0
- package/dist/standards/index.js.map +1 -1
- package/dist/verifier/findings.d.ts +6 -0
- package/dist/verifier/findings.d.ts.map +1 -1
- package/dist/verifier/findings.js +192 -1
- package/dist/verifier/findings.js.map +1 -1
- package/dist/verifier/index.d.ts +1 -1
- package/dist/verifier/index.d.ts.map +1 -1
- package/dist/verifier/index.js +1 -1
- package/dist/verifier/index.js.map +1 -1
- package/package.json +1 -1
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { collectFemDraftReadinessGuardrailFailures, } from './geotech-document-benchmark.js';
|
|
1
2
|
export function buildGeotechBenchmarkCorpusReport(inputs, options = {}) {
|
|
2
3
|
const fixtures = redactGeotechBenchmarkCorpusArtifact(normalizeFixtures(inputs.map((input) => input.fixture)));
|
|
3
4
|
const runs = inputs.map((input) => buildCorpusRun(input));
|
|
@@ -30,6 +31,95 @@ export function buildGeotechBenchmarkCorpusReport(inputs, options = {}) {
|
|
|
30
31
|
/**
 * Returns the redacted form of a corpus artifact.
 *
 * Delegates to `redactCorpusArtifactValue`, handing it a brand-new WeakMap on
 * every call (presumably the visited-object bookkeeping for the traversal —
 * confirm against that helper).
 *
 * @param {*} value - Artifact value to redact.
 * @returns {*} Whatever `redactCorpusArtifactValue` produces for `value`.
 */
export function redactGeotechBenchmarkCorpusArtifact(value) {
    const perCallState = new WeakMap();
    return redactCorpusArtifactValue(value, perCallState);
}
|
|
34
|
+
/**
 * Scans an artifact for path-safety leaks and summarizes the findings.
 *
 * @param {*} value - Artifact to scan; passed to `scanObjectForPathSafetyLeaks`
 *   with the root label `'artifact'`.
 * @returns {{ok: boolean, leakCount: number, leaks: Array}} `ok` is true only
 *   when no leak remains after de-duplication.
 */
export function inspectGeotechBenchmarkCorpusArtifactSafety(value) {
    const { leaks: rawLeaks } = scanObjectForPathSafetyLeaks(value, 'artifact');
    const leaks = deduplicateArtifactSafetyLeaks(rawLeaks);
    const leakCount = leaks.length;
    return {
        ok: leakCount === 0,
        leakCount,
        leaks,
    };
}
|
|
43
|
+
/**
 * Builds a trend report comparing the given corpus report with the most
 * recent earlier entry, and returns the updated history alongside it.
 *
 * The baseline ("previous") is the last element of `options.previousHistory`
 * when there is one; otherwise it is derived from `options.previousReport`
 * when supplied; otherwise it is `null` and no deltas are produced.
 *
 * @param {object} report - Current corpus report (needs `generatedAt`,
 *   `summary` and `runs`).
 * @param {object} [options]
 * @param {Array} [options.previousHistory] - Earlier history entries.
 * @param {object} [options.previousReport] - Fallback baseline report.
 * @param {string} [options.mode] - Defaults to `'local-corpus-benchmark'`.
 * @param {Array} [options.providerProfiles] - Defaults to the report summary's.
 * @param {Array} [options.preprocessingModes] - Defaults to the report summary's.
 * @param {number} [options.skippedFixtureCount] - Defaults to 0.
 * @returns {{history: Array, report: object}} `history` holds at most the 50
 *   newest entries, the current one last.
 */
export function buildGeotechBenchmarkCorpusTrend(report, options = {}) {
    const priorEntries = options.previousHistory ?? [];
    const current = buildGeotechBenchmarkCorpusHistoryEntry(report, {
        mode: options.mode ?? 'local-corpus-benchmark',
        providerProfiles: options.providerProfiles ?? report.summary.providerProfiles,
        preprocessingModes: options.preprocessingModes ?? report.summary.preprocessingModes,
        skippedFixtureCount: options.skippedFixtureCount ?? 0,
    });

    // Prefer the newest stored entry; fall back to a one-off previous report.
    let previous = priorEntries.at(-1);
    if (previous === undefined || previous === null) {
        if (options.previousReport) {
            const { previousReport } = options;
            previous = buildGeotechBenchmarkCorpusHistoryEntry(previousReport, {
                mode: 'previous-local-report',
                providerProfiles: previousReport.summary.providerProfiles,
                preprocessingModes: previousReport.summary.preprocessingModes,
                skippedFixtureCount: 0,
            });
        }
        else {
            previous = null;
        }
    }

    const history = [...priorEntries, current].slice(-50);
    const hasBaseline = Boolean(previous);
    const delta = hasBaseline ? buildGeotechBenchmarkCorpusTrendDelta(current, previous) : null;
    const runDeltas = hasBaseline ? buildGeotechBenchmarkCorpusRunDeltas(current.runs, previous.runs) : [];

    return {
        history,
        report: {
            kind: 'geotech-benchmark-corpus-trend',
            schemaVersion: 1,
            generatedAt: current.generatedAt,
            current,
            previous,
            delta,
            runDeltas,
            historyCount: history.length,
            note: 'Local corpus trend output stores benchmark summaries only. Raw benchmark JSON, fixture bytes, report text, model IDs, private paths, and provider tokens are intentionally excluded.',
        },
    };
}
|
|
76
|
+
/**
 * Validates a trend report against the schema-version-1 trend contract.
 *
 * The function collects problems instead of throwing: structural defects are
 * reported as failure codes, and a note that does not mention the excluded
 * inputs is reported as a warning. Malformed input (a missing report, a
 * missing `current` entry, or a non-array `runDeltas`) is reported as a
 * failure rather than surfacing as a TypeError.
 *
 * @param {object} report - Trend report, i.e. the `report` member returned by
 *   `buildGeotechBenchmarkCorpusTrend`.
 * @returns {{ok: boolean, failures: string[], warnings: string[]}} `ok` is
 *   true only when no failure was recorded; both lists are de-duplicated.
 */
export function validateGeotechBenchmarkCorpusTrendContract(report) {
    const failures = [];
    const warnings = [];
    if (!report || typeof report !== 'object') {
        // Nothing else can be inspected safely without a report object.
        return { ok: false, failures: ['trend_report_missing'], warnings };
    }
    if (report.kind !== 'geotech-benchmark-corpus-trend') {
        failures.push('wrong_trend_kind');
    }
    if (report.schemaVersion !== 1) {
        failures.push('wrong_trend_schema_version');
    }
    if (!report.generatedAt) {
        failures.push('trend_missing_generated_at');
    }
    if (!Number.isInteger(report.historyCount) || report.historyCount < 1) {
        failures.push('trend_history_count_invalid');
    }
    if (!/raw benchmark JSON|fixture bytes|model IDs|provider tokens/i.test(report.note ?? '')) {
        warnings.push('trend_note_should_state_excluded_raw_and_sensitive_inputs');
    }
    validateGeotechBenchmarkCorpusHistoryEntry(report.current, failures, 'current');
    if (report.previous !== null) {
        validateGeotechBenchmarkCorpusHistoryEntry(report.previous, failures, 'previous');
    }
    if (report.previous && report.delta == null) {
        failures.push('trend_delta_required_when_previous_exists');
    }
    if (!report.previous && report.delta != null) {
        failures.push('trend_delta_must_be_null_without_previous');
    }
    if (!Array.isArray(report.runDeltas)) {
        failures.push('trend_run_deltas_invalid');
    }
    else if (report.previous) {
        // A missing `current` entry was already reported above; compare
        // against an empty run list instead of dereferencing it.
        const currentRuns = Array.isArray(report.current?.runs) ? report.current.runs : [];
        if (report.runDeltas.length !== currentRuns.length) {
            failures.push('trend_run_delta_count_mismatch');
        }
    }
    else if (report.runDeltas.length !== 0) {
        failures.push('trend_run_deltas_must_be_empty_without_previous');
    }
    // Key-name scan: none of these raw-payload property names may appear
    // anywhere in the serialized trend.
    const serialized = JSON.stringify(report);
    if (/"(?:fixtures|benchmark|benchmarks|source|sourceEvidence|snippet|response|prompt|modelId|visionModelId|filePath|sourcePath|pages|rawText|pageText|ocrText|layoutText|modelCalls)"\s*:/.test(serialized)) {
        failures.push('trend_contains_raw_benchmark_source_prompt_response_or_model_payload');
    }
    for (const leak of inspectGeotechBenchmarkCorpusArtifactSafety(report).leaks) {
        failures.push(`trend_sensitive_value_leak_${sanitizeFailureToken(leak.location)}_${leak.kind}`);
    }
    return {
        ok: failures.length === 0,
        failures: [...new Set(failures)],
        warnings: [...new Set(warnings)],
    };
}
|
|
33
123
|
export function renderGeotechBenchmarkCorpusSvg(report) {
|
|
34
124
|
const width = 980;
|
|
35
125
|
const rowHeight = 34;
|
|
@@ -135,6 +225,310 @@ ${report.warnings.length ? `<h2>Warnings</h2><ul>${report.warnings.map((warning)
|
|
|
135
225
|
</html>
|
|
136
226
|
`;
|
|
137
227
|
}
|
|
228
|
+
/**
 * Renders a trend report as a standalone HTML page.
 *
 * The page shows six summary cards (each reads "new" when the trend has no
 * delta) followed by one table row per run delta, or a single placeholder
 * row when there are none.
 *
 * NOTE(review): the template lines are kept flush-left because that is how
 * they appear in the reviewed source; confirm the exact whitespace against
 * the shipped file if byte-for-byte output matters.
 *
 * @param {object} trend - Trend report (`note`, `generatedAt`, `historyCount`,
 *   `delta`, `runDeltas`).
 * @returns {string} Complete HTML document.
 */
export function renderGeotechBenchmarkCorpusTrendHtml(trend) {
    const delta = trend.delta;
    // Formats one summary delta, or "new" when there is no baseline.
    const summaryDelta = (format, key) => (delta ? format(delta[key]) : 'new');
    const metricCard = (label, value) => `<div class="metric"><span>${label}</span><strong>${value}</strong></div>`;
    const renderRunRow = (run) => {
        const status = run.passed ? 'pass' : 'fail';
        return `
<tr>
<td>${escapeHtml(run.fixtureId)}</td>
<td>${escapeHtml(run.providerProfile)}</td>
<td>${escapeHtml(run.preprocessingMode)}</td>
<td class="${status}">${status}</td>
<td>${formatDelta(run.traceabilityDelta, true)}</td>
<td>${formatDelta(run.qualityDelta, true)}</td>
<td>${formatDelta(run.reviewGateDelta, false)}</td>
<td>${formatDelta(run.hostedCallDelta, false)}</td>
<td>${formatDelta(run.latencyDeltaMs, false)}</td>
</tr>`;
    };
    const runRows = trend.runDeltas.map((run) => renderRunRow(run)).join('');
    return `<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>GeotechCLI Corpus Trend</title>
<style>
body{margin:0;font-family:Inter,Arial,sans-serif;background:#f8fafc;color:#0f172a}
main{max-width:1040px;margin:0 auto;padding:32px 20px 56px}
h1{margin:0 0 8px;font-size:28px}
.note{color:#475569;font-size:13px}
.summary{display:grid;grid-template-columns:repeat(auto-fit,minmax(180px,1fr));gap:12px;margin:22px 0}
.metric{background:white;border:1px solid #dbe5ea;border-radius:8px;padding:14px}.metric strong{display:block;font-size:24px}
table{width:100%;border-collapse:collapse;background:white;border:1px solid #dbe5ea;border-radius:8px;overflow:hidden;margin-top:16px}
th,td{padding:10px 12px;border-bottom:1px solid #e2e8f0;text-align:left;font-size:13px}
th{background:#0f172a;color:#f8fafc}.pass{color:#0f766e;font-weight:700}.fail{color:#b91c1c;font-weight:700}
</style>
</head>
<body>
<main>
<h1>GeotechCLI Corpus Trend</h1>
<p class="note">${escapeHtml(trend.note)} Generated ${escapeHtml(trend.generatedAt)}.</p>
<section class="summary">
${metricCard('History Entries', trend.historyCount)}
${metricCard('Run Delta', summaryDelta(signed, 'runCount'))}
${metricCard('Extraction Trust Delta', summaryDelta(signed, 'averageExtractionConfidence'))}
${metricCard('Corroboration Delta', summaryDelta(signed, 'averageCorroborationScore'))}
${metricCard('Traceability Delta', summaryDelta(signedPercent, 'averageTraceabilityRate'))}
${metricCard('Quality Delta', summaryDelta(signedPercent, 'averagePreprocessingQualityScore'))}
</section>
<h2>Run Deltas</h2>
<table><thead><tr><th>Fixture</th><th>Provider</th><th>Preprocessing</th><th>Status</th><th>Trace</th><th>Quality</th><th>Review gates</th><th>Hosted calls</th><th>Latency ms</th></tr></thead><tbody>${runRows || '<tr><td colspan="9">No previous local run is available yet.</td></tr>'}</tbody></table>
</main>
</body>
</html>
`;
}
|
|
279
|
+
/**
 * Condenses a corpus report into a history entry: a fixed set of summary
 * numbers plus one compact record per run.
 *
 * @param {object} report - Corpus report (`generatedAt`, `summary`, `runs`,
 *   optional `pathSafety`).
 * @param {object} context - Supplies `mode`, `skippedFixtureCount`,
 *   `providerProfiles` and `preprocessingModes` for the entry; the two lists
 *   are copied.
 * @returns {object} Entry of kind `'geotech-benchmark-corpus-history-entry'`,
 *   schema version 1.
 */
function buildGeotechBenchmarkCorpusHistoryEntry(report, context) {
    // Compact per-run record; numeric fields are passed through finiteNumber.
    const toHistoryRun = (run) => {
        const hasFiniteLatency = typeof run.latencyMs === 'number' && Number.isFinite(run.latencyMs);
        return {
            key: corpusRunKey(run),
            fixtureId: run.fixtureId,
            category: run.category,
            providerProfile: run.providerProfile,
            preprocessingMode: run.preprocessingMode,
            passed: run.passed,
            successfulPageRate: finiteNumber(run.successfulPageRate),
            cacheHitRate: finiteNumber(run.cacheHitRate),
            estimatedHostedCalls: finiteNumber(run.estimatedHostedCalls),
            directTraceabilityRate: finiteNumber(run.directTraceabilityRate),
            confidenceBreakdown: summarizeHistoryConfidenceBreakdown(run.confidenceBreakdown),
            groundModelReadinessScore: finiteNumber(run.groundModelReadinessScore),
            preprocessingQualityScore: finiteNumber(run.preprocessingQualityScore),
            preprocessingRegionQualityScore: finiteNumber(run.preprocessingRegionQualityScore),
            reviewGates: Array.isArray(run.reviewGates) ? [...run.reviewGates] : [],
            latencyMs: hasFiniteLatency ? run.latencyMs : null,
        };
    };

    return {
        kind: 'geotech-benchmark-corpus-history-entry',
        schemaVersion: 1,
        generatedAt: report.generatedAt,
        mode: context.mode,
        skippedFixtureCount: context.skippedFixtureCount,
        providerProfiles: [...context.providerProfiles],
        preprocessingModes: [...context.preprocessingModes],
        summary: {
            fixtureCount: finiteNumber(report.summary.fixtureCount),
            runCount: finiteNumber(report.summary.runCount),
            passedRuns: finiteNumber(report.summary.passedRuns),
            failedRuns: finiteNumber(report.summary.failedRuns),
            passed: Boolean(report.summary.passed),
            averageConfidence: finiteNumber(report.summary.averageConfidence),
            averageConfidenceBreakdown: summarizeHistoryConfidenceBreakdown(report.summary.averageConfidenceBreakdown),
            averageTraceabilityRate: finiteNumber(report.summary.averageTraceabilityRate),
            averageGroundModelReadinessScore: finiteNumber(report.summary.averageGroundModelReadinessScore),
            averagePreprocessingQualityScore: finiteNumber(report.summary.averagePreprocessingQualityScore),
            totalEstimatedHostedCalls: finiteNumber(report.summary.totalEstimatedHostedCalls),
            // Reuse the report's own leak count when present; otherwise rescan.
            pathLeakCount: report.pathSafety?.leakCount
                ?? inspectGeotechBenchmarkCorpusArtifactSafety(report).leakCount,
        },
        runs: report.runs.map((run) => toHistoryRun(run)),
    };
}
|
|
322
|
+
/**
 * Validates one history entry and appends a failure code to `failures` for
 * every problem found. Never throws on malformed entries: non-array profile
 * lists, non-object run records and non-string tokens are reported (or
 * coerced for use in a failure code) instead of being dereferenced.
 *
 * @param {object} entry - History entry to check.
 * @param {string[]} failures - Output list; mutated in place.
 * @param {string} prefix - Prepended to every failure code (the trend
 *   validator passes `'current'` or `'previous'`).
 * @returns {void}
 */
function validateGeotechBenchmarkCorpusHistoryEntry(entry, failures, prefix) {
    if (!entry || typeof entry !== 'object') {
        failures.push(`${prefix}_history_entry_missing`);
        return;
    }
    if (entry.kind !== 'geotech-benchmark-corpus-history-entry') {
        failures.push(`${prefix}_history_wrong_kind`);
    }
    if (entry.schemaVersion !== 1) {
        failures.push(`${prefix}_history_wrong_schema_version`);
    }
    if (!entry.generatedAt) {
        failures.push(`${prefix}_history_missing_generated_at`);
    }
    if (!entry.mode) {
        failures.push(`${prefix}_history_mode_missing`);
    }
    if (!Number.isInteger(entry.skippedFixtureCount) || entry.skippedFixtureCount < 0) {
        failures.push(`${prefix}_history_skipped_fixture_count_invalid`);
    }
    const hasProviderProfiles = Array.isArray(entry.providerProfiles);
    if (!hasProviderProfiles) {
        failures.push(`${prefix}_history_provider_profiles_invalid`);
    }
    const hasPreprocessingModes = Array.isArray(entry.preprocessingModes);
    if (!hasPreprocessingModes) {
        failures.push(`${prefix}_history_preprocessing_modes_invalid`);
    }
    const summary = entry.summary;
    const runs = Array.isArray(entry.runs) ? entry.runs : [];
    if (!summary || typeof summary !== 'object') {
        failures.push(`${prefix}_history_summary_missing`);
        return;
    }
    for (const key of [
        'fixtureCount',
        'runCount',
        'passedRuns',
        'failedRuns',
        'totalEstimatedHostedCalls',
        'pathLeakCount',
    ]) {
        if (!Number.isInteger(summary[key]) || summary[key] < 0) {
            failures.push(`${prefix}_history_${key}_invalid`);
        }
    }
    for (const key of [
        'averageConfidence',
        'averageTraceabilityRate',
        'averageGroundModelReadinessScore',
        'averagePreprocessingQualityScore',
    ]) {
        if (!Number.isFinite(summary[key])) {
            failures.push(`${prefix}_history_${key}_invalid`);
        }
    }
    validateHistoryConfidenceBreakdown(summary.averageConfidenceBreakdown, failures, `${prefix}_summary`);

    // Only object-shaped run records can be inspected; anything else counts
    // as a failed run here and is reported individually in the loop below.
    const runRecords = runs.filter((run) => run && typeof run === 'object');
    const passedRunCount = runRecords.filter((run) => run.passed).length;
    if (summary.runCount !== runs.length) {
        failures.push(`${prefix}_history_run_count_mismatch`);
    }
    if (summary.passedRuns !== passedRunCount) {
        failures.push(`${prefix}_history_passed_runs_mismatch`);
    }
    if (summary.failedRuns !== runs.length - passedRunCount) {
        failures.push(`${prefix}_history_failed_runs_mismatch`);
    }
    if (summary.runCount !== summary.passedRuns + summary.failedRuns) {
        failures.push(`${prefix}_history_summary_run_count_mismatch`);
    }
    if (summary.passed !== (summary.runCount > 0 && summary.failedRuns === 0 && summary.pathLeakCount === 0)) {
        failures.push(`${prefix}_history_passed_flag_mismatch`);
    }
    if (summary.pathLeakCount !== 0) {
        failures.push(`${prefix}_history_path_leaks_present`);
    }

    // Every profile/mode that a run uses must be declared on the entry. The
    // membership check is skipped when the declared list itself is invalid,
    // because that defect has already been reported above.
    if (hasProviderProfiles) {
        const observedProviders = new Set(runRecords.map((run) => run.providerProfile));
        for (const provider of observedProviders) {
            if (!entry.providerProfiles.includes(provider)) {
                failures.push(`${prefix}_history_provider_profile_missing_${sanitizeFailureToken(String(provider))}`);
            }
        }
    }
    if (hasPreprocessingModes) {
        const observedModes = new Set(runRecords.map((run) => run.preprocessingMode));
        for (const mode of observedModes) {
            if (!entry.preprocessingModes.includes(mode)) {
                failures.push(`${prefix}_history_preprocessing_mode_missing_${sanitizeFailureToken(String(mode))}`);
            }
        }
    }

    for (const [index, run] of runs.entries()) {
        if (!run || typeof run !== 'object') {
            failures.push(`${prefix}_run_${index}_invalid`);
            continue;
        }
        // Coerce before sanitizing so a non-string key cannot throw.
        const label = `${prefix}_run_${sanitizeFailureToken(String(run.key || index))}`;
        if (run.key !== corpusRunKey(run)) {
            failures.push(`${label}_key_mismatch`);
        }
        for (const key of ['fixtureId', 'category', 'providerProfile', 'preprocessingMode']) {
            if (typeof run[key] !== 'string' || !run[key].trim()) {
                failures.push(`${label}_${key}_missing`);
            }
        }
        // These fields must lie in the closed interval [0, 1].
        for (const key of [
            'successfulPageRate',
            'cacheHitRate',
            'directTraceabilityRate',
            'preprocessingQualityScore',
            'preprocessingRegionQualityScore',
        ]) {
            if (!Number.isFinite(run[key]) || run[key] < 0 || run[key] > 1) {
                failures.push(`${label}_${key}_invalid`);
            }
        }
        if (!Number.isInteger(run.estimatedHostedCalls) || run.estimatedHostedCalls < 0) {
            failures.push(`${label}_estimatedHostedCalls_invalid`);
        }
        if (!Number.isFinite(run.groundModelReadinessScore) || run.groundModelReadinessScore < 0) {
            failures.push(`${label}_groundModelReadinessScore_invalid`);
        }
        validateHistoryConfidenceBreakdown(run.confidenceBreakdown, failures, `${label}_confidence`);
        if (!Array.isArray(run.reviewGates)) {
            failures.push(`${label}_reviewGates_invalid`);
        }
        if (run.latencyMs != null && (!Number.isFinite(run.latencyMs) || run.latencyMs < 0)) {
            failures.push(`${label}_latencyMs_invalid`);
        }
    }
}
|
|
443
|
+
/**
 * Computes summary-level differences (current minus previous) between two
 * history entries.
 *
 * Count fields and `averageGroundModelReadinessScore` are plain subtractions;
 * the remaining averages are additionally passed through `roundRatio`.
 *
 * @param {object} current - Newer history entry.
 * @param {object} previous - Baseline history entry.
 * @returns {object} One delta per summary metric.
 */
function buildGeotechBenchmarkCorpusTrendDelta(current, previous) {
    const now = current.summary;
    const before = previous.summary;
    const rawChange = (key) => now[key] - before[key];
    const roundedChange = (key) => roundRatio(now[key] - before[key]);
    const breakdownChange = (key) => roundRatio(now.averageConfidenceBreakdown[key] - before.averageConfidenceBreakdown[key]);
    return {
        fixtureCount: rawChange('fixtureCount'),
        runCount: rawChange('runCount'),
        passedRuns: rawChange('passedRuns'),
        failedRuns: rawChange('failedRuns'),
        averageConfidence: roundedChange('averageConfidence'),
        averageExtractionConfidence: breakdownChange('extractionConfidence'),
        averageCorroborationScore: breakdownChange('corroborationScore'),
        averageTraceabilityRate: roundedChange('averageTraceabilityRate'),
        averageGroundModelReadinessScore: rawChange('averageGroundModelReadinessScore'),
        averagePreprocessingQualityScore: roundedChange('averagePreprocessingQualityScore'),
        totalEstimatedHostedCalls: rawChange('totalEstimatedHostedCalls'),
        pathLeakCount: rawChange('pathLeakCount'),
    };
}
|
|
462
|
+
/**
 * Produces one delta record per current run by matching it to the previous
 * run with the same `key`.
 *
 * A run with no match gets status `'new'` and `null` for every delta. A
 * matched run gets `'unchanged'` or `'changed'` depending on whether its
 * `passed` flag equals the previous one. The latency delta is also `null`
 * when either side has no latency.
 *
 * @param {Array<object>} currentRuns - Runs of the current history entry.
 * @param {Array<object>} previousRuns - Runs of the baseline history entry.
 * @returns {Array<object>} Delta records sorted by `key` (localeCompare).
 */
function buildGeotechBenchmarkCorpusRunDeltas(currentRuns, previousRuns) {
    const baselineByKey = new Map(previousRuns.map((run) => [run.key, run]));
    const describeRun = (current) => {
        const previous = baselineByKey.get(current.key);
        const identity = {
            key: current.key,
            fixtureId: current.fixtureId,
            providerProfile: current.providerProfile,
            preprocessingMode: current.preprocessingMode,
        };
        if (!previous) {
            return {
                ...identity,
                status: 'new',
                passed: current.passed,
                previousPassed: null,
                cacheHitRateDelta: null,
                hostedCallDelta: null,
                traceabilityDelta: null,
                extractionConfidenceDelta: null,
                corroborationScoreDelta: null,
                groundModelReadinessDelta: null,
                qualityDelta: null,
                reviewGateDelta: null,
                latencyDeltaMs: null,
            };
        }
        const bothHaveLatency = current.latencyMs != null && previous.latencyMs != null;
        return {
            ...identity,
            status: current.passed === previous.passed ? 'unchanged' : 'changed',
            passed: current.passed,
            previousPassed: previous.passed ?? null,
            cacheHitRateDelta: roundRatio(current.cacheHitRate - previous.cacheHitRate),
            hostedCallDelta: current.estimatedHostedCalls - previous.estimatedHostedCalls,
            traceabilityDelta: roundRatio(current.directTraceabilityRate - previous.directTraceabilityRate),
            extractionConfidenceDelta: roundRatio(current.confidenceBreakdown.extractionConfidence
                - previous.confidenceBreakdown.extractionConfidence),
            corroborationScoreDelta: roundRatio(current.confidenceBreakdown.corroborationScore
                - previous.confidenceBreakdown.corroborationScore),
            groundModelReadinessDelta: current.groundModelReadinessScore - previous.groundModelReadinessScore,
            qualityDelta: roundRatio(current.preprocessingQualityScore - previous.preprocessingQualityScore),
            reviewGateDelta: current.reviewGates.length - previous.reviewGates.length,
            latencyDeltaMs: bothHaveLatency ? current.latencyMs - previous.latencyMs : null,
        };
    };
    const deltas = currentRuns.map((run) => describeRun(run));
    deltas.sort((left, right) => left.key.localeCompare(right.key));
    return deltas;
}
|
|
499
|
+
/**
 * Copies the seven confidence-breakdown fields into a new object, passing
 * each through `finiteNumber`. A null or undefined breakdown is tolerated
 * (each field is then read as `undefined`).
 *
 * @param {object|null|undefined} value - Source breakdown.
 * @returns {object} Breakdown with exactly the seven known fields, in a
 *   fixed order.
 */
function summarizeHistoryConfidenceBreakdown(value) {
    const fields = [
        'overall',
        'extractionConfidence',
        'engineeringCompleteness',
        'traceabilityScore',
        'corroborationScore',
        'readinessScore',
        'pageEvidenceConfidence',
    ];
    return Object.fromEntries(fields.map((field) => [field, finiteNumber(value?.[field])]));
}
|
|
510
|
+
/**
 * Checks that a confidence breakdown is an object whose seven known fields
 * are finite, non-negative numbers.
 *
 * @param {object} value - Breakdown to check.
 * @param {string[]} failures - Output list; mutated in place.
 * @param {string} prefix - Prepended to every failure code.
 * @returns {void}
 */
function validateHistoryConfidenceBreakdown(value, failures, prefix) {
    const isObject = Boolean(value) && typeof value === 'object';
    if (!isObject) {
        failures.push(`${prefix}_confidence_breakdown_missing`);
        return;
    }
    const requiredFields = [
        'overall',
        'extractionConfidence',
        'engineeringCompleteness',
        'traceabilityScore',
        'corroborationScore',
        'readinessScore',
        'pageEvidenceConfidence',
    ];
    requiredFields.forEach((field) => {
        const score = value[field];
        const isValid = Number.isFinite(score) && !(score < 0);
        if (!isValid) {
            failures.push(`${prefix}_${field}_invalid`);
        }
    });
}
|
|
529
|
+
/**
 * Builds the identity key of a run: fixture id, provider profile and
 * preprocessing mode joined with `::`.
 *
 * @param {object} run - Run record.
 * @returns {string} Key of the form `fixtureId::providerProfile::preprocessingMode`.
 */
function corpusRunKey(run) {
    const { fixtureId, providerProfile, preprocessingMode } = run;
    return `${fixtureId}::${providerProfile}::${preprocessingMode}`;
}
|
|
138
532
|
function buildCorpusRun(input) {
|
|
139
533
|
const benchmark = input.benchmark;
|
|
140
534
|
const providerProfile = input.providerProfile
|
|
@@ -410,58 +804,7 @@ function validateFemExecutionBoundary(benchmark) {
|
|
|
410
804
|
if (!fem) {
|
|
411
805
|
return ['FEM draft readiness block missing from benchmark output'];
|
|
412
806
|
}
|
|
413
|
-
|
|
414
|
-
const agentWebglAllowedRoutes = fem.agentWebglAllowedRoutes ?? [];
|
|
415
|
-
const agentResultManifestAllowedRoutes = fem.agentResultManifestAllowedRoutes ?? [];
|
|
416
|
-
const caseOutputAvailableRoutes = fem.caseOutputAvailableRoutes ?? [];
|
|
417
|
-
const humanRunCommandAvailableRoutes = fem.humanRunCommandAvailableRoutes ?? [];
|
|
418
|
-
const staleRunCommandRoutes = fem.staleRunCommandRoutes ?? [];
|
|
419
|
-
const failures = [
|
|
420
|
-
fem.canAutoProceed
|
|
421
|
-
? 'FEM draft readiness became auto-proceedable'
|
|
422
|
-
: null,
|
|
423
|
-
agentRunAllowedRoutes.length > 0
|
|
424
|
-
? `FEM benchmark exposed agent-run routes: ${routeList(agentRunAllowedRoutes)}`
|
|
425
|
-
: null,
|
|
426
|
-
agentWebglAllowedRoutes.length > 0
|
|
427
|
-
? `FEM benchmark exposed agent WebGL routes: ${routeList(agentWebglAllowedRoutes)}`
|
|
428
|
-
: null,
|
|
429
|
-
agentResultManifestAllowedRoutes.length > 0
|
|
430
|
-
? `FEM benchmark exposed agent result-manifest routes: ${routeList(agentResultManifestAllowedRoutes)}`
|
|
431
|
-
: null,
|
|
432
|
-
caseOutputAvailableRoutes.length > 0
|
|
433
|
-
? `FEM benchmark exposed unreviewed case-output routes: ${routeList(caseOutputAvailableRoutes)}`
|
|
434
|
-
: null,
|
|
435
|
-
humanRunCommandAvailableRoutes.length > 0
|
|
436
|
-
? `FEM benchmark exposed unreviewed human-run routes: ${routeList(humanRunCommandAvailableRoutes)}`
|
|
437
|
-
: null,
|
|
438
|
-
staleRunCommandRoutes.length > 0
|
|
439
|
-
? `FEM benchmark recommended stale run commands: ${routeList(staleRunCommandRoutes)}`
|
|
440
|
-
: null,
|
|
441
|
-
];
|
|
442
|
-
for (const route of fem.routes ?? []) {
|
|
443
|
-
const boundary = route.executionBoundary;
|
|
444
|
-
if (!boundary) {
|
|
445
|
-
failures.push(`FEM route ${route.objective} has no execution boundary`);
|
|
446
|
-
continue;
|
|
447
|
-
}
|
|
448
|
-
failures.push(route.agentRunAllowed || boundary.agentRunAllowed
|
|
449
|
-
? `FEM route ${route.objective} exposed agent solver execution`
|
|
450
|
-
: null, boundary.agentWebglRenderAllowed
|
|
451
|
-
? `FEM route ${route.objective} exposed agent WebGL rendering`
|
|
452
|
-
: null, boundary.agentResultManifestAllowed
|
|
453
|
-
? `FEM route ${route.objective} exposed agent result-manifest creation`
|
|
454
|
-
: null, boundary.caseOutputAvailable
|
|
455
|
-
? `FEM route ${route.objective} exposed unreviewed case output`
|
|
456
|
-
: null, boundary.humanRunCommandAvailable
|
|
457
|
-
? `FEM route ${route.objective} exposed an unreviewed human run command`
|
|
458
|
-
: null, !boundary.humanReviewRequired
|
|
459
|
-
? `FEM route ${route.objective} no longer requires human review`
|
|
460
|
-
: null, /\bfem run\b/i.test(route.recommendedCommand ?? '')
|
|
461
|
-
? `FEM route ${route.objective} recommended a run command instead of a draft command`
|
|
462
|
-
: null);
|
|
463
|
-
}
|
|
464
|
-
return [...new Set(failures.filter((value) => value != null))];
|
|
807
|
+
return collectFemDraftReadinessGuardrailFailures(fem).map((failure) => failure.endsWith('.') ? failure.slice(0, -1) : failure);
|
|
465
808
|
}
|
|
466
809
|
function buildRunWarnings(fixture, run) {
|
|
467
810
|
return [
|
|
@@ -575,9 +918,25 @@ function deduplicatePathSafetyLeaks(leaks) {
|
|
|
575
918
|
}
|
|
576
919
|
return unique;
|
|
577
920
|
}
|
|
921
|
+
/**
 * Removes repeated leaks, where two leaks are the same when they share both
 * `kind` and `location`. The first occurrence wins and input order is kept.
 *
 * @param {Iterable<object>} leaks - Leak records with `kind` and `location`.
 * @returns {Array<object>} The unique leaks.
 */
function deduplicateArtifactSafetyLeaks(leaks) {
    // A Map iterates in insertion order, so first-seen order is preserved.
    const firstByIdentity = new Map();
    for (const leak of leaks) {
        const identity = `${leak.kind}:${leak.location}`;
        if (!firstByIdentity.has(identity)) {
            firstByIdentity.set(identity, leak);
        }
    }
    return [...firstByIdentity.values()];
}
|
|
578
934
|
/**
 * Replaces every character other than ASCII letters, digits, `_`, `$` and
 * `-` with an underscore, one for one.
 *
 * @param {string} value - Raw location key.
 * @returns {string} Sanitized key of the same length.
 */
function sanitizeLocationKey(value) {
    const disallowedCharacter = /[^a-zA-Z0-9_$-]/g;
    const sanitized = value.replace(disallowedCharacter, '_');
    return sanitized;
}
|
|
937
|
+
/**
 * Turns an arbitrary value into a token that is safe to embed in a failure
 * code: each run of characters other than ASCII letters, digits, `_` and `-`
 * collapses to a single underscore, and the result is capped at 72
 * characters.
 *
 * The input is coerced with `String()` first, so tokens built from
 * unvalidated data (numbers, `undefined`, ...) cannot make the caller throw.
 *
 * @param {*} value - Value to sanitize; strings behave exactly as before.
 * @returns {string} Sanitized token, at most 72 characters long.
 */
function sanitizeFailureToken(value) {
    return String(value).replace(/[^a-zA-Z0-9_-]+/g, '_').slice(0, 72);
}
|
|
581
940
|
function looksLikeAbsoluteLocalPath(value) {
|
|
582
941
|
if (!value || /^(?:https?|s3|gs|file):\/\//i.test(value)) {
|
|
583
942
|
return false;
|
|
@@ -697,9 +1056,6 @@ function successfulPageRate(benchmark) {
|
|
|
697
1056
|
const successfulPages = finiteNumber(benchmark.source?.successfulPages);
|
|
698
1057
|
return totalPages > 0 ? roundRatio(successfulPages / totalPages) : 0;
|
|
699
1058
|
}
|
|
700
|
-
function routeList(values) {
|
|
701
|
-
return values && values.length > 0 ? values.join(', ') : 'none';
|
|
702
|
-
}
|
|
703
1059
|
/**
 * Formats review gates for display.
 *
 * @param {Array} values - Review gate names.
 * @returns {string} The names joined with `', '`, or `'none'` when the list
 *   is empty.
 */
function formatReviewGates(values) {
    if (values.length > 0) {
        return values.join(', ');
    }
    return 'none';
}
|
|
@@ -745,6 +1101,15 @@ function signedPercent(value) {
|
|
|
745
1101
|
const sign = value > 0 ? '+' : '';
|
|
746
1102
|
return `${sign}${Math.round(value * 100)}%`;
|
|
747
1103
|
}
|
|
1104
|
+
/**
 * Formats a per-run delta for display.
 *
 * @param {number|null|undefined} value - Delta; null or undefined means there
 *   is nothing to compare against.
 * @param {boolean} asPercent - Selects `signedPercent` when truthy, `signed`
 *   otherwise.
 * @returns {string} `'new'` for a missing delta, else the formatted value.
 */
function formatDelta(value, asPercent) {
    if (value === null || value === undefined) {
        return 'new';
    }
    const format = asPercent ? signedPercent : signed;
    return format(value);
}
|
|
1110
|
+
/**
 * Renders a number with an explicit `+` for positive values. Zero and
 * negative values are rendered as-is.
 *
 * @param {number} value - Number to render.
 * @returns {string} Signed text, e.g. `'+3'`, `'0'`, `'-2'`.
 */
function signed(value) {
    const prefix = value > 0 ? '+' : '';
    return `${prefix}${value}`;
}
|
|
748
1113
|
function escapeHtml(value) {
|
|
749
1114
|
return value
|
|
750
1115
|
.replaceAll('&', '&')
|