@geotechcli/core 0.4.89 → 0.4.91

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/dist/config/index.d.ts.map +1 -1
  2. package/dist/config/index.js +4 -4
  3. package/dist/config/index.js.map +1 -1
  4. package/dist/fem/ground-model-draft.d.ts +7 -0
  5. package/dist/fem/ground-model-draft.d.ts.map +1 -1
  6. package/dist/fem/ground-model-draft.js +213 -6
  7. package/dist/fem/ground-model-draft.js.map +1 -1
  8. package/dist/fem/index.d.ts +1 -1
  9. package/dist/fem/index.d.ts.map +1 -1
  10. package/dist/fem/index.js +1 -1
  11. package/dist/fem/index.js.map +1 -1
  12. package/dist/index.d.ts +1 -1
  13. package/dist/index.d.ts.map +1 -1
  14. package/dist/index.js +1 -1
  15. package/dist/index.js.map +1 -1
  16. package/dist/ingest/document-evidence-packet.d.ts +92 -92
  17. package/dist/ingest/geotech-benchmark-corpus.d.ts +124 -2
  18. package/dist/ingest/geotech-benchmark-corpus.d.ts.map +1 -1
  19. package/dist/ingest/geotech-benchmark-corpus.js +420 -55
  20. package/dist/ingest/geotech-benchmark-corpus.js.map +1 -1
  21. package/dist/ingest/geotech-document-benchmark.d.ts +4 -0
  22. package/dist/ingest/geotech-document-benchmark.d.ts.map +1 -1
  23. package/dist/ingest/geotech-document-benchmark.js +196 -41
  24. package/dist/ingest/geotech-document-benchmark.js.map +1 -1
  25. package/dist/ingest/index.d.ts +2 -1
  26. package/dist/ingest/index.d.ts.map +1 -1
  27. package/dist/ingest/index.js +2 -1
  28. package/dist/ingest/index.js.map +1 -1
  29. package/dist/ingest/preprocessing-fixture-benchmark.d.ts +175 -0
  30. package/dist/ingest/preprocessing-fixture-benchmark.d.ts.map +1 -0
  31. package/dist/ingest/preprocessing-fixture-benchmark.js +598 -0
  32. package/dist/ingest/preprocessing-fixture-benchmark.js.map +1 -0
  33. package/dist/llm/byok-benchmark.d.ts +125 -0
  34. package/dist/llm/byok-benchmark.d.ts.map +1 -0
  35. package/dist/llm/byok-benchmark.js +529 -0
  36. package/dist/llm/byok-benchmark.js.map +1 -0
  37. package/dist/llm/index.d.ts +1 -0
  38. package/dist/llm/index.d.ts.map +1 -1
  39. package/dist/llm/index.js +1 -0
  40. package/dist/llm/index.js.map +1 -1
  41. package/dist/meta/metadata.json +1 -1
  42. package/dist/signal/index.d.ts +112 -0
  43. package/dist/signal/index.d.ts.map +1 -1
  44. package/dist/signal/index.js +648 -1
  45. package/dist/signal/index.js.map +1 -1
  46. package/dist/standards/index.d.ts +6 -0
  47. package/dist/standards/index.d.ts.map +1 -1
  48. package/dist/standards/index.js +243 -0
  49. package/dist/standards/index.js.map +1 -1
  50. package/dist/verifier/findings.d.ts +6 -0
  51. package/dist/verifier/findings.d.ts.map +1 -1
  52. package/dist/verifier/findings.js +192 -1
  53. package/dist/verifier/findings.js.map +1 -1
  54. package/dist/verifier/index.d.ts +1 -1
  55. package/dist/verifier/index.d.ts.map +1 -1
  56. package/dist/verifier/index.js +1 -1
  57. package/dist/verifier/index.js.map +1 -1
  58. package/package.json +1 -1
@@ -1,3 +1,4 @@
1
+ import { collectFemDraftReadinessGuardrailFailures, } from './geotech-document-benchmark.js';
1
2
  export function buildGeotechBenchmarkCorpusReport(inputs, options = {}) {
2
3
  const fixtures = redactGeotechBenchmarkCorpusArtifact(normalizeFixtures(inputs.map((input) => input.fixture)));
3
4
  const runs = inputs.map((input) => buildCorpusRun(input));
@@ -30,6 +31,95 @@ export function buildGeotechBenchmarkCorpusReport(inputs, options = {}) {
30
31
  export function redactGeotechBenchmarkCorpusArtifact(value) {
31
32
  return redactCorpusArtifactValue(value, new WeakMap());
32
33
  }
34
+ export function inspectGeotechBenchmarkCorpusArtifactSafety(value) {
35
+ const scan = scanObjectForPathSafetyLeaks(value, 'artifact');
36
+ const leaks = deduplicateArtifactSafetyLeaks(scan.leaks);
37
+ return {
38
+ ok: leaks.length === 0,
39
+ leakCount: leaks.length,
40
+ leaks,
41
+ };
42
+ }
43
+ export function buildGeotechBenchmarkCorpusTrend(report, options = {}) {
44
+ const previousHistory = options.previousHistory ?? [];
45
+ const current = buildGeotechBenchmarkCorpusHistoryEntry(report, {
46
+ mode: options.mode ?? 'local-corpus-benchmark',
47
+ providerProfiles: options.providerProfiles ?? report.summary.providerProfiles,
48
+ preprocessingModes: options.preprocessingModes ?? report.summary.preprocessingModes,
49
+ skippedFixtureCount: options.skippedFixtureCount ?? 0,
50
+ });
51
+ const previous = previousHistory.at(-1)
52
+ ?? (options.previousReport
53
+ ? buildGeotechBenchmarkCorpusHistoryEntry(options.previousReport, {
54
+ mode: 'previous-local-report',
55
+ providerProfiles: options.previousReport.summary.providerProfiles,
56
+ preprocessingModes: options.previousReport.summary.preprocessingModes,
57
+ skippedFixtureCount: 0,
58
+ })
59
+ : null);
60
+ const history = [...previousHistory, current].slice(-50);
61
+ return {
62
+ history,
63
+ report: {
64
+ kind: 'geotech-benchmark-corpus-trend',
65
+ schemaVersion: 1,
66
+ generatedAt: current.generatedAt,
67
+ current,
68
+ previous,
69
+ delta: previous ? buildGeotechBenchmarkCorpusTrendDelta(current, previous) : null,
70
+ runDeltas: previous ? buildGeotechBenchmarkCorpusRunDeltas(current.runs, previous.runs) : [],
71
+ historyCount: history.length,
72
+ note: 'Local corpus trend output stores benchmark summaries only. Raw benchmark JSON, fixture bytes, report text, model IDs, private paths, and provider tokens are intentionally excluded.',
73
+ },
74
+ };
75
+ }
76
+ export function validateGeotechBenchmarkCorpusTrendContract(report) {
77
+ const failures = [];
78
+ const warnings = [];
79
+ if (report.kind !== 'geotech-benchmark-corpus-trend') {
80
+ failures.push('wrong_trend_kind');
81
+ }
82
+ if (report.schemaVersion !== 1) {
83
+ failures.push('wrong_trend_schema_version');
84
+ }
85
+ if (!report.generatedAt) {
86
+ failures.push('trend_missing_generated_at');
87
+ }
88
+ if (!Number.isInteger(report.historyCount) || report.historyCount < 1) {
89
+ failures.push('trend_history_count_invalid');
90
+ }
91
+ if (!/raw benchmark JSON|fixture bytes|model IDs|provider tokens/i.test(report.note ?? '')) {
92
+ warnings.push('trend_note_should_state_excluded_raw_and_sensitive_inputs');
93
+ }
94
+ validateGeotechBenchmarkCorpusHistoryEntry(report.current, failures, 'current');
95
+ if (report.previous !== null) {
96
+ validateGeotechBenchmarkCorpusHistoryEntry(report.previous, failures, 'previous');
97
+ }
98
+ if (report.previous && report.delta == null) {
99
+ failures.push('trend_delta_required_when_previous_exists');
100
+ }
101
+ if (!report.previous && report.delta != null) {
102
+ failures.push('trend_delta_must_be_null_without_previous');
103
+ }
104
+ if (report.previous && report.runDeltas.length !== report.current.runs.length) {
105
+ failures.push('trend_run_delta_count_mismatch');
106
+ }
107
+ if (!report.previous && report.runDeltas.length !== 0) {
108
+ failures.push('trend_run_deltas_must_be_empty_without_previous');
109
+ }
110
+ const serialized = JSON.stringify(report);
111
+ if (/"(?:fixtures|benchmark|benchmarks|source|sourceEvidence|snippet|response|prompt|modelId|visionModelId|filePath|sourcePath|pages|rawText|pageText|ocrText|layoutText|modelCalls)"\s*:/.test(serialized)) {
112
+ failures.push('trend_contains_raw_benchmark_source_prompt_response_or_model_payload');
113
+ }
114
+ for (const leak of inspectGeotechBenchmarkCorpusArtifactSafety(report).leaks) {
115
+ failures.push(`trend_sensitive_value_leak_${sanitizeFailureToken(leak.location)}_${leak.kind}`);
116
+ }
117
+ return {
118
+ ok: failures.length === 0,
119
+ failures: [...new Set(failures)],
120
+ warnings: [...new Set(warnings)],
121
+ };
122
+ }
33
123
  export function renderGeotechBenchmarkCorpusSvg(report) {
34
124
  const width = 980;
35
125
  const rowHeight = 34;
@@ -135,6 +225,310 @@ ${report.warnings.length ? `<h2>Warnings</h2><ul>${report.warnings.map((warning)
135
225
  </html>
136
226
  `;
137
227
  }
228
+ export function renderGeotechBenchmarkCorpusTrendHtml(trend) {
229
+ const delta = trend.delta;
230
+ const runRows = trend.runDeltas.map((run) => `
231
+ <tr>
232
+ <td>${escapeHtml(run.fixtureId)}</td>
233
+ <td>${escapeHtml(run.providerProfile)}</td>
234
+ <td>${escapeHtml(run.preprocessingMode)}</td>
235
+ <td class="${run.passed ? 'pass' : 'fail'}">${run.passed ? 'pass' : 'fail'}</td>
236
+ <td>${formatDelta(run.traceabilityDelta, true)}</td>
237
+ <td>${formatDelta(run.qualityDelta, true)}</td>
238
+ <td>${formatDelta(run.reviewGateDelta, false)}</td>
239
+ <td>${formatDelta(run.hostedCallDelta, false)}</td>
240
+ <td>${formatDelta(run.latencyDeltaMs, false)}</td>
241
+ </tr>`).join('');
242
+ return `<!doctype html>
243
+ <html lang="en">
244
+ <head>
245
+ <meta charset="utf-8">
246
+ <meta name="viewport" content="width=device-width, initial-scale=1">
247
+ <title>GeotechCLI Corpus Trend</title>
248
+ <style>
249
+ body{margin:0;font-family:Inter,Arial,sans-serif;background:#f8fafc;color:#0f172a}
250
+ main{max-width:1040px;margin:0 auto;padding:32px 20px 56px}
251
+ h1{margin:0 0 8px;font-size:28px}
252
+ .note{color:#475569;font-size:13px}
253
+ .summary{display:grid;grid-template-columns:repeat(auto-fit,minmax(180px,1fr));gap:12px;margin:22px 0}
254
+ .metric{background:white;border:1px solid #dbe5ea;border-radius:8px;padding:14px}.metric strong{display:block;font-size:24px}
255
+ table{width:100%;border-collapse:collapse;background:white;border:1px solid #dbe5ea;border-radius:8px;overflow:hidden;margin-top:16px}
256
+ th,td{padding:10px 12px;border-bottom:1px solid #e2e8f0;text-align:left;font-size:13px}
257
+ th{background:#0f172a;color:#f8fafc}.pass{color:#0f766e;font-weight:700}.fail{color:#b91c1c;font-weight:700}
258
+ </style>
259
+ </head>
260
+ <body>
261
+ <main>
262
+ <h1>GeotechCLI Corpus Trend</h1>
263
+ <p class="note">${escapeHtml(trend.note)} Generated ${escapeHtml(trend.generatedAt)}.</p>
264
+ <section class="summary">
265
+ <div class="metric"><span>History Entries</span><strong>${trend.historyCount}</strong></div>
266
+ <div class="metric"><span>Run Delta</span><strong>${delta ? signed(delta.runCount) : 'new'}</strong></div>
267
+ <div class="metric"><span>Extraction Trust Delta</span><strong>${delta ? signed(delta.averageExtractionConfidence) : 'new'}</strong></div>
268
+ <div class="metric"><span>Corroboration Delta</span><strong>${delta ? signed(delta.averageCorroborationScore) : 'new'}</strong></div>
269
+ <div class="metric"><span>Traceability Delta</span><strong>${delta ? signedPercent(delta.averageTraceabilityRate) : 'new'}</strong></div>
270
+ <div class="metric"><span>Quality Delta</span><strong>${delta ? signedPercent(delta.averagePreprocessingQualityScore) : 'new'}</strong></div>
271
+ </section>
272
+ <h2>Run Deltas</h2>
273
+ <table><thead><tr><th>Fixture</th><th>Provider</th><th>Preprocessing</th><th>Status</th><th>Trace</th><th>Quality</th><th>Review gates</th><th>Hosted calls</th><th>Latency ms</th></tr></thead><tbody>${runRows || '<tr><td colspan="9">No previous local run is available yet.</td></tr>'}</tbody></table>
274
+ </main>
275
+ </body>
276
+ </html>
277
+ `;
278
+ }
279
+ function buildGeotechBenchmarkCorpusHistoryEntry(report, context) {
280
+ return {
281
+ kind: 'geotech-benchmark-corpus-history-entry',
282
+ schemaVersion: 1,
283
+ generatedAt: report.generatedAt,
284
+ mode: context.mode,
285
+ skippedFixtureCount: context.skippedFixtureCount,
286
+ providerProfiles: [...context.providerProfiles],
287
+ preprocessingModes: [...context.preprocessingModes],
288
+ summary: {
289
+ fixtureCount: finiteNumber(report.summary.fixtureCount),
290
+ runCount: finiteNumber(report.summary.runCount),
291
+ passedRuns: finiteNumber(report.summary.passedRuns),
292
+ failedRuns: finiteNumber(report.summary.failedRuns),
293
+ passed: Boolean(report.summary.passed),
294
+ averageConfidence: finiteNumber(report.summary.averageConfidence),
295
+ averageConfidenceBreakdown: summarizeHistoryConfidenceBreakdown(report.summary.averageConfidenceBreakdown),
296
+ averageTraceabilityRate: finiteNumber(report.summary.averageTraceabilityRate),
297
+ averageGroundModelReadinessScore: finiteNumber(report.summary.averageGroundModelReadinessScore),
298
+ averagePreprocessingQualityScore: finiteNumber(report.summary.averagePreprocessingQualityScore),
299
+ totalEstimatedHostedCalls: finiteNumber(report.summary.totalEstimatedHostedCalls),
300
+ pathLeakCount: report.pathSafety?.leakCount ?? inspectGeotechBenchmarkCorpusArtifactSafety(report).leakCount,
301
+ },
302
+ runs: report.runs.map((run) => ({
303
+ key: corpusRunKey(run),
304
+ fixtureId: run.fixtureId,
305
+ category: run.category,
306
+ providerProfile: run.providerProfile,
307
+ preprocessingMode: run.preprocessingMode,
308
+ passed: run.passed,
309
+ successfulPageRate: finiteNumber(run.successfulPageRate),
310
+ cacheHitRate: finiteNumber(run.cacheHitRate),
311
+ estimatedHostedCalls: finiteNumber(run.estimatedHostedCalls),
312
+ directTraceabilityRate: finiteNumber(run.directTraceabilityRate),
313
+ confidenceBreakdown: summarizeHistoryConfidenceBreakdown(run.confidenceBreakdown),
314
+ groundModelReadinessScore: finiteNumber(run.groundModelReadinessScore),
315
+ preprocessingQualityScore: finiteNumber(run.preprocessingQualityScore),
316
+ preprocessingRegionQualityScore: finiteNumber(run.preprocessingRegionQualityScore),
317
+ reviewGates: Array.isArray(run.reviewGates) ? [...run.reviewGates] : [],
318
+ latencyMs: typeof run.latencyMs === 'number' && Number.isFinite(run.latencyMs) ? run.latencyMs : null,
319
+ })),
320
+ };
321
+ }
322
+ function validateGeotechBenchmarkCorpusHistoryEntry(entry, failures, prefix) {
323
+ if (!entry || typeof entry !== 'object') {
324
+ failures.push(`${prefix}_history_entry_missing`);
325
+ return;
326
+ }
327
+ if (entry.kind !== 'geotech-benchmark-corpus-history-entry') {
328
+ failures.push(`${prefix}_history_wrong_kind`);
329
+ }
330
+ if (entry.schemaVersion !== 1) {
331
+ failures.push(`${prefix}_history_wrong_schema_version`);
332
+ }
333
+ if (!entry.generatedAt) {
334
+ failures.push(`${prefix}_history_missing_generated_at`);
335
+ }
336
+ if (!entry.mode) {
337
+ failures.push(`${prefix}_history_mode_missing`);
338
+ }
339
+ if (!Number.isInteger(entry.skippedFixtureCount) || entry.skippedFixtureCount < 0) {
340
+ failures.push(`${prefix}_history_skipped_fixture_count_invalid`);
341
+ }
342
+ if (!Array.isArray(entry.providerProfiles)) {
343
+ failures.push(`${prefix}_history_provider_profiles_invalid`);
344
+ }
345
+ if (!Array.isArray(entry.preprocessingModes)) {
346
+ failures.push(`${prefix}_history_preprocessing_modes_invalid`);
347
+ }
348
+ const summary = entry.summary;
349
+ const runs = Array.isArray(entry.runs) ? entry.runs : [];
350
+ if (!summary || typeof summary !== 'object') {
351
+ failures.push(`${prefix}_history_summary_missing`);
352
+ return;
353
+ }
354
+ for (const key of [
355
+ 'fixtureCount',
356
+ 'runCount',
357
+ 'passedRuns',
358
+ 'failedRuns',
359
+ 'totalEstimatedHostedCalls',
360
+ 'pathLeakCount',
361
+ ]) {
362
+ if (!Number.isInteger(summary[key]) || summary[key] < 0) {
363
+ failures.push(`${prefix}_history_${key}_invalid`);
364
+ }
365
+ }
366
+ for (const key of [
367
+ 'averageConfidence',
368
+ 'averageTraceabilityRate',
369
+ 'averageGroundModelReadinessScore',
370
+ 'averagePreprocessingQualityScore',
371
+ ]) {
372
+ if (!Number.isFinite(summary[key])) {
373
+ failures.push(`${prefix}_history_${key}_invalid`);
374
+ }
375
+ }
376
+ validateHistoryConfidenceBreakdown(summary.averageConfidenceBreakdown, failures, `${prefix}_summary`);
377
+ if (summary.runCount !== runs.length) {
378
+ failures.push(`${prefix}_history_run_count_mismatch`);
379
+ }
380
+ if (summary.passedRuns !== runs.filter((run) => run.passed).length) {
381
+ failures.push(`${prefix}_history_passed_runs_mismatch`);
382
+ }
383
+ if (summary.failedRuns !== runs.filter((run) => !run.passed).length) {
384
+ failures.push(`${prefix}_history_failed_runs_mismatch`);
385
+ }
386
+ if (summary.runCount !== summary.passedRuns + summary.failedRuns) {
387
+ failures.push(`${prefix}_history_summary_run_count_mismatch`);
388
+ }
389
+ if (summary.passed !== (summary.runCount > 0 && summary.failedRuns === 0 && summary.pathLeakCount === 0)) {
390
+ failures.push(`${prefix}_history_passed_flag_mismatch`);
391
+ }
392
+ if (summary.pathLeakCount !== 0) {
393
+ failures.push(`${prefix}_history_path_leaks_present`);
394
+ }
395
+ const observedProviders = new Set(runs.map((run) => run.providerProfile));
396
+ const observedModes = new Set(runs.map((run) => run.preprocessingMode));
397
+ for (const provider of observedProviders) {
398
+ if (!entry.providerProfiles.includes(provider)) {
399
+ failures.push(`${prefix}_history_provider_profile_missing_${sanitizeFailureToken(provider)}`);
400
+ }
401
+ }
402
+ for (const mode of observedModes) {
403
+ if (!entry.preprocessingModes.includes(mode)) {
404
+ failures.push(`${prefix}_history_preprocessing_mode_missing_${sanitizeFailureToken(mode)}`);
405
+ }
406
+ }
407
+ for (const [index, run] of runs.entries()) {
408
+ const label = `${prefix}_run_${sanitizeFailureToken(run.key || String(index))}`;
409
+ if (run.key !== corpusRunKey(run)) {
410
+ failures.push(`${label}_key_mismatch`);
411
+ }
412
+ for (const key of ['fixtureId', 'category', 'providerProfile', 'preprocessingMode']) {
413
+ if (typeof run[key] !== 'string' || !run[key].trim()) {
414
+ failures.push(`${label}_${key}_missing`);
415
+ }
416
+ }
417
+ for (const key of [
418
+ 'successfulPageRate',
419
+ 'cacheHitRate',
420
+ 'directTraceabilityRate',
421
+ 'preprocessingQualityScore',
422
+ 'preprocessingRegionQualityScore',
423
+ ]) {
424
+ if (!Number.isFinite(run[key]) || run[key] < 0 || run[key] > 1) {
425
+ failures.push(`${label}_${key}_invalid`);
426
+ }
427
+ }
428
+ if (!Number.isInteger(run.estimatedHostedCalls) || run.estimatedHostedCalls < 0) {
429
+ failures.push(`${label}_estimatedHostedCalls_invalid`);
430
+ }
431
+ if (!Number.isFinite(run.groundModelReadinessScore) || run.groundModelReadinessScore < 0) {
432
+ failures.push(`${label}_groundModelReadinessScore_invalid`);
433
+ }
434
+ validateHistoryConfidenceBreakdown(run.confidenceBreakdown, failures, `${label}_confidence`);
435
+ if (!Array.isArray(run.reviewGates)) {
436
+ failures.push(`${label}_reviewGates_invalid`);
437
+ }
438
+ if (run.latencyMs != null && (!Number.isFinite(run.latencyMs) || run.latencyMs < 0)) {
439
+ failures.push(`${label}_latencyMs_invalid`);
440
+ }
441
+ }
442
+ }
443
+ function buildGeotechBenchmarkCorpusTrendDelta(current, previous) {
444
+ return {
445
+ fixtureCount: current.summary.fixtureCount - previous.summary.fixtureCount,
446
+ runCount: current.summary.runCount - previous.summary.runCount,
447
+ passedRuns: current.summary.passedRuns - previous.summary.passedRuns,
448
+ failedRuns: current.summary.failedRuns - previous.summary.failedRuns,
449
+ averageConfidence: roundRatio(current.summary.averageConfidence - previous.summary.averageConfidence),
450
+ averageExtractionConfidence: roundRatio(current.summary.averageConfidenceBreakdown.extractionConfidence
451
+ - previous.summary.averageConfidenceBreakdown.extractionConfidence),
452
+ averageCorroborationScore: roundRatio(current.summary.averageConfidenceBreakdown.corroborationScore
453
+ - previous.summary.averageConfidenceBreakdown.corroborationScore),
454
+ averageTraceabilityRate: roundRatio(current.summary.averageTraceabilityRate - previous.summary.averageTraceabilityRate),
455
+ averageGroundModelReadinessScore: current.summary.averageGroundModelReadinessScore
456
+ - previous.summary.averageGroundModelReadinessScore,
457
+ averagePreprocessingQualityScore: roundRatio(current.summary.averagePreprocessingQualityScore - previous.summary.averagePreprocessingQualityScore),
458
+ totalEstimatedHostedCalls: current.summary.totalEstimatedHostedCalls - previous.summary.totalEstimatedHostedCalls,
459
+ pathLeakCount: current.summary.pathLeakCount - previous.summary.pathLeakCount,
460
+ };
461
+ }
462
+ function buildGeotechBenchmarkCorpusRunDeltas(currentRuns, previousRuns) {
463
+ const previousByKey = new Map(previousRuns.map((run) => [run.key, run]));
464
+ return currentRuns.map((current) => {
465
+ const previous = previousByKey.get(current.key);
466
+ const status = previous
467
+ ? (current.passed === previous.passed ? 'unchanged' : 'changed')
468
+ : 'new';
469
+ return {
470
+ key: current.key,
471
+ fixtureId: current.fixtureId,
472
+ providerProfile: current.providerProfile,
473
+ preprocessingMode: current.preprocessingMode,
474
+ status,
475
+ passed: current.passed,
476
+ previousPassed: previous?.passed ?? null,
477
+ cacheHitRateDelta: previous ? roundRatio(current.cacheHitRate - previous.cacheHitRate) : null,
478
+ hostedCallDelta: previous ? current.estimatedHostedCalls - previous.estimatedHostedCalls : null,
479
+ traceabilityDelta: previous ? roundRatio(current.directTraceabilityRate - previous.directTraceabilityRate) : null,
480
+ extractionConfidenceDelta: previous
481
+ ? roundRatio(current.confidenceBreakdown.extractionConfidence
482
+ - previous.confidenceBreakdown.extractionConfidence)
483
+ : null,
484
+ corroborationScoreDelta: previous
485
+ ? roundRatio(current.confidenceBreakdown.corroborationScore
486
+ - previous.confidenceBreakdown.corroborationScore)
487
+ : null,
488
+ groundModelReadinessDelta: previous
489
+ ? current.groundModelReadinessScore - previous.groundModelReadinessScore
490
+ : null,
491
+ qualityDelta: previous ? roundRatio(current.preprocessingQualityScore - previous.preprocessingQualityScore) : null,
492
+ reviewGateDelta: previous ? current.reviewGates.length - previous.reviewGates.length : null,
493
+ latencyDeltaMs: previous && current.latencyMs != null && previous.latencyMs != null
494
+ ? current.latencyMs - previous.latencyMs
495
+ : null,
496
+ };
497
+ }).sort((left, right) => left.key.localeCompare(right.key));
498
+ }
499
+ function summarizeHistoryConfidenceBreakdown(value) {
500
+ return {
501
+ overall: finiteNumber(value?.overall),
502
+ extractionConfidence: finiteNumber(value?.extractionConfidence),
503
+ engineeringCompleteness: finiteNumber(value?.engineeringCompleteness),
504
+ traceabilityScore: finiteNumber(value?.traceabilityScore),
505
+ corroborationScore: finiteNumber(value?.corroborationScore),
506
+ readinessScore: finiteNumber(value?.readinessScore),
507
+ pageEvidenceConfidence: finiteNumber(value?.pageEvidenceConfidence),
508
+ };
509
+ }
510
+ function validateHistoryConfidenceBreakdown(value, failures, prefix) {
511
+ if (!value || typeof value !== 'object') {
512
+ failures.push(`${prefix}_confidence_breakdown_missing`);
513
+ return;
514
+ }
515
+ for (const key of [
516
+ 'overall',
517
+ 'extractionConfidence',
518
+ 'engineeringCompleteness',
519
+ 'traceabilityScore',
520
+ 'corroborationScore',
521
+ 'readinessScore',
522
+ 'pageEvidenceConfidence',
523
+ ]) {
524
+ if (!Number.isFinite(value[key]) || value[key] < 0) {
525
+ failures.push(`${prefix}_${key}_invalid`);
526
+ }
527
+ }
528
+ }
529
+ function corpusRunKey(run) {
530
+ return `${run.fixtureId}::${run.providerProfile}::${run.preprocessingMode}`;
531
+ }
138
532
  function buildCorpusRun(input) {
139
533
  const benchmark = input.benchmark;
140
534
  const providerProfile = input.providerProfile
@@ -410,58 +804,7 @@ function validateFemExecutionBoundary(benchmark) {
410
804
  if (!fem) {
411
805
  return ['FEM draft readiness block missing from benchmark output'];
412
806
  }
413
- const agentRunAllowedRoutes = fem.agentRunAllowedRoutes ?? [];
414
- const agentWebglAllowedRoutes = fem.agentWebglAllowedRoutes ?? [];
415
- const agentResultManifestAllowedRoutes = fem.agentResultManifestAllowedRoutes ?? [];
416
- const caseOutputAvailableRoutes = fem.caseOutputAvailableRoutes ?? [];
417
- const humanRunCommandAvailableRoutes = fem.humanRunCommandAvailableRoutes ?? [];
418
- const staleRunCommandRoutes = fem.staleRunCommandRoutes ?? [];
419
- const failures = [
420
- fem.canAutoProceed
421
- ? 'FEM draft readiness became auto-proceedable'
422
- : null,
423
- agentRunAllowedRoutes.length > 0
424
- ? `FEM benchmark exposed agent-run routes: ${routeList(agentRunAllowedRoutes)}`
425
- : null,
426
- agentWebglAllowedRoutes.length > 0
427
- ? `FEM benchmark exposed agent WebGL routes: ${routeList(agentWebglAllowedRoutes)}`
428
- : null,
429
- agentResultManifestAllowedRoutes.length > 0
430
- ? `FEM benchmark exposed agent result-manifest routes: ${routeList(agentResultManifestAllowedRoutes)}`
431
- : null,
432
- caseOutputAvailableRoutes.length > 0
433
- ? `FEM benchmark exposed unreviewed case-output routes: ${routeList(caseOutputAvailableRoutes)}`
434
- : null,
435
- humanRunCommandAvailableRoutes.length > 0
436
- ? `FEM benchmark exposed unreviewed human-run routes: ${routeList(humanRunCommandAvailableRoutes)}`
437
- : null,
438
- staleRunCommandRoutes.length > 0
439
- ? `FEM benchmark recommended stale run commands: ${routeList(staleRunCommandRoutes)}`
440
- : null,
441
- ];
442
- for (const route of fem.routes ?? []) {
443
- const boundary = route.executionBoundary;
444
- if (!boundary) {
445
- failures.push(`FEM route ${route.objective} has no execution boundary`);
446
- continue;
447
- }
448
- failures.push(route.agentRunAllowed || boundary.agentRunAllowed
449
- ? `FEM route ${route.objective} exposed agent solver execution`
450
- : null, boundary.agentWebglRenderAllowed
451
- ? `FEM route ${route.objective} exposed agent WebGL rendering`
452
- : null, boundary.agentResultManifestAllowed
453
- ? `FEM route ${route.objective} exposed agent result-manifest creation`
454
- : null, boundary.caseOutputAvailable
455
- ? `FEM route ${route.objective} exposed unreviewed case output`
456
- : null, boundary.humanRunCommandAvailable
457
- ? `FEM route ${route.objective} exposed an unreviewed human run command`
458
- : null, !boundary.humanReviewRequired
459
- ? `FEM route ${route.objective} no longer requires human review`
460
- : null, /\bfem run\b/i.test(route.recommendedCommand ?? '')
461
- ? `FEM route ${route.objective} recommended a run command instead of a draft command`
462
- : null);
463
- }
464
- return [...new Set(failures.filter((value) => value != null))];
807
+ return collectFemDraftReadinessGuardrailFailures(fem).map((failure) => failure.endsWith('.') ? failure.slice(0, -1) : failure);
465
808
  }
466
809
  function buildRunWarnings(fixture, run) {
467
810
  return [
@@ -575,9 +918,25 @@ function deduplicatePathSafetyLeaks(leaks) {
575
918
  }
576
919
  return unique;
577
920
  }
921
+ function deduplicateArtifactSafetyLeaks(leaks) {
922
+ const seen = new Set();
923
+ const unique = [];
924
+ for (const leak of leaks) {
925
+ const key = `${leak.kind}:${leak.location}`;
926
+ if (seen.has(key)) {
927
+ continue;
928
+ }
929
+ seen.add(key);
930
+ unique.push(leak);
931
+ }
932
+ return unique;
933
+ }
578
934
  function sanitizeLocationKey(value) {
579
935
  return value.replace(/[^a-zA-Z0-9_$-]/g, '_');
580
936
  }
937
+ function sanitizeFailureToken(value) {
938
+ return value.replace(/[^a-zA-Z0-9_-]+/g, '_').slice(0, 72);
939
+ }
581
940
  function looksLikeAbsoluteLocalPath(value) {
582
941
  if (!value || /^(?:https?|s3|gs|file):\/\//i.test(value)) {
583
942
  return false;
@@ -697,9 +1056,6 @@ function successfulPageRate(benchmark) {
697
1056
  const successfulPages = finiteNumber(benchmark.source?.successfulPages);
698
1057
  return totalPages > 0 ? roundRatio(successfulPages / totalPages) : 0;
699
1058
  }
700
- function routeList(values) {
701
- return values && values.length > 0 ? values.join(', ') : 'none';
702
- }
703
1059
  function formatReviewGates(values) {
704
1060
  return values.length > 0 ? values.join(', ') : 'none';
705
1061
  }
@@ -745,6 +1101,15 @@ function signedPercent(value) {
745
1101
  const sign = value > 0 ? '+' : '';
746
1102
  return `${sign}${Math.round(value * 100)}%`;
747
1103
  }
1104
+ function formatDelta(value, asPercent) {
1105
+ if (value == null) {
1106
+ return 'new';
1107
+ }
1108
+ return asPercent ? signedPercent(value) : signed(value);
1109
+ }
1110
+ function signed(value) {
1111
+ return value > 0 ? `+${value}` : String(value);
1112
+ }
748
1113
  function escapeHtml(value) {
749
1114
  return value
750
1115
  .replaceAll('&', '&amp;')