@geotechcli/core 0.4.89 → 0.4.91
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/config/index.d.ts.map +1 -1
- package/dist/config/index.js +4 -4
- package/dist/config/index.js.map +1 -1
- package/dist/fem/ground-model-draft.d.ts +7 -0
- package/dist/fem/ground-model-draft.d.ts.map +1 -1
- package/dist/fem/ground-model-draft.js +213 -6
- package/dist/fem/ground-model-draft.js.map +1 -1
- package/dist/fem/index.d.ts +1 -1
- package/dist/fem/index.d.ts.map +1 -1
- package/dist/fem/index.js +1 -1
- package/dist/fem/index.js.map +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -1
- package/dist/index.js.map +1 -1
- package/dist/ingest/document-evidence-packet.d.ts +92 -92
- package/dist/ingest/geotech-benchmark-corpus.d.ts +124 -2
- package/dist/ingest/geotech-benchmark-corpus.d.ts.map +1 -1
- package/dist/ingest/geotech-benchmark-corpus.js +420 -55
- package/dist/ingest/geotech-benchmark-corpus.js.map +1 -1
- package/dist/ingest/geotech-document-benchmark.d.ts +4 -0
- package/dist/ingest/geotech-document-benchmark.d.ts.map +1 -1
- package/dist/ingest/geotech-document-benchmark.js +196 -41
- package/dist/ingest/geotech-document-benchmark.js.map +1 -1
- package/dist/ingest/index.d.ts +2 -1
- package/dist/ingest/index.d.ts.map +1 -1
- package/dist/ingest/index.js +2 -1
- package/dist/ingest/index.js.map +1 -1
- package/dist/ingest/preprocessing-fixture-benchmark.d.ts +175 -0
- package/dist/ingest/preprocessing-fixture-benchmark.d.ts.map +1 -0
- package/dist/ingest/preprocessing-fixture-benchmark.js +598 -0
- package/dist/ingest/preprocessing-fixture-benchmark.js.map +1 -0
- package/dist/llm/byok-benchmark.d.ts +125 -0
- package/dist/llm/byok-benchmark.d.ts.map +1 -0
- package/dist/llm/byok-benchmark.js +529 -0
- package/dist/llm/byok-benchmark.js.map +1 -0
- package/dist/llm/index.d.ts +1 -0
- package/dist/llm/index.d.ts.map +1 -1
- package/dist/llm/index.js +1 -0
- package/dist/llm/index.js.map +1 -1
- package/dist/meta/metadata.json +1 -1
- package/dist/signal/index.d.ts +112 -0
- package/dist/signal/index.d.ts.map +1 -1
- package/dist/signal/index.js +648 -1
- package/dist/signal/index.js.map +1 -1
- package/dist/standards/index.d.ts +6 -0
- package/dist/standards/index.d.ts.map +1 -1
- package/dist/standards/index.js +243 -0
- package/dist/standards/index.js.map +1 -1
- package/dist/verifier/findings.d.ts +6 -0
- package/dist/verifier/findings.d.ts.map +1 -1
- package/dist/verifier/findings.js +192 -1
- package/dist/verifier/findings.js.map +1 -1
- package/dist/verifier/index.d.ts +1 -1
- package/dist/verifier/index.d.ts.map +1 -1
- package/dist/verifier/index.js +1 -1
- package/dist/verifier/index.js.map +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
import type { LLMConfig, ProviderCapabilityProfile } from './types.js';
|
|
2
|
+
/** Identifiers of the provider profiles a BYOK benchmark run can be labelled with. */
export type ByokBenchmarkProfileId = 'hosted-beta' | 'direct-zai' | 'premium-byok' | 'openai-compatible' | 'openrouter-free' | 'local-hf-compatible';
|
|
3
|
+
/** Kind of evidence handed to the model under benchmark; currently a single literal. */
export type ByokBenchmarkEvidenceInput = 'preprocessed-page-evidence';
|
|
4
|
+
/** One piece of preprocessed page evidence listed in the benchmark prompt. */
export interface ByokBenchmarkEvidenceItem {
|
|
5
|
+
/** Identifier a response is expected to cite in `cited_evidence_ids`. */
evidenceId: string;
|
|
6
|
+
/** Page label of the source document, kept as a string (for example '3'). */
sourcePage: string;
|
|
7
|
+
/** How the snippet was obtained. */
method: 'ocr' | 'layout-ocr' | 'page-cache';
|
|
8
|
+
/** Fraction that the prompt renders as a rounded percentage (`confidence * 100`). */
confidence: number;
|
|
9
|
+
/** Evidence text shown to the model. */
snippet: string;
|
|
10
|
+
}
|
|
11
|
+
/** Contract describing what evidence a benchmarked model receives and what its answer must contain. */
export interface ByokBenchmarkEvidenceContract {
|
|
12
|
+
/** Literal version tag of this contract shape. */
schemaVersion: 'geotech.byok-evidence-contract.v1';
|
|
13
|
+
/** Evidence input kind; a response must echo it in `evidence_input`. */
evidenceInput: ByokBenchmarkEvidenceInput;
|
|
14
|
+
/** Input kinds the prompt declares as prohibited. */
prohibitedInputs: Array<'direct-image' | 'native-pdf'>;
|
|
15
|
+
/** Evidence items listed in the prompt; their ids are the only ids a response may cite. */
sourceEvidence: ByokBenchmarkEvidenceItem[];
|
|
16
|
+
/** Keys that must be present in the parsed JSON response. */
requiredResponseKeys: string[];
|
|
17
|
+
/** Review gates listed in the prompt as active. */
reviewGates: string[];
|
|
18
|
+
}
|
|
19
|
+
/** Provider/model description plus the evidence contract used for one benchmark run. */
export interface ByokProviderBenchmarkProfile {
|
|
20
|
+
/** Benchmark profile label. */
id: ByokBenchmarkProfileId;
|
|
21
|
+
/** Provider copied from the LLM configuration. */
provider: LLMConfig['provider'];
|
|
22
|
+
/** Model id taken from the resolved capability profile. */
modelId: string | null;
|
|
23
|
+
/** Vision model id taken from the resolved capability profile. */
visionModelId: string | null;
|
|
24
|
+
/** Capability profile resolved from the LLM configuration. */
capabilityProfile: ProviderCapabilityProfile;
|
|
25
|
+
/** Evidence contract derived from `capabilityProfile`. */
evidenceContract: ByokBenchmarkEvidenceContract;
|
|
26
|
+
/** True when the capability profile supports JSON mode and is not flagged as a likely free route. */
jsonModeAllowed: boolean;
|
|
27
|
+
/** Always false; report validation fails a profile where this is not `false`. */
requestContainsImageInput: false;
|
|
28
|
+
/** Always false; report validation fails a profile where this is not `false`. */
requestContainsNativePdfInput: false;
|
|
29
|
+
}
|
|
30
|
+
/** Outcome of validating one raw model response against an evidence contract. */
export interface ByokBenchmarkResponseValidation {
|
|
31
|
+
/** True when `failures` is empty; warnings do not affect it. */
ok: boolean;
|
|
32
|
+
/** Whether the response text parsed as a JSON object (arrays and primitives do not count). */
parsedJson: boolean;
|
|
33
|
+
/** String entries of the response's `cited_evidence_ids` array; empty when absent or not an array. */
citedEvidenceIds: string[];
|
|
34
|
+
/** Failure tokens such as `response_not_json` or `missing_key_<key>`. */
failures: string[];
|
|
35
|
+
/** Non-fatal observations such as `source_evidence_phrase_missing`. */
warnings: string[];
|
|
36
|
+
}
|
|
37
|
+
/** Result of one benchmark run for a single provider profile. */
export interface ByokBenchmarkRunSummary {
|
|
38
|
+
/** Profile the run was executed with. */
profile: ByokProviderBenchmarkProfile;
|
|
39
|
+
/** Run verdict; report validation flags a run whose `ok` differs from `response.ok`. */
ok: boolean;
|
|
40
|
+
/** Model name for the run — presumably as reported by the provider; the runner is outside this view. */
model: string | null;
|
|
41
|
+
/** Latency in milliseconds; non-numeric or non-finite values are ignored when the trend history averages latency. */
latencyMs: number | null;
|
|
42
|
+
/** Token count for the run; non-numeric or non-finite values are ignored when the trend history sums tokens. */
totalTokens: number | null;
|
|
43
|
+
/** Validation of the model's response. */
response: ByokBenchmarkResponseValidation;
|
|
44
|
+
/** Optional error text — set by the benchmark runner, which is outside this view. */
error?: string;
|
|
45
|
+
}
|
|
46
|
+
/** Benchmark report: the individual runs plus counters derived from them. */
export interface ByokBenchmarkReport {
|
|
47
|
+
/** Literal discriminator checked by report validation. */
kind: 'geotech-byok-provider-benchmark';
|
|
48
|
+
/** Literal schema version checked by report validation. */
schemaVersion: 1;
|
|
49
|
+
/** Generation timestamp; an ISO string when the builder is given a `Date`, otherwise the string passed in. */
generatedAt: string;
|
|
50
|
+
/** Runs included in the report. */
runs: ByokBenchmarkRunSummary[];
|
|
51
|
+
/** Counters derived from `runs`; report validation recomputes and compares them. */
summary: {
|
|
52
|
+
/** Number of runs. */
runCount: number;
|
|
53
|
+
/** Number of runs whose `ok` is truthy. */
passedRuns: number;
|
|
54
|
+
/** Number of runs whose `ok` is falsy. */
failedRuns: number;
|
|
55
|
+
/** Sorted, de-duplicated profile ids of the runs. */
profiles: ByokBenchmarkProfileId[];
|
|
56
|
+
/** True only when there is at least one run and every run is ok. */
passed: boolean;
|
|
57
|
+
};
|
|
58
|
+
/** Contract validation result, present once it has been attached to the report. */
contractValidation?: ByokBenchmarkReportContractValidation;
|
|
59
|
+
}
|
|
60
|
+
/** Options for report contract validation. */
export interface ByokBenchmarkReportContractOptions {
|
|
61
|
+
/** Profiles that must appear among the report's runs; each missing one yields a `missing_required_profile_<id>` failure. */
requiredProfiles?: ByokBenchmarkProfileId[];
|
|
62
|
+
}
|
|
63
|
+
/** Outcome of validating a benchmark report against its contract. */
export interface ByokBenchmarkReportContractValidation {
|
|
64
|
+
/** True when `failures` is empty. */
ok: boolean;
|
|
65
|
+
/** Sorted, de-duplicated profile ids observed in the report's runs. */
profiles: ByokBenchmarkProfileId[];
|
|
66
|
+
/** Failure tokens, for example `summary_run_count_mismatch`. */
failures: string[];
|
|
67
|
+
/** Non-fatal observations, for example `report_has_no_runs`. */
warnings: string[];
|
|
68
|
+
}
|
|
69
|
+
/** Result of scanning an artifact for sensitive values. */
export interface ByokBenchmarkArtifactSafety {
|
|
70
|
+
/** True when `leaks` is empty. */
ok: boolean;
|
|
71
|
+
/** De-duplicated entries returned by the internal sensitive-value scan — presumably pointers to the offending values; the scan itself is outside this view. */
leaks: string[];
|
|
72
|
+
}
|
|
73
|
+
/** Summary-only snapshot of one benchmark report, kept in the trend history. */
export interface ByokBenchmarkHistoryEntry {
|
|
74
|
+
/** Literal discriminator checked by trend validation. */
kind: 'geotech-byok-provider-benchmark-history-entry';
|
|
75
|
+
/** Literal schema version checked by trend validation. */
schemaVersion: 1;
|
|
76
|
+
/** Copied from the report's `generatedAt`. */
generatedAt: string;
|
|
77
|
+
/** Counters copied from the report summary plus aggregates computed over its runs. */
summary: {
|
|
78
|
+
/** Copied from the report summary. */
runCount: number;
|
|
79
|
+
/** Copied from the report summary. */
passedRuns: number;
|
|
80
|
+
/** Copied from the report summary. */
failedRuns: number;
|
|
81
|
+
/** Copied from the report summary. */
passed: boolean;
|
|
82
|
+
/** Copy of the report summary's profile list. */
profiles: ByokBenchmarkProfileId[];
|
|
83
|
+
/** The evidence input shared by all runs, or 'none' when the runs do not agree on exactly one value. */
evidenceInput: ByokBenchmarkEvidenceInput | 'none';
|
|
84
|
+
/** Rounded mean of the finite run latencies, or null when no run has one. */
averageLatencyMs: number | null;
|
|
85
|
+
/** Sum of the finite run token counts, or null when no run has one. */
totalTokens: number | null;
|
|
86
|
+
/** Number of leaks the artifact safety inspection reported for the report; trend validation requires 0. */
pathLeakCount: number;
|
|
87
|
+
};
|
|
88
|
+
}
|
|
89
|
+
/** Comparison of the current benchmark snapshot with the most recent previous one. */
export interface ByokBenchmarkTrendReport {
|
|
90
|
+
/** Literal discriminator checked by trend validation. */
kind: 'geotech-byok-provider-benchmark-trend';
|
|
91
|
+
/** Literal schema version checked by trend validation. */
schemaVersion: 1;
|
|
92
|
+
/** Copied from the current entry's `generatedAt`. */
generatedAt: string;
|
|
93
|
+
/** Snapshot built from the report being trended. */
current: ByokBenchmarkHistoryEntry;
|
|
94
|
+
/** Last entry of the previous history, or null when there is none. */
previous: ByokBenchmarkHistoryEntry | null;
|
|
95
|
+
/** Current minus previous for each counter; null when there is no previous entry. */
delta: {
|
|
96
|
+
runCount: number;
|
|
97
|
+
passedRuns: number;
|
|
98
|
+
failedRuns: number;
|
|
99
|
+
/** Null when either side has no average latency. */
averageLatencyMs: number | null;
|
|
100
|
+
/** Null when either side has no token total. */
totalTokens: number | null;
|
|
101
|
+
pathLeakCount: number;
|
|
102
|
+
} | null;
|
|
103
|
+
/** Length of the history after appending the current entry and keeping the last 50 entries. */
historyCount: number;
|
|
104
|
+
/** Human-readable note stating which raw inputs are excluded from the trend output. */
note: string;
|
|
105
|
+
}
|
|
106
|
+
/** Trend output: the updated history and the trend report built from it. */
export interface ByokBenchmarkTrend {
|
|
107
|
+
/** Previous history followed by the current entry, limited to the last 50 entries. */
history: ByokBenchmarkHistoryEntry[];
|
|
108
|
+
/** Trend report comparing the current entry with the previous one. */
report: ByokBenchmarkTrendReport;
|
|
109
|
+
}
|
|
110
|
+
/** Outcome of validating a trend report against its contract. */
export interface ByokBenchmarkTrendContractValidation {
|
|
111
|
+
/** True when no failure was recorded. */
ok: boolean;
|
|
112
|
+
/** De-duplicated failure tokens. */
failures: string[];
|
|
113
|
+
/** De-duplicated non-fatal observations. */
warnings: string[];
|
|
114
|
+
}
|
|
115
|
+
/**
 * Builds the benchmark profile for a provider configuration.
 *
 * Resolves the capability profile from `config`, derives the evidence contract
 * from it, and marks the request as containing no image or native-PDF input.
 *
 * @param id - Benchmark profile label stored on the result.
 * @param config - Provider, model id and vision model id of the LLM configuration.
 * @returns The assembled benchmark profile.
 */
export declare function buildByokProviderBenchmarkProfile(id: ByokBenchmarkProfileId, config: Pick<LLMConfig, 'provider' | 'modelId' | 'visionModelId'>): ByokProviderBenchmarkProfile;
|
|
116
|
+
/**
 * Builds the v1 evidence contract used by the benchmark.
 *
 * The contract carries three fixed sample evidence items, the required response
 * keys, and the capability profile's review gates followed by three benchmark gates.
 *
 * @param capabilityProfile - Capability profile whose `reviewGates` are copied into the contract.
 * @returns A newly built evidence contract.
 */
export declare function buildByokBenchmarkEvidenceContract(capabilityProfile: ProviderCapabilityProfile): ByokBenchmarkEvidenceContract;
|
|
117
|
+
/**
 * Renders the benchmark prompt for a profile.
 *
 * @param profile - Profile whose id, capability profile id and evidence contract are rendered.
 * @returns Newline-joined prompt text listing the evidence and the required JSON shape.
 */
export declare function buildByokBenchmarkPrompt(profile: ByokProviderBenchmarkProfile): string;
|
|
118
|
+
/**
 * Validates a raw model response against an evidence contract.
 *
 * @param text - Raw response text; it is trimmed and must parse as a JSON object.
 * @param contract - Contract to validate against; when omitted, one is built for an
 *   'openai-compatible' text configuration.
 * @returns The validation outcome; `ok` is true only when no failure was recorded.
 */
export declare function validateByokBenchmarkResponse(text: string, contract?: ByokBenchmarkEvidenceContract): ByokBenchmarkResponseValidation;
|
|
119
|
+
/**
 * Builds a benchmark report from run summaries.
 *
 * Each run is passed through the module's redaction step before it is stored and counted.
 *
 * @param runs - Run summaries to include.
 * @param generatedAt - Timestamp for the report; a `Date` is converted to an ISO string.
 *   Defaults to the current time.
 * @returns The report with its derived summary counters.
 */
export declare function buildByokBenchmarkReport(runs: ByokBenchmarkRunSummary[], generatedAt?: string | Date): ByokBenchmarkReport;
|
|
120
|
+
/**
 * Checks a benchmark report against its contract.
 *
 * Covers the report kind and schema version, agreement between the summary and the
 * runs, required profiles, per-profile contract rules, comparability of the evidence
 * contracts across runs, and sensitive-value leaks.
 *
 * @param report - Report to validate.
 * @param options - Optional list of profiles that must be present.
 * @returns The validation outcome; `ok` is true only when no failure was recorded.
 */
export declare function validateByokBenchmarkReportContract(report: ByokBenchmarkReport, options?: ByokBenchmarkReportContractOptions): ByokBenchmarkReportContractValidation;
|
|
121
|
+
/**
 * Returns a copy of the report with a freshly computed `contractValidation`.
 *
 * Any `contractValidation` already on the report is dropped before validating.
 *
 * @param report - Report to validate; it is not modified.
 * @param options - Options forwarded to the contract validation.
 * @returns A new report object carrying the validation result.
 */
export declare function attachByokBenchmarkReportContractValidation(report: ByokBenchmarkReport, options?: ByokBenchmarkReportContractOptions): ByokBenchmarkReport;
|
|
122
|
+
/**
 * Scans an arbitrary value for sensitive content.
 *
 * @param value - Value to scan, for example a report or trend report.
 * @returns The de-duplicated scan results; `ok` is true when nothing was found.
 */
export declare function inspectByokBenchmarkArtifactSafety(value: unknown): ByokBenchmarkArtifactSafety;
|
|
123
|
+
/**
 * Builds the trend output for a report.
 *
 * A summary-only history entry is created from the report and appended to the previous
 * history, which is then limited to its last 50 entries. The delta compares the new
 * entry with the last previous entry.
 *
 * @param report - Report to summarise.
 * @param previousHistory - Earlier history entries, oldest first. Defaults to an empty history.
 * @returns The updated history and the trend report.
 */
export declare function buildByokBenchmarkTrend(report: ByokBenchmarkReport, previousHistory?: ByokBenchmarkHistoryEntry[]): ByokBenchmarkTrend;
|
|
124
|
+
/**
 * Checks a trend report against its contract.
 *
 * Covers the kind, schema version, timestamp, history count, both history entries,
 * presence of a delta when a previous entry exists, absence of raw prompt/response/
 * evidence keys in the serialised report, and sensitive-value leaks.
 *
 * @param report - Trend report to validate.
 * @returns The validation outcome with de-duplicated failures and warnings.
 */
export declare function validateByokBenchmarkTrendContract(report: ByokBenchmarkTrendReport): ByokBenchmarkTrendContractValidation;
|
|
125
|
+
//# sourceMappingURL=byok-benchmark.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"byok-benchmark.d.ts","sourceRoot":"","sources":["../../src/llm/byok-benchmark.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,SAAS,EAAE,yBAAyB,EAAE,MAAM,YAAY,CAAC;AAEvE,MAAM,MAAM,sBAAsB,GAC9B,aAAa,GACb,YAAY,GACZ,cAAc,GACd,mBAAmB,GACnB,iBAAiB,GACjB,qBAAqB,CAAC;AAE1B,MAAM,MAAM,0BAA0B,GAAG,4BAA4B,CAAC;AAEtE,MAAM,WAAW,yBAAyB;IACxC,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,KAAK,GAAG,YAAY,GAAG,YAAY,CAAC;IAC5C,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,6BAA6B;IAC5C,aAAa,EAAE,mCAAmC,CAAC;IACnD,aAAa,EAAE,0BAA0B,CAAC;IAC1C,gBAAgB,EAAE,KAAK,CAAC,cAAc,GAAG,YAAY,CAAC,CAAC;IACvD,cAAc,EAAE,yBAAyB,EAAE,CAAC;IAC5C,oBAAoB,EAAE,MAAM,EAAE,CAAC;IAC/B,WAAW,EAAE,MAAM,EAAE,CAAC;CACvB;AAED,MAAM,WAAW,4BAA4B;IAC3C,EAAE,EAAE,sBAAsB,CAAC;IAC3B,QAAQ,EAAE,SAAS,CAAC,UAAU,CAAC,CAAC;IAChC,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,iBAAiB,EAAE,yBAAyB,CAAC;IAC7C,gBAAgB,EAAE,6BAA6B,CAAC;IAChD,eAAe,EAAE,OAAO,CAAC;IACzB,yBAAyB,EAAE,KAAK,CAAC;IACjC,6BAA6B,EAAE,KAAK,CAAC;CACtC;AAED,MAAM,WAAW,+BAA+B;IAC9C,EAAE,EAAE,OAAO,CAAC;IACZ,UAAU,EAAE,OAAO,CAAC;IACpB,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAED,MAAM,WAAW,uBAAuB;IACtC,OAAO,EAAE,4BAA4B,CAAC;IACtC,EAAE,EAAE,OAAO,CAAC;IACZ,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,QAAQ,EAAE,+BAA+B,CAAC;IAC1C,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,iCAAiC,CAAC;IACxC,aAAa,EAAE,CAAC,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,uBAAuB,EAAE,CAAC;IAChC,OAAO,EAAE;QACP,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;QACnB,QAAQ,EAAE,sBAAsB,EAAE,CAAC;QACnC,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC;IACF,kBAAkB,CAAC,EAAE,qCAAqC,CAAC;CAC5D;AAED,MAAM,WAAW,kCAAkC;IACjD,gBAAgB,CAAC,EAAE,sBAAsB,EAAE,CAAC;CAC7C;AAED,MAAM,WAAW,qCAAqC;IACpD,EAAE,EAAE,OAAO,CAAC;IACZ,QAAQ,EAAE,sBAAsB,EAAE,CAAC;IACnC,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,QAAQ,EAAE,
MAAM,EAAE,CAAC;CACpB;AAED,MAAM,WAAW,2BAA2B;IAC1C,EAAE,EAAE,OAAO,CAAC;IACZ,KAAK,EAAE,MAAM,EAAE,CAAC;CACjB;AAED,MAAM,WAAW,yBAAyB;IACxC,IAAI,EAAE,+CAA+C,CAAC;IACtD,aAAa,EAAE,CAAC,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE;QACP,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;QACnB,MAAM,EAAE,OAAO,CAAC;QAChB,QAAQ,EAAE,sBAAsB,EAAE,CAAC;QACnC,aAAa,EAAE,0BAA0B,GAAG,MAAM,CAAC;QACnD,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;QAChC,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;QAC3B,aAAa,EAAE,MAAM,CAAC;KACvB,CAAC;CACH;AAED,MAAM,WAAW,wBAAwB;IACvC,IAAI,EAAE,uCAAuC,CAAC;IAC9C,aAAa,EAAE,CAAC,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,yBAAyB,CAAC;IACnC,QAAQ,EAAE,yBAAyB,GAAG,IAAI,CAAC;IAC3C,KAAK,EAAE;QACL,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;QACnB,gBAAgB,EAAE,MAAM,GAAG,IAAI,CAAC;QAChC,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;QAC3B,aAAa,EAAE,MAAM,CAAC;KACvB,GAAG,IAAI,CAAC;IACT,YAAY,EAAE,MAAM,CAAC;IACrB,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,kBAAkB;IACjC,OAAO,EAAE,yBAAyB,EAAE,CAAC;IACrC,MAAM,EAAE,wBAAwB,CAAC;CAClC;AAED,MAAM,WAAW,oCAAoC;IACnD,EAAE,EAAE,OAAO,CAAC;IACZ,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,QAAQ,EAAE,MAAM,EAAE,CAAC;CACpB;AAgBD,wBAAgB,iCAAiC,CAC/C,EAAE,EAAE,sBAAsB,EAC1B,MAAM,EAAE,IAAI,CAAC,SAAS,EAAE,UAAU,GAAG,SAAS,GAAG,eAAe,CAAC,GAChE,4BAA4B,CAa9B;AAED,wBAAgB,kCAAkC,CAChD,iBAAiB,EAAE,yBAAyB,GAC3C,6BAA6B,CA0C/B;AAED,wBAAgB,wBAAwB,CAAC,OAAO,EAAE,4BAA4B,GAAG,MAAM,CAiBtF;AAED,wBAAgB,6BAA6B,CAC3C,IAAI,EAAE,MAAM,EACZ,QAAQ,GAAE,6BAET,GACA,+BAA+B,CAiDjC;AAED,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,uBAAuB,EAAE,EAC/B,WAAW,GAAE,MAAM,GAAG,IAAiB,GACtC,mBAAmB,CAgBrB;AAED,wBAAgB,mCAAmC,CACjD,MAAM,EAAE,mBAAmB,EAC3B,OAAO,GAAE,kCAAuC,GAC/C,qCAAqC,CA6EvC;AAED,wBAAgB,2CAA2C,CACzD,MAAM,EAAE,mBAAmB,EAC3B,OAAO,GAAE,kCAAuC,GAC/C,mBAAmB,CAMrB;AAED,wBAAgB,kCAAkC,CAAC,KAAK,EAAE,OAAO,GAAG,2BAA2B,CAM9F;AAED,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,mBAAmB,EAC3B,eAAe,GAAE,yBAAyB,EAAO,GAChD,kBAAkB,CAiBpB;AAED,wBAAgB,kCAAkC,CAAC,MAAM,EAAE,wBAAwB,GAAG,oCAAoC,CAyCzH"}
|
|
@@ -0,0 +1,529 @@
|
|
|
1
|
+
import { resolveProviderCapabilityProfile } from './capabilities.js';
|
|
2
|
+
// Response keys that report validation expects every evidence contract to require.
const BYOK_REQUIRED_RESPONSE_KEYS = ['ok', 'evidence_input', 'cited_evidence_ids', 'takeaway', 'review_gates'];
|
|
9
|
+
// Profile ids that report validation subjects to the preprocessed-evidence checks.
const BYOK_TEXT_EVIDENCE_PROFILE_IDS = new Set(['openai-compatible', 'openrouter-free', 'local-hf-compatible']);
|
|
14
|
+
/**
 * Builds the benchmark profile for a provider configuration.
 *
 * The capability profile is resolved from `config`; model ids and the evidence
 * contract are taken from that resolved profile rather than from `config` itself.
 *
 * @param id Benchmark profile label stored on the result.
 * @param config LLM configuration (provider, model id, vision model id).
 * @returns The assembled profile; image and native-PDF input flags are always false.
 */
export function buildByokProviderBenchmarkProfile(id, config) {
    const resolved = resolveProviderCapabilityProfile(config);
    const profile = {
        id,
        provider: config.provider,
        modelId: resolved.modelId,
        visionModelId: resolved.visionModelId,
        capabilityProfile: resolved,
        evidenceContract: buildByokBenchmarkEvidenceContract(resolved),
        // JSON mode is only allowed when the route supports it and is not a likely free route.
        jsonModeAllowed: resolved.capabilities.jsonMode && !resolved.likelyFreeRoute,
        requestContainsImageInput: false,
        requestContainsNativePdfInput: false,
    };
    return profile;
}
|
|
28
|
+
/**
 * Builds the v1 evidence contract used by the benchmark.
 *
 * The sample evidence is fixed: an SPT reading, an Atterberg limits result and a
 * deliberately missing groundwater level.
 *
 * @param capabilityProfile Capability profile whose `reviewGates` come first in the contract's gates.
 * @returns A newly built contract object.
 */
export function buildByokBenchmarkEvidenceContract(capabilityProfile) {
    const sptEvidence = {
        evidenceId: 'ev-page-3-bh1-spt',
        sourcePage: '3',
        method: 'ocr',
        confidence: 0.92,
        snippet: 'BH-01 SPT N=18 at 2.0 m; source page 3.',
    };
    const atterbergEvidence = {
        evidenceId: 'ev-page-4-lab-atterberg',
        sourcePage: '4',
        method: 'layout-ocr',
        confidence: 0.88,
        snippet: 'Atterberg limits: LL=42%, PI=22%; source page 4.',
    };
    const groundwaterGapEvidence = {
        evidenceId: 'ev-page-5-groundwater-gap',
        sourcePage: '5',
        method: 'page-cache',
        confidence: 0.74,
        snippet: 'Groundwater level was not extracted; review gate remains open.',
    };
    // Gates the benchmark always appends after the capability profile's own gates.
    const benchmarkGates = [
        'source-evidence-required',
        'missing-groundwater-review-required',
        'human-engineering-review-required',
    ];
    return {
        schemaVersion: 'geotech.byok-evidence-contract.v1',
        evidenceInput: 'preprocessed-page-evidence',
        prohibitedInputs: ['direct-image', 'native-pdf'],
        sourceEvidence: [sptEvidence, atterbergEvidence, groundwaterGapEvidence],
        requiredResponseKeys: ['ok', 'evidence_input', 'cited_evidence_ids', 'takeaway', 'review_gates'],
        reviewGates: [...capabilityProfile.reviewGates, ...benchmarkGates],
    };
}
|
|
71
|
+
/**
 * Renders the benchmark prompt for a profile.
 *
 * @param profile Profile whose id, capability profile id and evidence contract are rendered.
 * @returns The prompt lines joined with '\n': instructions, one line per evidence item,
 *   and the required JSON shape.
 */
export function buildByokBenchmarkPrompt(profile) {
    const contract = profile.evidenceContract;
    const header = [
        'Return ONLY compact JSON. Do not use markdown.',
        'You are being benchmarked as a GeotechCLI BYOK/provider brain.',
        `Provider profile: ${profile.id}. Capability profile: ${profile.capabilityProfile.id}.`,
        `Evidence input: ${contract.evidenceInput}. Do not request or claim direct image/PDF inspection.`,
        `Prohibited inputs: ${contract.prohibitedInputs.join(', ')}.`,
        `Active review gates: ${contract.reviewGates.join(', ')}.`,
        'Use only this OCR/page evidence:',
    ];
    // Confidence is a fraction in the contract and is shown as a whole percentage.
    const evidenceLines = contract.sourceEvidence.map((item) => {
        const percent = Math.round(item.confidence * 100);
        return `- ${item.evidenceId} | page ${item.sourcePage} | ${item.method} | confidence ${percent}% | ${item.snippet}`;
    });
    const footer = [
        'Required JSON shape:',
        '{"ok":true,"evidence_input":"preprocessed-page-evidence","cited_evidence_ids":["ev-page-3-bh1-spt"],"takeaway":"...","review_gates":["..."]}',
        'The takeaway must be one short geotechnical sentence and must mention source evidence.',
    ];
    return header.concat(evidenceLines, footer).join('\n');
}
|
|
87
|
+
/**
 * Validates a raw model response against an evidence contract.
 *
 * Failures are recorded when the text is not a JSON object, a required key is missing,
 * `ok` is not `true`, `evidence_input` differs from the contract, no evidence id is
 * cited, a cited id is not in the contract, no review gate mentions groundwater/human/
 * review, or the text claims image or PDF inspection. A missing "source evidence"
 * phrase is only a warning.
 *
 * @param text Raw response text.
 * @param contract Contract to validate against; defaults to one built for an
 *   'openai-compatible' text configuration.
 * @returns The validation outcome; `ok` is true only when no failure was recorded.
 */
export function validateByokBenchmarkResponse(text, contract = buildByokBenchmarkEvidenceContract(resolveProviderCapabilityProfile({ provider: 'openai-compatible', modelId: 'byok-text', visionModelId: '' }))) {
    const body = text.trim();
    const payload = parseJsonObject(body);
    const failures = [];
    const warnings = [];
    const knownIds = new Set(contract.sourceEvidence.map((item) => item.evidenceId));
    // Keeps only the string entries of an array-valued field; anything else yields [].
    const stringEntries = (candidate) => {
        if (!Array.isArray(candidate)) {
            return [];
        }
        return candidate.filter((entry) => typeof entry === 'string');
    };
    let citedEvidenceIds = [];
    if (payload) {
        for (const key of contract.requiredResponseKeys) {
            if (!(key in payload)) {
                failures.push(`missing_key_${key}`);
            }
        }
        if (payload.ok !== true) {
            failures.push('ok_not_true');
        }
        if (payload.evidence_input !== contract.evidenceInput) {
            failures.push('wrong_evidence_input');
        }
        citedEvidenceIds = stringEntries(payload.cited_evidence_ids);
        if (citedEvidenceIds.length === 0) {
            failures.push('missing_evidence_citations');
        }
        for (const id of citedEvidenceIds) {
            if (!knownIds.has(id)) {
                failures.push(`unknown_evidence_id_${sanitizeFailureToken(id)}`);
            }
        }
        const gatePreserved = stringEntries(payload.review_gates)
            .some((gate) => /groundwater|human|review/i.test(gate));
        if (!gatePreserved) {
            failures.push('review_gate_not_preserved');
        }
    }
    else {
        failures.push('response_not_json');
    }
    // The remaining checks look at the raw text, so they also apply to non-JSON responses.
    const claimsProhibitedInspection = /(?:i inspected|from the image|from the pdf|visual inspection|native pdf)/i.test(body);
    if (claimsProhibitedInspection) {
        failures.push('claimed_prohibited_image_or_pdf_inspection');
    }
    const mentionsSourceEvidence = /source evidence/i.test(body);
    if (!mentionsSourceEvidence) {
        warnings.push('source_evidence_phrase_missing');
    }
    return {
        ok: failures.length === 0,
        parsedJson: Boolean(payload),
        citedEvidenceIds,
        failures,
        warnings,
    };
}
|
|
135
|
+
/**
 * Builds a benchmark report from run summaries.
 *
 * Every run goes through `redactByokBenchmarkRun` first; the stored runs and all
 * summary counters are based on the redacted runs.
 *
 * @param runs Run summaries to include.
 * @param generatedAt Report timestamp; a Date is converted with `toISOString()`,
 *   anything else is stored as given. Defaults to the current time.
 * @returns The report with its derived summary.
 */
export function buildByokBenchmarkReport(runs, generatedAt = new Date()) {
    let timestamp = generatedAt;
    if (generatedAt instanceof Date) {
        timestamp = generatedAt.toISOString();
    }
    const redactedRuns = runs.map(redactByokBenchmarkRun);
    const passing = redactedRuns.filter((run) => run.ok);
    const failing = redactedRuns.filter((run) => !run.ok);
    const distinctProfiles = new Set(redactedRuns.map((run) => run.profile.id));
    // An empty report never counts as passed.
    const allPassed = redactedRuns.length > 0 && redactedRuns.every((run) => run.ok);
    return {
        kind: 'geotech-byok-provider-benchmark',
        schemaVersion: 1,
        generatedAt: timestamp,
        runs: redactedRuns,
        summary: {
            runCount: redactedRuns.length,
            passedRuns: passing.length,
            failedRuns: failing.length,
            profiles: Array.from(distinctProfiles).sort(),
            passed: allPassed,
        },
    };
}
|
|
152
|
+
/**
 * Checks a benchmark report against its contract.
 *
 * The report may come from untrusted or hand-edited JSON, so malformed runs are
 * reported as failures instead of throwing: a run that is null or has no profile
 * yields `missing_profile_<index>`, a run without a response object yields
 * `missing_response_<label>`, and a profile without an evidence contract yields
 * `missing_evidence_contract_<label>`.
 *
 * @param report Report to validate.
 * @param options Optional `requiredProfiles` that must appear among the runs.
 * @returns `ok` (no failures), the sorted unique profile ids found, failures and warnings.
 */
export function validateByokBenchmarkReportContract(report, options = {}) {
    const failures = [];
    const warnings = [];
    const runs = Array.isArray(report.runs) ? report.runs : [];
    const profiles = runs
        .map((run) => run?.profile?.id)
        .filter((profile) => typeof profile === 'string');
    if (report.kind !== 'geotech-byok-provider-benchmark') {
        failures.push('wrong_report_kind');
    }
    if (report.schemaVersion !== 1) {
        failures.push('wrong_report_schema_version');
    }
    if (runs.length === 0) {
        warnings.push('report_has_no_runs');
    }
    // Recompute the summary from the runs and compare it with what the report claims.
    const passedRuns = runs.filter((run) => run?.ok).length;
    const failedRuns = runs.filter((run) => !run?.ok).length;
    const expectedProfiles = [...new Set(profiles)].sort();
    if (report.summary?.runCount !== runs.length) {
        failures.push('summary_run_count_mismatch');
    }
    if (report.summary?.passedRuns !== passedRuns) {
        failures.push('summary_passed_count_mismatch');
    }
    if (report.summary?.failedRuns !== failedRuns) {
        failures.push('summary_failed_count_mismatch');
    }
    if (report.summary?.passed !== (runs.length > 0 && runs.every((run) => run?.ok))) {
        failures.push('summary_passed_flag_mismatch');
    }
    if (JSON.stringify(report.summary?.profiles ?? []) !== JSON.stringify(expectedProfiles)) {
        failures.push('summary_profiles_mismatch');
    }
    const observedProfiles = new Set(profiles);
    for (const requiredProfile of options.requiredProfiles ?? []) {
        if (!observedProfiles.has(requiredProfile)) {
            failures.push(`missing_required_profile_${requiredProfile}`);
        }
    }
    // All runs must share one evidence signature so their results are comparable.
    let evidenceSignature = null;
    for (const [index, run] of runs.entries()) {
        const profile = run?.profile;
        const profileLabel = profile?.id ?? `run_${index}`;
        if (!profile) {
            failures.push(`missing_profile_${index}`);
            continue;
        }
        const response = run.response;
        if (!response || typeof response !== 'object') {
            failures.push(`missing_response_${profileLabel}`);
        }
        else {
            if (run.ok !== response.ok) {
                failures.push(`run_response_status_mismatch_${profileLabel}`);
            }
            if (run.ok && Array.isArray(response.failures) && response.failures.length > 0) {
                failures.push(`passed_run_has_failures_${profileLabel}`);
            }
        }
        const contract = profile.evidenceContract;
        if (!contract || typeof contract !== 'object') {
            // The per-profile checks and the signature both read the contract.
            failures.push(`missing_evidence_contract_${profileLabel}`);
            continue;
        }
        validateByokBenchmarkProfileForReport(profile, failures);
        const signature = byokEvidenceComparabilitySignature(contract);
        if (evidenceSignature == null) {
            evidenceSignature = signature;
        }
        else if (signature !== evidenceSignature) {
            failures.push(`evidence_contract_not_comparable_${profileLabel}`);
        }
    }
    for (const pointer of collectSensitiveReportPointers(report)) {
        failures.push(`sensitive_value_leak_${sanitizeFailureToken(pointer)}`);
    }
    return {
        ok: failures.length === 0,
        profiles: expectedProfiles,
        failures,
        warnings,
    };
}
|
|
225
|
+
/**
 * Returns a copy of the report carrying a freshly computed `contractValidation`.
 *
 * A `contractValidation` already present on the input is discarded first and is not
 * part of what gets validated.
 *
 * @param report Report to validate; it is not modified.
 * @param options Options forwarded to `validateByokBenchmarkReportContract`.
 * @returns A new report object with `contractValidation` as its last key.
 */
export function attachByokBenchmarkReportContractValidation(report, options = {}) {
    const { contractValidation: _staleValidation, ...bareReport } = report;
    const validated = { ...bareReport };
    validated.contractValidation = validateByokBenchmarkReportContract(bareReport, options);
    return validated;
}
|
|
232
|
+
/**
 * Scans a value for sensitive content using `collectSensitiveReportPointers`.
 *
 * @param value Value to scan.
 * @returns The de-duplicated scan results in `leaks`; `ok` is true when there are none.
 */
export function inspectByokBenchmarkArtifactSafety(value) {
    const uniqueLeaks = new Set(collectSensitiveReportPointers(value));
    const leaks = Array.from(uniqueLeaks);
    const ok = leaks.length === 0;
    return { ok, leaks };
}
|
|
239
|
+
/**
 * Builds the trend output for a report.
 *
 * @param report Report to summarise into a history entry.
 * @param previousHistory Earlier history entries, oldest first. Defaults to [].
 * @returns The history (previous entries plus the new one, last 50 kept) and the trend
 *   report; `delta` is null when there is no previous entry.
 */
export function buildByokBenchmarkTrend(report, previousHistory = []) {
    const latest = buildByokBenchmarkHistoryEntry(report);
    const priorEntry = previousHistory.at(-1) ?? null;
    // Only the most recent 50 entries are kept.
    const history = [...previousHistory, latest].slice(-50);
    let delta = null;
    if (priorEntry) {
        delta = buildByokBenchmarkTrendDelta(latest, priorEntry);
    }
    const trendReport = {
        kind: 'geotech-byok-provider-benchmark-trend',
        schemaVersion: 1,
        generatedAt: latest.generatedAt,
        current: latest,
        previous: priorEntry,
        delta,
        historyCount: history.length,
        note: 'Local BYOK trend output stores provider benchmark summaries only. Raw prompts, source evidence snippets, responses, private paths, and provider tokens are intentionally excluded.',
    };
    return { history, report: trendReport };
}
|
|
257
|
+
/**
 * Checks a trend report against its contract.
 *
 * @param report Trend report to validate.
 * @returns `ok` (no failures) plus de-duplicated failures and warnings.
 */
export function validateByokBenchmarkTrendContract(report) {
    const failures = [];
    const warnings = [];
    const fail = (token) => {
        failures.push(token);
    };
    if (report.kind !== 'geotech-byok-provider-benchmark-trend') {
        fail('wrong_trend_kind');
    }
    if (report.schemaVersion !== 1) {
        fail('wrong_trend_schema_version');
    }
    if (!report.generatedAt) {
        fail('trend_missing_generated_at');
    }
    const historyCountValid = Number.isInteger(report.historyCount) && report.historyCount >= 1;
    if (!historyCountValid) {
        fail('trend_history_count_invalid');
    }
    // The note only needs to mention one of the excluded input kinds.
    const noteMentionsExclusions = /raw prompts|source evidence snippets|provider tokens/i.test(report.note ?? '');
    if (!noteMentionsExclusions) {
        warnings.push('trend_note_should_state_excluded_sensitive_inputs');
    }
    validateByokBenchmarkHistoryEntry(report.current, failures, 'current');
    // Anything other than an explicit null is validated, including undefined.
    if (report.previous !== null) {
        validateByokBenchmarkHistoryEntry(report.previous, failures, 'previous');
    }
    if (report.previous && report.delta == null) {
        fail('trend_delta_required_when_previous_exists');
    }
    // Trend output must carry summaries only: no raw evidence, prompt or response keys.
    const serialized = JSON.stringify(report);
    const carriesRawContent = /"sourceEvidence"|"snippet"|"response"|"prompt"/.test(serialized);
    if (carriesRawContent) {
        fail('trend_contains_raw_prompt_response_or_source_evidence');
    }
    const safety = inspectByokBenchmarkArtifactSafety(report);
    for (const pointer of safety.leaks) {
        fail(`trend_sensitive_value_leak_${sanitizeFailureToken(pointer)}`);
    }
    return {
        ok: failures.length === 0,
        failures: [...new Set(failures)],
        warnings: [...new Set(warnings)],
    };
}
|
|
295
|
+
/**
 * Parses text as JSON and returns the value only when it is a non-array object.
 *
 * @param text Text to parse.
 * @returns The parsed object, or null for invalid JSON, arrays, primitives and null.
 */
function parseJsonObject(text) {
    let candidate;
    try {
        candidate = JSON.parse(text);
    }
    catch {
        return null;
    }
    const isPlainObject = candidate && typeof candidate === 'object' && !Array.isArray(candidate);
    return isPlainObject ? candidate : null;
}
|
|
306
|
+
/**
 * Reduces a report to a summary-only history entry.
 *
 * @param report Benchmark report; its runs are read for latency, token and evidence input values.
 * @returns A history entry with the report's counters, the rounded mean latency, the
 *   token total and the number of leaks found by the artifact safety inspection.
 */
function buildByokBenchmarkHistoryEntry(report) {
    const safety = inspectByokBenchmarkArtifactSafety(report);
    const isFiniteNumber = (value) => typeof value === 'number' && Number.isFinite(value);
    const latencies = report.runs.map((run) => run.latencyMs).filter((value) => isFiniteNumber(value));
    const tokenCounts = report.runs.map((run) => run.totalTokens).filter((value) => isFiniteNumber(value));
    const distinctInputs = new Set(report.runs.map((run) => run.profile.evidenceContract.evidenceInput));
    // Runs that disagree on the evidence input (or an empty report) are recorded as 'none'.
    let evidenceInput = 'none';
    if (distinctInputs.size === 1) {
        evidenceInput = distinctInputs.values().next().value;
    }
    let averageLatencyMs = null;
    if (latencies.length > 0) {
        let latencySum = 0;
        for (const latency of latencies) {
            latencySum += latency;
        }
        averageLatencyMs = Math.round(latencySum / latencies.length);
    }
    let totalTokens = null;
    if (tokenCounts.length > 0) {
        totalTokens = 0;
        for (const count of tokenCounts) {
            totalTokens += count;
        }
    }
    const { runCount, passedRuns, failedRuns, passed, profiles } = report.summary;
    return {
        kind: 'geotech-byok-provider-benchmark-history-entry',
        schemaVersion: 1,
        generatedAt: report.generatedAt,
        summary: {
            runCount,
            passedRuns,
            failedRuns,
            passed,
            profiles: [...profiles],
            evidenceInput,
            averageLatencyMs,
            totalTokens,
            pathLeakCount: safety.leaks.length,
        },
    };
}
|
|
336
|
+
function validateByokBenchmarkHistoryEntry(entry, failures, prefix) {
|
|
337
|
+
if (!entry || typeof entry !== 'object') {
|
|
338
|
+
failures.push(`${prefix}_history_entry_missing`);
|
|
339
|
+
return;
|
|
340
|
+
}
|
|
341
|
+
if (entry.kind !== 'geotech-byok-provider-benchmark-history-entry') {
|
|
342
|
+
failures.push(`${prefix}_history_wrong_kind`);
|
|
343
|
+
}
|
|
344
|
+
if (entry.schemaVersion !== 1) {
|
|
345
|
+
failures.push(`${prefix}_history_wrong_schema_version`);
|
|
346
|
+
}
|
|
347
|
+
if (!entry.generatedAt) {
|
|
348
|
+
failures.push(`${prefix}_history_missing_generated_at`);
|
|
349
|
+
}
|
|
350
|
+
const summary = entry.summary;
|
|
351
|
+
if (!summary || typeof summary !== 'object') {
|
|
352
|
+
failures.push(`${prefix}_history_summary_missing`);
|
|
353
|
+
return;
|
|
354
|
+
}
|
|
355
|
+
for (const key of ['runCount', 'passedRuns', 'failedRuns', 'pathLeakCount']) {
|
|
356
|
+
if (!Number.isInteger(summary[key]) || summary[key] < 0) {
|
|
357
|
+
failures.push(`${prefix}_history_${key}_invalid`);
|
|
358
|
+
}
|
|
359
|
+
}
|
|
360
|
+
if (summary.runCount !== summary.passedRuns + summary.failedRuns) {
|
|
361
|
+
failures.push(`${prefix}_history_run_count_mismatch`);
|
|
362
|
+
}
|
|
363
|
+
if (summary.passed !== (summary.runCount > 0 && summary.failedRuns === 0)) {
|
|
364
|
+
failures.push(`${prefix}_history_passed_flag_mismatch`);
|
|
365
|
+
}
|
|
366
|
+
if (!Array.isArray(summary.profiles)) {
|
|
367
|
+
failures.push(`${prefix}_history_profiles_invalid`);
|
|
368
|
+
}
|
|
369
|
+
if (summary.evidenceInput !== 'preprocessed-page-evidence' && summary.evidenceInput !== 'none') {
|
|
370
|
+
failures.push(`${prefix}_history_wrong_evidence_input`);
|
|
371
|
+
}
|
|
372
|
+
if (summary.pathLeakCount !== 0) {
|
|
373
|
+
failures.push(`${prefix}_history_path_leaks_present`);
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
/**
 * Computes current-minus-previous differences between two history entries.
 *
 * @param current Newer history entry.
 * @param previous Older history entry.
 * @returns Differences per counter; latency and token deltas are null when either side lacks a value.
 */
function buildByokBenchmarkTrendDelta(current, previous) {
    const now = current.summary;
    const before = previous.summary;
    const runCount = now.runCount - before.runCount;
    const passedRuns = now.passedRuns - before.passedRuns;
    const failedRuns = now.failedRuns - before.failedRuns;
    const averageLatencyMs = nullableDelta(now.averageLatencyMs, before.averageLatencyMs);
    const totalTokens = nullableDelta(now.totalTokens, before.totalTokens);
    const pathLeakCount = now.pathLeakCount - before.pathLeakCount;
    return { runCount, passedRuns, failedRuns, averageLatencyMs, totalTokens, pathLeakCount };
}
|
|
386
|
+
/**
 * Subtracts `previous` from `current` unless either value is missing.
 *
 * @param current Newer value, or null/undefined.
 * @param previous Older value, or null/undefined.
 * @returns `current - previous`, or null when either argument is null or undefined.
 */
function nullableDelta(current, previous) {
    if (current == null || previous == null) {
        return null;
    }
    return current - previous;
}
|
|
389
|
+
/**
 * Makes a string safe to embed in a failure token.
 *
 * @param value String to sanitise.
 * @returns The value with every run of characters outside `[a-zA-Z0-9_-]` replaced by
 *   a single underscore, cut to at most 48 characters.
 */
function sanitizeFailureToken(value) {
    const collapsed = value.replace(/[^a-zA-Z0-9_-]+/g, '_');
    return collapsed.slice(0, 48);
}
|
|
392
|
+
/**
 * Checks one benchmark profile against the report rules and records every violation.
 *
 * Nothing is returned; each violated rule appends a code of the form
 * `profile_<profile.id>_<reason>` to `failures`. Checks run in a fixed order:
 * request input flags, evidence-contract schema/input/prohibited inputs, required
 * response keys, review gates, extra rules for text-evidence profiles, and finally
 * the individual source-evidence items.
 *
 * @param {object} profile - Profile carrying `id`, the request input flags,
 *   `capabilityProfile` and `evidenceContract`.
 * @param {string[]} failures - Output array, mutated in place.
 */
function validateByokBenchmarkProfileForReport(profile, failures) {
    const { id: label, evidenceContract: contract } = profile;
    // Single place that builds the `profile_<id>_...` failure code.
    const flag = (reason) => {
        failures.push(`profile_${label}_${reason}`);
    };

    // The flags must be exactly `false`; a missing flag counts as a violation.
    if (profile.requestContainsImageInput !== false) {
        flag('contains_image_input');
    }
    if (profile.requestContainsNativePdfInput !== false) {
        flag('contains_native_pdf_input');
    }

    if (contract.schemaVersion !== 'geotech.byok-evidence-contract.v1') {
        flag('wrong_contract_schema');
    }
    if (contract.evidenceInput !== 'preprocessed-page-evidence') {
        flag('wrong_evidence_input');
    }

    const prohibited = contract.prohibitedInputs;
    if (!prohibited.includes('direct-image')) {
        flag('allows_direct_image');
    }
    if (!prohibited.includes('native-pdf')) {
        flag('allows_native_pdf');
    }

    // BYOK_REQUIRED_RESPONSE_KEYS is defined elsewhere in this module.
    for (const requiredKey of BYOK_REQUIRED_RESPONSE_KEYS) {
        if (!contract.requiredResponseKeys.includes(requiredKey)) {
            flag(`missing_required_key_${requiredKey}`);
        }
    }

    if (!contract.reviewGates.includes('source-evidence-required')) {
        flag('missing_source_evidence_gate');
    }
    if (!contract.reviewGates.includes('human-engineering-review-required')) {
        flag('missing_human_review_gate');
    }

    // Extra rules that only apply to ids listed in BYOK_TEXT_EVIDENCE_PROFILE_IDS.
    if (BYOK_TEXT_EVIDENCE_PROFILE_IDS.has(label)) {
        const policy = profile.capabilityProfile.preprocessingPolicy;
        if (!policy.requirePreprocessedEvidence) {
            flag('does_not_require_preprocessed_evidence');
        }
        const isFreeRoute = label === 'openrouter-free';
        if (isFreeRoute && !contract.reviewGates.includes('free-route-capacity-and-feature-variance')) {
            flag('missing_free_route_gate');
        }
    }

    const seenEvidenceIds = new Set();
    for (const evidence of contract.sourceEvidence) {
        // Sanitized lazily so the helper only runs when a failure is actually recorded.
        const token = () => sanitizeFailureToken(evidence.evidenceId);
        if (seenEvidenceIds.has(evidence.evidenceId)) {
            flag(`duplicate_evidence_id_${token()}`);
        }
        seenEvidenceIds.add(evidence.evidenceId);
        if (!evidence.sourcePage.trim()) {
            flag(`missing_source_page_${token()}`);
        }
        const confidenceInRange = Number.isFinite(evidence.confidence)
            && evidence.confidence >= 0
            && evidence.confidence <= 1;
        if (!confidenceInRange) {
            flag(`invalid_evidence_confidence_${token()}`);
        }
        if (!evidence.snippet.trim()) {
            flag(`empty_evidence_snippet_${token()}`);
        }
    }
    if (seenEvidenceIds.size === 0) {
        flag('missing_source_evidence');
    }
}
|
|
452
|
+
/**
 * Serializes the parts of an evidence contract that are compared between profiles.
 *
 * The JSON string contains `evidenceInput`, a sorted copy of `prohibitedInputs`,
 * `requiredResponseKeys` in their given order, and for each source-evidence item only
 * `evidenceId`, `sourcePage`, `method` and `snippet`. The input contract is not mutated.
 *
 * @param {object} contract - Evidence contract to fingerprint.
 * @returns {string} JSON signature string.
 */
function byokEvidenceComparabilitySignature(contract) {
    const evidenceInput = contract.evidenceInput;
    // Copy before sorting so the caller's array keeps its order.
    const prohibitedInputs = [...contract.prohibitedInputs];
    prohibitedInputs.sort();
    const requiredResponseKeys = contract.requiredResponseKeys;
    const sourceEvidence = contract.sourceEvidence.map((entry) => {
        const { evidenceId, sourcePage, method, snippet } = entry;
        return { evidenceId, sourcePage, method, snippet };
    });
    return JSON.stringify({ evidenceInput, prohibitedInputs, requiredResponseKeys, sourceEvidence });
}
|
|
465
|
+
/**
 * Returns a copy of a benchmark run with sensitive text redacted.
 *
 * The run and its `response` are shallow-copied. `profile`, `model`, the response's
 * `citedEvidenceIds`, `failures` and `warnings`, and `error` are replaced by redacted
 * versions. The `error` key is always present on the result and is `undefined` when
 * the run has no error.
 *
 * @param {object} run - Benchmark run to redact; not mutated.
 * @returns {object} Redacted copy of the run.
 */
function redactByokBenchmarkRun(run) {
    const redactedRun = { ...run };
    redactedRun.profile = redactByokProviderBenchmarkProfile(run.profile);
    redactedRun.model = redactNullableString(run.model);

    const redactedResponse = { ...run.response };
    redactedResponse.citedEvidenceIds = run.response.citedEvidenceIds.map(redactSensitiveReportText);
    redactedResponse.failures = run.response.failures.map(redactSensitiveReportText);
    redactedResponse.warnings = run.response.warnings.map(redactSensitiveReportText);
    redactedRun.response = redactedResponse;

    let redactedError;
    if (run.error != null) {
        redactedError = redactSensitiveReportText(run.error);
    }
    redactedRun.error = redactedError;
    return redactedRun;
}
|
|
479
|
+
/**
 * Returns a copy of a provider benchmark profile with sensitive text redacted.
 *
 * Shallow copies are made of the profile, its `capabilityProfile` and its
 * `evidenceContract`. Model ids at both levels are redacted (null/undefined become
 * `null`), as are both `reviewGates` lists, `requiredResponseKeys`, and the
 * `evidenceId`, `sourcePage` and `snippet` of every source-evidence item.
 *
 * @param {object} profile - Profile to redact; not mutated.
 * @returns {object} Redacted copy of the profile.
 */
function redactByokProviderBenchmarkProfile(profile) {
    const redactedProfile = { ...profile };
    redactedProfile.modelId = redactNullableString(profile.modelId);
    redactedProfile.visionModelId = redactNullableString(profile.visionModelId);

    const capability = { ...profile.capabilityProfile };
    capability.modelId = redactNullableString(profile.capabilityProfile.modelId);
    capability.visionModelId = redactNullableString(profile.capabilityProfile.visionModelId);
    capability.reviewGates = profile.capabilityProfile.reviewGates.map(redactSensitiveReportText);
    redactedProfile.capabilityProfile = capability;

    const contract = { ...profile.evidenceContract };
    contract.sourceEvidence = profile.evidenceContract.sourceEvidence.map((entry) => {
        const redactedEntry = { ...entry };
        redactedEntry.evidenceId = redactSensitiveReportText(entry.evidenceId);
        redactedEntry.sourcePage = redactSensitiveReportText(entry.sourcePage);
        redactedEntry.snippet = redactSensitiveReportText(entry.snippet);
        return redactedEntry;
    });
    contract.requiredResponseKeys = profile.evidenceContract.requiredResponseKeys.map(redactSensitiveReportText);
    contract.reviewGates = profile.evidenceContract.reviewGates.map(redactSensitiveReportText);
    redactedProfile.evidenceContract = contract;

    return redactedProfile;
}
|
|
503
|
+
/**
 * Redacts a string that may be absent.
 *
 * @param {string|null|undefined} value - Text to redact.
 * @returns {string|null} `null` when `value` is null or undefined, otherwise the
 *   result of `redactSensitiveReportText(value)`.
 */
function redactNullableString(value) {
    if (value == null) {
        return null;
    }
    return redactSensitiveReportText(value);
}
|
|
506
|
+
/**
 * Masks filesystem paths and credential-looking substrings in report text.
 *
 * Replacements are applied in order:
 *  1. Windows drive paths (`C:\...`, `C:/...`)           -> `[redacted-path]`
 *  2. POSIX paths under common roots (`/Users`, `/home`,
 *     `/var`, `/tmp`, `/private`, `/mnt`, `/Volumes`,
 *     `/etc`)                                            -> `[redacted-path]`
 *  3. `sk-or-v1-...`, `sk-ant-...`, `sk-...`, `hf_...`   -> prefix + `***`
 *  4. `Bearer <token>`                                   -> `Bearer ***`
 *  5. `api_key` / `token` / `secret` key-value pairs     -> value replaced by `***`
 *
 * The output is stable under re-application (redacting redacted text changes
 * nothing), which matters because `collectSensitiveReportPointers` detects leaks by
 * comparing a string with its redacted form.
 *
 * @param {string} value - Text to scrub.
 * @returns {string} Text with sensitive spans replaced.
 */
function redactSensitiveReportText(value) {
    return value
        // Windows drive-letter paths.
        .replace(/\b[A-Za-z]:[\\/][^\s"',}<]+/g, '[redacted-path]')
        // POSIX paths. Besides whitespace/quotes/`=`, a path may follow `(` (stack
        // frames such as `at fn (/home/...)`), `[`, `:` or a backtick; the Windows
        // rule above already catches paths in those positions.
        .replace(/(^|[\s"'=(\[:`])\/(?:Users|home|var|tmp|private|mnt|Volumes|etc)\/[^\s"',}<]+/gi, '$1[redacted-path]')
        // Provider-specific key prefixes go first so their prefix survives in the mask.
        .replace(/sk-or-v1-[A-Za-z0-9_-]{8,}/g, 'sk-or-v1-***')
        .replace(/sk-ant-[A-Za-z0-9_-]{8,}/g, 'sk-ant-***')
        // The generic prefixes must not start in the middle of a word; otherwise
        // ordinary text such as "risk-assessment-summary" or "chf_amount_total" is
        // mangled and then reported as a leak.
        .replace(/(?<![A-Za-z0-9])sk-[A-Za-z0-9_-]{8,}/g, 'sk-***')
        .replace(/(?<![A-Za-z0-9])hf_[A-Za-z0-9_-]{8,}/g, 'hf_***')
        .replace(/Bearer\s+[A-Za-z0-9._-]{16,}/gi, 'Bearer ***')
        // Optional quotes around the key and value cover JSON-style `"api_key":"..."`
        // pairs as well as plain `api_key=...`.
        .replace(/((?:api[_-]?key|token|secret)["']?\s*[:=]\s*["']?)[A-Za-z0-9._-]{8,}/gi, '$1***');
}
|
|
517
|
+
/**
 * Walks a report value and lists the locations of strings that still contain
 * sensitive text.
 *
 * A string counts as sensitive when `redactSensitiveReportText` would change it.
 * Locations are built by appending `_<index>` for array elements and `_<key>` for
 * object properties to `pointer`. Non-string primitives and `null` contribute nothing.
 *
 * @param {*} value - Value to inspect (string, array, object or other primitive).
 * @param {string} [pointer='report'] - Location label for `value`.
 * @returns {string[]} Location labels of every sensitive string found.
 */
function collectSensitiveReportPointers(value, pointer = 'report') {
    if (typeof value === 'string') {
        const unchanged = redactSensitiveReportText(value) === value;
        return unchanged ? [] : [pointer];
    }
    if (value === null || typeof value !== 'object') {
        return [];
    }
    const pointers = [];
    if (Array.isArray(value)) {
        value.forEach((element, position) => {
            pointers.push(...collectSensitiveReportPointers(element, `${pointer}_${position}`));
        });
        return pointers;
    }
    for (const [property, child] of Object.entries(value)) {
        pointers.push(...collectSensitiveReportPointers(child, `${pointer}_${property}`));
    }
    return pointers;
}
|
|
529
|
+
//# sourceMappingURL=byok-benchmark.js.map
|