@tracemarketplace/shared 0.0.10 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/extractor-claude-code.test.js +53 -0
- package/dist/extractor-claude-code.test.js.map +1 -1
- package/dist/extractor-codex.test.js +5 -0
- package/dist/extractor-codex.test.js.map +1 -1
- package/dist/extractors/claude-code.d.ts.map +1 -1
- package/dist/extractors/claude-code.js +4 -4
- package/dist/extractors/claude-code.js.map +1 -1
- package/dist/extractors/codex.d.ts.map +1 -1
- package/dist/extractors/codex.js +3 -1
- package/dist/extractors/codex.js.map +1 -1
- package/dist/extractors/common.d.ts +1 -2
- package/dist/extractors/common.d.ts.map +1 -1
- package/dist/extractors/common.js +2 -37
- package/dist/extractors/common.js.map +1 -1
- package/dist/extractors/common.test.d.ts +2 -0
- package/dist/extractors/common.test.d.ts.map +1 -0
- package/dist/extractors/common.test.js +17 -0
- package/dist/extractors/common.test.js.map +1 -0
- package/dist/extractors/cursor.d.ts.map +1 -1
- package/dist/extractors/cursor.js +8 -0
- package/dist/extractors/cursor.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/redact.d.ts.map +1 -1
- package/dist/redact.js +3 -1
- package/dist/redact.js.map +1 -1
- package/dist/redact.test.js +9 -0
- package/dist/redact.test.js.map +1 -1
- package/dist/scoring.d.ts +5 -3
- package/dist/scoring.d.ts.map +1 -1
- package/dist/scoring.fixtures.test.d.ts +2 -0
- package/dist/scoring.fixtures.test.d.ts.map +1 -0
- package/dist/scoring.fixtures.test.js +47 -0
- package/dist/scoring.fixtures.test.js.map +1 -0
- package/dist/scoring.js +381 -62
- package/dist/scoring.js.map +1 -1
- package/dist/scoring.test.js +125 -26
- package/dist/scoring.test.js.map +1 -1
- package/dist/tool-normalization.d.ts +66 -0
- package/dist/tool-normalization.d.ts.map +1 -0
- package/dist/tool-normalization.generated.d.ts +181 -0
- package/dist/tool-normalization.generated.d.ts.map +1 -0
- package/dist/tool-normalization.generated.js +261 -0
- package/dist/tool-normalization.generated.js.map +1 -0
- package/dist/tool-normalization.js +463 -0
- package/dist/tool-normalization.js.map +1 -0
- package/dist/tool-normalization.test.d.ts +2 -0
- package/dist/tool-normalization.test.d.ts.map +1 -0
- package/dist/tool-normalization.test.js +188 -0
- package/dist/tool-normalization.test.js.map +1 -0
- package/dist/types.d.ts +38 -1
- package/dist/types.d.ts.map +1 -1
- package/dist/validators.d.ts +23 -6
- package/dist/validators.d.ts.map +1 -1
- package/dist/validators.js +4 -0
- package/dist/validators.js.map +1 -1
- package/dist/validators.test.js +7 -0
- package/dist/validators.test.js.map +1 -1
- package/package.json +5 -5
- package/scripts/generate-tool-normalization.mjs +16 -0
- package/src/extractor-claude-code.test.ts +59 -0
- package/src/extractor-codex.test.ts +5 -0
- package/src/extractors/claude-code.ts +8 -4
- package/src/extractors/codex.ts +4 -2
- package/src/extractors/common.test.ts +21 -0
- package/src/extractors/common.ts +15 -49
- package/src/extractors/cursor.ts +9 -0
- package/src/index.ts +1 -0
- package/src/redact.test.ts +9 -0
- package/src/redact.ts +3 -1
- package/src/scoring.fixtures.test.ts +71 -0
- package/src/scoring.test.ts +151 -26
- package/src/scoring.ts +582 -84
- package/src/tool-normalization.generated.ts +262 -0
- package/src/tool-normalization.spec.json +205 -0
- package/src/tool-normalization.test.ts +221 -0
- package/src/tool-normalization.ts +670 -0
- package/src/types.ts +50 -0
- package/src/validators.test.ts +8 -0
- package/src/validators.ts +8 -0
package/src/scoring.ts
CHANGED
|
@@ -1,76 +1,446 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type {
|
|
2
|
+
FailureMode,
|
|
3
|
+
JsonValue,
|
|
4
|
+
NormalizedTrace,
|
|
5
|
+
QualityTier,
|
|
6
|
+
TraceScore,
|
|
7
|
+
TraceScoreBreakdown,
|
|
8
|
+
TraceScoreComponent,
|
|
9
|
+
TraceScoreContext,
|
|
10
|
+
} from "./types.js";
|
|
11
|
+
import {
|
|
12
|
+
extractFailureExchanges,
|
|
13
|
+
type FailureExchange,
|
|
14
|
+
normalizeTraceForEvaluation,
|
|
15
|
+
type TraceNormalization,
|
|
16
|
+
} from "./tool-normalization.js";
|
|
2
17
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
18
|
+
// Text pattern treated as evidence of a context-limit problem: the word
// "context" followed, later on the same line, by "limit", "window" or "maximum".
// (The former "|context limit" alternative was already covered by this branch.)
const CONTEXT_LIMIT_REGEX = /context.*(limit|window|maximum)/i;
// Combined input + output token count above which hasContextLimitEvidence reports true.
const CONTEXT_LIMIT_TOKEN_THRESHOLD = 150_000;
// Boilerplate error lines that normalizeErrorText skips before masking.
const GENERIC_ERROR_LINE_REGEX =
  /^(process exited with code|exit status \d+|\^c|error: process exited|process running with session id)/i;
// Lines that, after masking, are only "fail"/"error" plus an optional <file>/<path>.
const LOW_SIGNAL_ERROR_LINE_REGEX = /^(fail|error)(\s+(<file>|<path>))?$/i;
// Number of distinct exchanges that must share one error signature to count as a repeat.
const REPEATED_ROOT_CAUSE_EXCHANGE_THRESHOLD = 3;
// Minimum totals used by deriveQualityTier for the "silver" and "gold" tiers.
const SILVER_SCORE_THRESHOLD = 0.64;
const GOLD_SCORE_THRESHOLD = 0.82;
// Reported as `scorer_version` on every TraceScore.
const SCORE_VERSION = "v1-signal-aggregation";
// Relative weight of each component in the weighted average computed by scoreTrace.
const COMPONENT_WEIGHTS = {
  completeness: 1.25,
  fidelity: 1.2,
  executionDepth: 1.0,
  failureSalience: 1.2,
  complexity: 0.9,
  workflowShape: 0.25,
  lengthBucket: 0.2,
  toolDensity: 0.2,
  failureJudge: 0.45,
  novelty: 0.55,
} as const;
// Sum of every weight above; reported as `total_weight` in the score breakdown.
const TOTAL_COMPONENT_WEIGHT = Object.values(COMPONENT_WEIGHTS).reduce((sum, weight) => sum + weight, 0);
// Failure modes that normalizeFailureModes rewrites to a canonical mode.
const FAILURE_MODE_ALIASES: Partial<Record<FailureMode, FailureMode>> = {
  repeated_tool_calls: "repeated_failing_root_cause",
};
// Score assigned to each `workflow_shape` label supplied through the score context.
const WORKFLOW_SHAPE_SCORES = {
  chat_only: 0.2,
  tool_other: 0.5,
  shell_only: 0.62,
  editor_only: 0.68,
  shell_and_editor: 0.9,
} as const;
// Score assigned to each `length_bucket` label.
const LENGTH_BUCKET_SCORES = {
  short: 0.35,
  medium: 0.68,
  long: 0.92,
} as const;
// Score assigned to each `tool_density` label.
const TOOL_DENSITY_SCORES = {
  none: 0.2,
  light: 0.45,
  medium: 0.72,
  heavy: 0.9,
} as const;
// Base score for each failure-judge verdict.
const FAILURE_JUDGE_VERDICT_SCORES: Record<string, number> = {
  confirmed_failure: 0.92,
  unclear: 0.55,
  false_positive: 0.18,
};
// Additive adjustment applied to the verdict score for each agreement label.
const FAILURE_JUDGE_AGREEMENT_ADJUSTMENT: Record<string, number> = {
  agree: 0.08,
  partial: 0.03,
  disagree: -0.08,
};
|
|
71
|
+
|
|
72
|
+
/** Aggregate counts derived from a trace normalization, used by the component scorers. */
interface NormalizationStats {
  /** Number of exchanges in the normalization. */
  exchangeCount: number;
  /** Total number of actions summed over every exchange. */
  actionCount: number;
  /** Number of distinct normalized tool ids seen across all actions. */
  uniqueToolKinds: number;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Reports whether the trace shows evidence of approaching the context limit.
 *
 * Evidence is either a combined input + output token count above
 * CONTEXT_LIMIT_TOKEN_THRESHOLD (missing totals count as 0), or any text block
 * whose text matches CONTEXT_LIMIT_REGEX.
 */
function hasContextLimitEvidence(trace: NormalizedTrace): boolean {
  const inputTokens = trace.total_input_tokens ?? 0;
  const outputTokens = trace.total_output_tokens ?? 0;
  if (inputTokens + outputTokens > CONTEXT_LIMIT_TOKEN_THRESHOLD) {
    return true;
  }

  for (const turn of trace.turns) {
    for (const block of turn.content) {
      if (block.type === "text" && CONTEXT_LIMIT_REGEX.test(block.text)) {
        return true;
      }
    }
  }

  return false;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Reduces raw error output to one normalized line suitable for comparison.
 *
 * The first line that is neither blank, generic boilerplate
 * (GENERIC_ERROR_LINE_REGEX) nor low-signal after masking
 * (LOW_SIGNAL_ERROR_LINE_REGEX) decides the result: paths, file names, hex
 * literals and integers in it are replaced by placeholders.
 *
 * @param text Raw error text; null, undefined or empty yields null.
 * @param limit Maximum length of the returned string (default 140).
 * @returns The masked line truncated to `limit`, or null when no candidate
 *   line exists or the first candidate is shorter than 12 characters.
 */
function normalizeErrorText(text: string | null | undefined, limit = 140): string | null {
  if (!text) return null;

  // Hoisted so the patterns are not rebuilt for every line.
  const whitespaceRun = /\s+/g;
  const pathLike = /\/[^\s]+/g;
  const fileLike = /\b[\w./-]+\.[a-z]{1,6}\b/g;
  const hexLiteral = /\b0x[a-f0-9]+\b/g;
  const integer = /\b\d+\b/g;

  for (const rawLine of text.split(/\r?\n/)) {
    const collapsed = rawLine.replace(whitespaceRun, " ").trim().toLowerCase();
    if (!collapsed || GENERIC_ERROR_LINE_REGEX.test(collapsed)) {
      continue;
    }

    // Masking order matters: paths first, so file names inside them are not
    // masked separately.
    const masked = collapsed
      .replace(pathLike, "<path>")
      .replace(fileLike, "<file>")
      .replace(hexLiteral, "<hex>")
      .replace(integer, "<num>");

    // The pattern also matches a bare "fail" / "error", so no separate
    // equality checks are needed.
    if (LOW_SIGNAL_ERROR_LINE_REGEX.test(masked)) {
      continue;
    }

    // Only the first candidate line is ever used, so stop scanning here.
    return masked.length < 12 ? null : masked.slice(0, limit);
  }

  return null;
}
|
|
121
|
+
|
|
122
|
+
/**
 * Builds a comparable signature for a failing tool result.
 *
 * @param toolName Name of the tool that produced the result; blank or missing becomes "unknown".
 * @param resultContent Raw result text, reduced through normalizeErrorText.
 * @param exitCode Exit code appended as ":<code>" when present.
 * @returns "<tool>[:<exit>]|<normalized text>", or null when the text yields no usable line.
 */
function buildErrorSignature(
  toolName: string | null | undefined,
  resultContent: string | null | undefined,
  exitCode: number | null | undefined,
): string | null {
  const message = normalizeErrorText(resultContent);
  if (!message) {
    return null;
  }

  const tool = toolName?.trim().toLowerCase() || "unknown";
  const prefix = exitCode == null ? tool : `${tool}:${exitCode}`;
  return `${prefix}|${message}`;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Reports whether the same error signature shows up in at least
 * REPEATED_ROOT_CAUSE_EXCHANGE_THRESHOLD distinct exchanges.
 *
 * Every user turn starts a new exchange and is otherwise skipped. In the
 * remaining turns, tool_use blocks record the tool name per call id, and
 * failing tool_result blocks (is_error set or non-zero exit code) are turned
 * into signatures via buildErrorSignature.
 */
function hasRepeatedFailingRootCause(trace: NormalizedTrace): boolean {
  const toolNameByCallId = new Map<string, string>();
  const exchangesBySignature = new Map<string, Set<number>>();
  let currentExchange = -1;

  for (const turn of trace.turns) {
    if (turn.role === "user") {
      currentExchange += 1;
      continue;
    }

    // Non-user turns before any user turn belong to exchange 0.
    if (currentExchange < 0) {
      currentExchange = 0;
    }

    for (const block of turn.content) {
      if (block.type === "tool_use") {
        toolNameByCallId.set(block.tool_call_id, block.tool_name);
      } else if (block.type === "tool_result") {
        const exitedNonZero = block.exit_code != null && block.exit_code !== 0;
        if (block.is_error || exitedNonZero) {
          const signature = buildErrorSignature(
            toolNameByCallId.get(block.tool_call_id),
            block.result_content,
            block.exit_code,
          );
          if (signature) {
            let seenIn = exchangesBySignature.get(signature);
            if (!seenIn) {
              seenIn = new Set<number>();
              exchangesBySignature.set(signature, seenIn);
            }
            seenIn.add(currentExchange);
          }
        }
      }
    }
  }

  const exchangeSets = Array.from(exchangesBySignature.values());
  for (const seenIn of exchangeSets) {
    if (seenIn.size >= REPEATED_ROOT_CAUSE_EXCHANGE_THRESHOLD) {
      return true;
    }
  }
  return false;
}
|
|
184
|
+
|
|
185
|
+
/**
 * Reports whether any exchange with outcome "failure" is immediately followed
 * by an exchange with outcome "success" or "success_after_retry".
 */
function hasGracefulRecovery(exchanges: FailureExchange[]): boolean {
  const recoveredOutcomes: readonly string[] = ["success", "success_after_retry"];

  for (let next = 1; next < exchanges.length; next += 1) {
    const before = exchanges[next - 1]?.outcome;
    const after = exchanges[next]?.outcome ?? "";
    if (before === "failure" && recoveredOutcomes.includes(after)) {
      return true;
    }
  }

  return false;
}
|
|
197
|
+
|
|
198
|
+
/**
 * Clamps a number into the closed interval [0, 1].
 *
 * NaN is mapped to 0: Math.min/Math.max would otherwise pass it through, and
 * the result would no longer honor the [0, 1] contract callers rely on.
 */
function clamp01(value: number): number {
  if (Number.isNaN(value)) {
    return 0;
  }
  return Math.max(0, Math.min(1, value));
}
|
|
201
|
+
|
|
202
|
+
/** Rounds a number to at most four decimal places. */
function round4(value: number): number {
  const fixed = value.toFixed(4);
  return Number.parseFloat(fixed);
}
|
|
205
|
+
|
|
206
|
+
/**
 * Maps `value` linearly from [minValue, maxValue] onto [0, 1], clamping
 * values outside the range.
 *
 * When the range is empty or inverted (maxValue <= minValue) the result is a
 * step: 1 if `value` exceeds `minValue`, otherwise 0.
 */
function normalizeLinear(value: number, minValue: number, maxValue: number): number {
  const isDegenerateRange = maxValue <= minValue;
  if (isDegenerateRange) {
    return value > minValue ? 1 : 0;
  }

  const fraction = (value - minValue) / (maxValue - minValue);
  return clamp01(fraction);
}
|
|
213
|
+
|
|
214
|
+
/**
 * Maps `value` from [minValue, maxValue] onto [0, 1] on a log1p scale.
 *
 * Values at or below `minValue` give 0, values at or above `maxValue` give 1,
 * and a non-positive logarithmic range gives 0.
 */
function normalizeLogScale(value: number, minValue: number, maxValue: number): number {
  if (value <= minValue) {
    return 0;
  }
  if (value >= maxValue) {
    return 1;
  }

  const logFloor = Math.log1p(minValue);
  const offset = Math.log1p(value) - logFloor;
  const logRange = Math.log1p(maxValue) - logFloor;
  return logRange <= 0 ? 0 : clamp01(offset / logRange);
}
|
|
223
|
+
|
|
224
|
+
/** Counts the tool_use blocks across every turn of the trace. */
function countToolUses(trace: NormalizedTrace): number {
  return trace.turns.reduce(
    (total, turn) => total + turn.content.filter((block) => block.type === "tool_use").length,
    0,
  );
}
|
|
65
235
|
|
|
66
|
-
|
|
67
|
-
const
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
)
|
|
73
|
-
|
|
236
|
+
/**
 * Summarizes a normalization: how many exchanges it has, how many actions
 * those exchanges contain in total, and how many distinct normalized tool ids
 * the actions use.
 */
function collectNormalizationStats(normalization: TraceNormalization): NormalizationStats {
  const seenToolIds = new Set<string>();
  let totalActions = 0;

  for (const { actions } of normalization.exchanges) {
    totalActions += actions.length;
    for (const { normalizedToolId } of actions) {
      seenToolIds.add(normalizedToolId);
    }
  }

  return {
    exchangeCount: normalization.exchanges.length,
    actionCount: totalActions,
    uniqueToolKinds: seenToolIds.size,
  };
}
|
|
253
|
+
|
|
254
|
+
/**
 * Canonicalizes a list of failure modes: aliases from FAILURE_MODE_ALIASES are
 * replaced by their target mode and duplicates are dropped, keeping
 * first-seen order.
 *
 * @returns The canonical modes, or ["no_failure"] when the input is empty,
 *   null or undefined.
 */
function normalizeFailureModes(
  failureModes: FailureMode[] | null | undefined,
): FailureMode[] {
  const canonicalModes = new Set<FailureMode>();
  for (const mode of failureModes ?? []) {
    canonicalModes.add(FAILURE_MODE_ALIASES[mode] ?? mode);
  }

  if (canonicalModes.size === 0) {
    return ["no_failure"];
  }
  return Array.from(canonicalModes);
}
|
|
266
|
+
|
|
267
|
+
/**
 * Chooses the failure modes to score with: a non-empty override list wins,
 * otherwise the modes come from detectFailureModes. Either way the result is
 * passed through normalizeFailureModes.
 */
function resolveFailureModes(
  trace: NormalizedTrace,
  normalization: TraceNormalization,
  overrideModes?: FailureMode[],
): FailureMode[] {
  const sourceModes =
    overrideModes && overrideModes.length > 0
      ? overrideModes
      : detectFailureModes(trace, normalization);
  return normalizeFailureModes(sourceModes);
}
|
|
277
|
+
|
|
278
|
+
/**
 * Converts a completeness verdict into a component score: 1 for "complete",
 * 0.45 for "incomplete", and 0.02 for any other value.
 */
function scoreCompletenessValue(
  completeness: TraceScore["completeness"],
): number {
  switch (completeness) {
    case "complete":
      return 1;
    case "incomplete":
      return 0.45;
    default:
      return 0.02;
  }
}
|
|
285
|
+
|
|
286
|
+
/** Scores content fidelity: 1 for "full", 0.4 for anything else. */
function scoreFidelityValue(contentFidelity: NormalizedTrace["content_fidelity"]): number {
  if (contentFidelity === "full") {
    return 1;
  }
  return 0.4;
}
|
|
289
|
+
|
|
290
|
+
/**
 * Scores how much actual execution the trace contains.
 *
 * Combines a small base (larger when at least one tool call exists) with
 * weighted tool-call, exchange and action counts, plus fixed bonuses for
 * shell commands, file changes and thinking blocks. The sum is clamped to
 * [0, 1]. The tool-call count falls back to counting tool_use blocks when
 * `trace.tool_call_count` is absent.
 */
function scoreExecutionDepthValue(
  trace: NormalizedTrace,
  normalizationStats: NormalizationStats,
): number {
  const toolCalls = trace.tool_call_count ?? countToolUses(trace);

  const floor = toolCalls > 0 ? 0.12 : 0.04;
  const toolCallPart = normalizeLogScale(toolCalls, 0, 12) * 0.34;
  const exchangePart = normalizeLinear(normalizationStats.exchangeCount, 0, 6) * 0.22;
  const actionPart = normalizeLogScale(normalizationStats.actionCount, 0, 18) * 0.18;

  const modalityBonus =
    (trace.has_shell_commands ? 0.18 : 0)
    + (trace.has_file_changes ? 0.18 : 0)
    + (trace.has_thinking_blocks ? 0.08 : 0);

  return clamp01(floor + toolCallPart + exchangePart + actionPart + modalityBonus);
}
|
|
313
|
+
|
|
314
|
+
/**
 * Scores how informative the trace's failure modes are.
 *
 * Starts from 0.12 when "no_failure" is present (0.18 otherwise), adds a
 * fixed bonus for every recognized failure mode present, and clamps the sum
 * to [0, 1].
 */
function scoreFailureSalienceValue(failureModes: FailureMode[]): number {
  const bonuses: ReadonlyArray<readonly [FailureMode, number]> = [
    ["tool_call_failure", 0.18],
    ["repeated_failing_root_cause", 0.24],
    ["context_limit_approached", 0.12],
    ["graceful_recovery", 0.22],
    ["catastrophic_failure", 0.18],
  ];

  let salience = failureModes.includes("no_failure") ? 0.12 : 0.18;
  for (const [mode, bonus] of bonuses) {
    if (failureModes.includes(mode)) {
      salience += bonus;
    }
  }
  return clamp01(salience);
}
|
|
323
|
+
|
|
324
|
+
/**
 * Scores trace complexity from four normalized signals: total tokens
 * (log scale, 800 to 80,000), turn count (2 to 24), exchange count (1 to 8)
 * and distinct tool kinds (1 to 6). The weighted sum is clamped to [0, 1].
 * Missing token totals count as 0; a missing `turn_count` falls back to the
 * number of turns.
 */
function scoreComplexityValue(
  trace: NormalizedTrace,
  normalizationStats: NormalizationStats,
): number {
  const tokens = (trace.total_input_tokens ?? 0) + (trace.total_output_tokens ?? 0);
  const turns = trace.turn_count ?? trace.turns.length;

  const tokenPart = normalizeLogScale(tokens, 800, 80_000) * 0.38;
  const turnPart = normalizeLinear(turns, 2, 24) * 0.23;
  const exchangePart = normalizeLinear(normalizationStats.exchangeCount, 1, 8) * 0.17;
  const varietyPart = normalizeLinear(normalizationStats.uniqueToolKinds, 1, 6) * 0.22;

  return clamp01(tokenPart + turnPart + exchangePart + varietyPart);
}
|
|
343
|
+
|
|
344
|
+
/**
 * Looks up the score for a label in a score table.
 *
 * Only the table's own keys count: the `in` operator would also accept
 * inherited names such as "toString" or "constructor" and hand back a
 * function instead of a number.
 *
 * @param value Label to look up; null, undefined or empty yields null.
 * @param scoreMap Table from label to score.
 * @returns The mapped score, or null when the label is not a key of the table.
 */
function scoreMappedLabelValue(
  value: string | null | undefined,
  scoreMap: Record<string, number>,
): number | null {
  if (!value) return null;
  if (!Object.prototype.hasOwnProperty.call(scoreMap, value)) return null;

  const mapped = scoreMap[value];
  return typeof mapped === "number" ? mapped : null;
}
|
|
351
|
+
|
|
352
|
+
/**
 * Scores the failure-judge signal supplied through the score context.
 *
 * The verdict's base score is shifted by the agreement adjustment and scaled
 * by a confidence multiplier of 0.7 + 0.3 * confidence, with confidence
 * clamped to [0, 1]. The product is clamped to [0, 1].
 *
 * Both tables are consulted through own-property checks, so inherited names
 * such as "constructor" are treated as unknown labels. A missing or NaN
 * confidence leaves the score unscaled instead of turning it into NaN.
 *
 * @returns The judge score, or null when no verdict is given or the verdict
 *   is not a key of FAILURE_JUDGE_VERDICT_SCORES.
 */
function scoreFailureJudgeValue(context: TraceScoreContext): number | null {
  const hasOwn = (table: Record<string, number>, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(table, key);

  const verdict = context.failure_judge_verdict;
  if (!verdict || !hasOwn(FAILURE_JUDGE_VERDICT_SCORES, verdict)) {
    return null;
  }

  const verdictScore = FAILURE_JUDGE_VERDICT_SCORES[verdict];
  if (verdictScore == null) {
    return null;
  }

  const agreement = context.failure_judge_agreement;
  const agreementAdjustment =
    agreement && hasOwn(FAILURE_JUDGE_AGREEMENT_ADJUSTMENT, agreement)
      ? (FAILURE_JUDGE_AGREEMENT_ADJUSTMENT[agreement] ?? 0)
      : 0;

  const rawConfidence = context.failure_judge_confidence;
  const confidenceMultiplier =
    rawConfidence == null || Number.isNaN(rawConfidence)
      ? 1
      : 0.7 + (clamp01(rawConfidence) * 0.3);

  return clamp01((verdictScore + agreementAdjustment) * confidenceMultiplier);
}
|
|
372
|
+
|
|
373
|
+
/**
 * Converts a corpus anomaly score into a novelty score on a log1p scale,
 * where an anomaly score of 4 or more maps to 1.
 *
 * @returns The novelty score in [0, 1], or null when the anomaly score is
 *   missing, not finite, or not positive.
 */
function scoreNoveltyValue(anomalyScore: number | null | undefined): number | null {
  if (typeof anomalyScore !== "number" || !Number.isFinite(anomalyScore) || anomalyScore <= 0) {
    return null;
  }

  const saturation = Math.log1p(4);
  return clamp01(Math.log1p(anomalyScore) / saturation);
}
|
|
380
|
+
|
|
381
|
+
/**
 * Assembles one available score component.
 *
 * The score is clamped to [0, 1] and both score and weight are rounded to
 * four decimals; `available` is always true.
 */
function buildComponent(
  key: string,
  label: string,
  score: number,
  weight: number,
  source: TraceScoreComponent["source"],
  reason: string,
  evidence: JsonValue | null,
): TraceScoreComponent {
  const boundedScore = round4(clamp01(score));
  const roundedWeight = round4(weight);

  const component: TraceScoreComponent = {
    key,
    label,
    score: boundedScore,
    weight: roundedWeight,
    source,
    available: true,
    reason,
    evidence,
  };
  return component;
}
|
|
401
|
+
|
|
402
|
+
/**
 * Wraps the components in a breakdown record: the aggregation method, the
 * component count, the summed weight of the given components, and the total
 * weight of all possible components.
 */
function buildScoreBreakdown(components: TraceScoreComponent[]): TraceScoreBreakdown {
  let weightSum = 0;
  for (const { weight } of components) {
    weightSum += weight;
  }

  return {
    aggregation: "weighted_average",
    component_count: components.length,
    available_weight: round4(weightSum),
    total_weight: round4(TOTAL_COMPONENT_WEIGHT),
    components,
  };
}
|
|
413
|
+
|
|
414
|
+
/**
 * Maps a total score to a quality tier.
 *
 * @param total Aggregated trace score.
 * @returns "gold" at or above GOLD_SCORE_THRESHOLD, "silver" at or above
 *   SILVER_SCORE_THRESHOLD, otherwise "bronze".
 */
export function deriveQualityTier(total: number): QualityTier {
  return total >= GOLD_SCORE_THRESHOLD
    ? "gold"
    : total >= SILVER_SCORE_THRESHOLD
      ? "silver"
      : "bronze";
}
|
|
419
|
+
|
|
420
|
+
export function detectFailureModes(
|
|
421
|
+
trace: NormalizedTrace,
|
|
422
|
+
normalization?: TraceNormalization,
|
|
423
|
+
): FailureMode[] {
|
|
424
|
+
const modes = new Set<FailureMode>();
|
|
425
|
+
const exchanges = normalization
|
|
426
|
+
? normalization.exchanges.map((exchange) => ({
|
|
427
|
+
toolTokens: exchange.toolTokens,
|
|
428
|
+
hasError: exchange.hasError,
|
|
429
|
+
outcome: exchange.outcome,
|
|
430
|
+
}))
|
|
431
|
+
: extractFailureExchanges(trace);
|
|
432
|
+
|
|
433
|
+
const hasToolError = exchanges.some((exchange) => exchange.hasError);
|
|
434
|
+
if (hasToolError) modes.add("tool_call_failure");
|
|
435
|
+
|
|
436
|
+
if (hasRepeatedFailingRootCause(trace)) modes.add("repeated_failing_root_cause");
|
|
437
|
+
|
|
438
|
+
if (hasContextLimitEvidence(trace)) modes.add("context_limit_approached");
|
|
439
|
+
|
|
440
|
+
if (hasGracefulRecovery(exchanges)) modes.add("graceful_recovery");
|
|
441
|
+
|
|
442
|
+
const finalOutcome = exchanges[exchanges.length - 1]?.outcome;
|
|
443
|
+
if (finalOutcome === "failure" && !modes.has("graceful_recovery")) {
|
|
74
444
|
modes.add("catastrophic_failure");
|
|
75
445
|
}
|
|
76
446
|
|
|
@@ -103,47 +473,175 @@ export function checkCompleteness(
|
|
|
103
473
|
return "incomplete";
|
|
104
474
|
}
|
|
105
475
|
|
|
106
|
-
export function scoreTrace(
|
|
476
|
+
/**
 * Scores a normalized trace by aggregating weighted signal components.
 *
 * Five components are always present (completeness, fidelity, execution
 * depth, failure salience, complexity). Workflow shape, length bucket, tool
 * density, failure judge and novelty are added only when `context` supplies
 * a usable value for them. The total is the weighted average of the
 * component scores over the weight of the components present, rounded to
 * four decimals.
 *
 * @param trace Trace to score.
 * @param normalization Evaluation normalization of the trace; computed from
 *   `trace` when omitted.
 * @param context Optional external signals and a failure-mode override.
 * @returns The score record, including the component breakdown, a payout of
 *   `total * 500` cents capped at 500, and the current time as `scored_at`.
 */
export function scoreTrace(
  trace: NormalizedTrace,
  normalization: TraceNormalization = normalizeTraceForEvaluation(trace),
  context: TraceScoreContext = {},
): TraceScore {
  const completeness = checkCompleteness(trace);
  const failureModes = resolveFailureModes(trace, normalization, context.failure_modes_override);
  const normalizationStats = collectNormalizationStats(normalization);

  // Values reported as evidence alongside the component scores.
  const toolCallCount = trace.tool_call_count ?? countToolUses(trace);
  const totalTokens = (trace.total_input_tokens ?? 0) + (trace.total_output_tokens ?? 0);
  const turnCount = trace.turn_count ?? trace.turns.length;

  const components: TraceScoreComponent[] = [];

  // Components derived from the trace itself; always present.
  components.push(
    buildComponent(
      "completeness",
      "Completeness",
      scoreCompletenessValue(completeness),
      COMPONENT_WEIGHTS.completeness,
      "trace",
      completeness,
      { completeness },
    ),
  );
  components.push(
    buildComponent(
      "fidelity",
      "Content Fidelity",
      scoreFidelityValue(trace.content_fidelity),
      COMPONENT_WEIGHTS.fidelity,
      "trace",
      trace.content_fidelity === "full" ? "full transcript" : "chat-only transcript",
      { content_fidelity: trace.content_fidelity },
    ),
  );
  components.push(
    buildComponent(
      "execution_depth",
      "Execution Depth",
      scoreExecutionDepthValue(trace, normalizationStats),
      COMPONENT_WEIGHTS.executionDepth,
      "normalization",
      "tool/exchange richness",
      {
        tool_call_count: toolCallCount,
        exchange_count: normalizationStats.exchangeCount,
        action_count: normalizationStats.actionCount,
        has_shell_commands: trace.has_shell_commands ?? false,
        has_file_changes: trace.has_file_changes ?? false,
        has_thinking_blocks: trace.has_thinking_blocks ?? false,
      },
    ),
  );
  components.push(
    buildComponent(
      "failure_salience",
      "Failure Salience",
      scoreFailureSalienceValue(failureModes),
      COMPONENT_WEIGHTS.failureSalience,
      "trace",
      failureModes.join(", "),
      { failure_modes: failureModes },
    ),
  );
  components.push(
    buildComponent(
      "complexity",
      "Complexity",
      scoreComplexityValue(trace, normalizationStats),
      COMPONENT_WEIGHTS.complexity,
      "normalization",
      "token/turn/tool complexity",
      {
        total_tokens: totalTokens,
        turn_count: turnCount,
        exchange_count: normalizationStats.exchangeCount,
        unique_tool_kinds: normalizationStats.uniqueToolKinds,
      },
    ),
  );

  // Label-driven components: added only when the context label maps to a score.
  const labelSignals = [
    {
      key: "workflow_shape",
      label: "Workflow Shape",
      value: context.workflow_shape,
      scores: WORKFLOW_SHAPE_SCORES,
      weight: COMPONENT_WEIGHTS.workflowShape,
    },
    {
      key: "length_bucket",
      label: "Length Bucket",
      value: context.length_bucket,
      scores: LENGTH_BUCKET_SCORES,
      weight: COMPONENT_WEIGHTS.lengthBucket,
    },
    {
      key: "tool_density",
      label: "Tool Density",
      value: context.tool_density,
      scores: TOOL_DENSITY_SCORES,
      weight: COMPONENT_WEIGHTS.toolDensity,
    },
  ];
  for (const signal of labelSignals) {
    const labelScore = scoreMappedLabelValue(signal.value, signal.scores);
    if (labelScore == null) {
      continue;
    }
    components.push(
      buildComponent(
        signal.key,
        signal.label,
        labelScore,
        signal.weight,
        "label",
        signal.value ?? "unknown",
        { [signal.key]: signal.value ?? null },
      ),
    );
  }

  const failureJudgeScore = scoreFailureJudgeValue(context);
  if (failureJudgeScore != null) {
    components.push(
      buildComponent(
        "failure_judge",
        "Failure Judge",
        failureJudgeScore,
        COMPONENT_WEIGHTS.failureJudge,
        "judge",
        context.failure_judge_verdict ?? "unknown",
        {
          verdict: context.failure_judge_verdict ?? null,
          agreement: context.failure_judge_agreement ?? null,
          confidence: context.failure_judge_confidence ?? null,
        },
      ),
    );
  }

  const noveltyScore = scoreNoveltyValue(context.anomaly_score);
  if (noveltyScore != null) {
    components.push(
      buildComponent(
        "novelty",
        "Novelty",
        noveltyScore,
        COMPONENT_WEIGHTS.novelty,
        "corpus",
        "corpus anomaly signal",
        { anomaly_score: round4(context.anomaly_score ?? 0) },
      ),
    );
  }

  // Weighted average over the components that are actually present.
  const breakdown = buildScoreBreakdown(components);
  let weightedTotal = 0;
  for (const component of components) {
    weightedTotal += component.score * component.weight;
  }
  const total = round4(components.length > 0 ? weightedTotal / breakdown.available_weight : 0);
  const payoutCents = Math.min(500, Math.round(total * 500));

  return {
    completeness,
    failure_modes: failureModes,
    has_error_recovery: failureModes.includes("graceful_recovery"),
    has_repeated_calls: failureModes.includes("repeated_failing_root_cause"),
    content_fidelity: trace.content_fidelity,
    total,
    payout_cents: payoutCents,
    breakdown,
    failure_taxonomy_label: null,
    failure_taxonomy_explanation: null,
    rarity_score: noveltyScore == null ? null : round4(noveltyScore),
    cluster_id: null,
    is_duplicate: false,
    duplicate_of: null,
    scored_at: new Date().toISOString(),
    scorer_version: SCORE_VERSION,
  };
}
|