@tracemarketplace/shared 0.0.9 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/dist/extractor-claude-code.test.js +53 -0
  2. package/dist/extractor-claude-code.test.js.map +1 -1
  3. package/dist/extractor-codex.test.js +5 -0
  4. package/dist/extractor-codex.test.js.map +1 -1
  5. package/dist/extractors/claude-code.d.ts.map +1 -1
  6. package/dist/extractors/claude-code.js +4 -4
  7. package/dist/extractors/claude-code.js.map +1 -1
  8. package/dist/extractors/codex.d.ts.map +1 -1
  9. package/dist/extractors/codex.js +2 -0
  10. package/dist/extractors/codex.js.map +1 -1
  11. package/dist/extractors/common.d.ts +1 -2
  12. package/dist/extractors/common.d.ts.map +1 -1
  13. package/dist/extractors/common.js +2 -37
  14. package/dist/extractors/common.js.map +1 -1
  15. package/dist/extractors/common.test.d.ts +2 -0
  16. package/dist/extractors/common.test.d.ts.map +1 -0
  17. package/dist/extractors/common.test.js +17 -0
  18. package/dist/extractors/common.test.js.map +1 -0
  19. package/dist/extractors/cursor.d.ts.map +1 -1
  20. package/dist/extractors/cursor.js +8 -0
  21. package/dist/extractors/cursor.js.map +1 -1
  22. package/dist/index.d.ts +1 -0
  23. package/dist/index.d.ts.map +1 -1
  24. package/dist/index.js +1 -0
  25. package/dist/index.js.map +1 -1
  26. package/dist/redact.d.ts.map +1 -1
  27. package/dist/redact.js +3 -1
  28. package/dist/redact.js.map +1 -1
  29. package/dist/redact.test.js +9 -0
  30. package/dist/redact.test.js.map +1 -1
  31. package/dist/scoring.d.ts +5 -3
  32. package/dist/scoring.d.ts.map +1 -1
  33. package/dist/scoring.fixtures.test.d.ts +2 -0
  34. package/dist/scoring.fixtures.test.d.ts.map +1 -0
  35. package/dist/scoring.fixtures.test.js +47 -0
  36. package/dist/scoring.fixtures.test.js.map +1 -0
  37. package/dist/scoring.js +381 -62
  38. package/dist/scoring.js.map +1 -1
  39. package/dist/scoring.test.js +125 -26
  40. package/dist/scoring.test.js.map +1 -1
  41. package/dist/tool-normalization.d.ts +66 -0
  42. package/dist/tool-normalization.d.ts.map +1 -0
  43. package/dist/tool-normalization.generated.d.ts +181 -0
  44. package/dist/tool-normalization.generated.d.ts.map +1 -0
  45. package/dist/tool-normalization.generated.js +261 -0
  46. package/dist/tool-normalization.generated.js.map +1 -0
  47. package/dist/tool-normalization.js +463 -0
  48. package/dist/tool-normalization.js.map +1 -0
  49. package/dist/tool-normalization.test.d.ts +2 -0
  50. package/dist/tool-normalization.test.d.ts.map +1 -0
  51. package/dist/tool-normalization.test.js +188 -0
  52. package/dist/tool-normalization.test.js.map +1 -0
  53. package/dist/turn-actors.d.ts +1 -0
  54. package/dist/turn-actors.d.ts.map +1 -1
  55. package/dist/turn-actors.js.map +1 -1
  56. package/dist/types.d.ts +38 -1
  57. package/dist/types.d.ts.map +1 -1
  58. package/dist/validators.d.ts +23 -6
  59. package/dist/validators.d.ts.map +1 -1
  60. package/dist/validators.js +4 -0
  61. package/dist/validators.js.map +1 -1
  62. package/dist/validators.test.js +7 -0
  63. package/dist/validators.test.js.map +1 -1
  64. package/package.json +5 -6
  65. package/scripts/generate-tool-normalization.mjs +16 -0
  66. package/src/extractor-claude-code.test.ts +59 -0
  67. package/src/extractor-codex.test.ts +5 -0
  68. package/src/extractors/claude-code.ts +8 -4
  69. package/src/extractors/codex.ts +2 -0
  70. package/src/extractors/common.test.ts +21 -0
  71. package/src/extractors/common.ts +15 -49
  72. package/src/extractors/cursor.ts +9 -0
  73. package/src/index.ts +1 -0
  74. package/src/redact.test.ts +9 -0
  75. package/src/redact.ts +3 -1
  76. package/src/scoring.fixtures.test.ts +71 -0
  77. package/src/scoring.test.ts +151 -26
  78. package/src/scoring.ts +582 -84
  79. package/src/tool-normalization.generated.ts +262 -0
  80. package/src/tool-normalization.spec.json +205 -0
  81. package/src/tool-normalization.test.ts +221 -0
  82. package/src/tool-normalization.ts +670 -0
  83. package/src/turn-actors.ts +2 -0
  84. package/src/types.ts +50 -0
  85. package/src/validators.test.ts +8 -0
  86. package/src/validators.ts +8 -0
package/src/scoring.ts CHANGED
@@ -1,76 +1,446 @@
1
- import type { NormalizedTrace, TraceScore, FailureMode } from "./types.js";
1
+ import type {
2
+ FailureMode,
3
+ JsonValue,
4
+ NormalizedTrace,
5
+ QualityTier,
6
+ TraceScore,
7
+ TraceScoreBreakdown,
8
+ TraceScoreComponent,
9
+ TraceScoreContext,
10
+ } from "./types.js";
11
+ import {
12
+ extractFailureExchanges,
13
+ type FailureExchange,
14
+ normalizeTraceForEvaluation,
15
+ type TraceNormalization,
16
+ } from "./tool-normalization.js";
2
17
 
3
- export function detectFailureModes(trace: NormalizedTrace): FailureMode[] {
4
- const modes = new Set<FailureMode>();
5
- const allBlocks = trace.turns.flatMap((t) => t.content);
18
+ const CONTEXT_LIMIT_REGEX = /context.*(limit|window|maximum)|context limit/i;
19
+ const CONTEXT_LIMIT_TOKEN_THRESHOLD = 150_000;
20
+ const GENERIC_ERROR_LINE_REGEX =
21
+ /^(process exited with code|exit status \d+|\^c|error: process exited|process running with session id)/i;
22
+ const LOW_SIGNAL_ERROR_LINE_REGEX = /^(fail|error)(\s+(<file>|<path>))?$/i;
23
+ const REPEATED_ROOT_CAUSE_EXCHANGE_THRESHOLD = 3;
24
+ const SILVER_SCORE_THRESHOLD = 0.64;
25
+ const GOLD_SCORE_THRESHOLD = 0.82;
26
+ const SCORE_VERSION = "v1-signal-aggregation";
27
+ const COMPONENT_WEIGHTS = {
28
+ completeness: 1.25,
29
+ fidelity: 1.2,
30
+ executionDepth: 1.0,
31
+ failureSalience: 1.2,
32
+ complexity: 0.9,
33
+ workflowShape: 0.25,
34
+ lengthBucket: 0.2,
35
+ toolDensity: 0.2,
36
+ failureJudge: 0.45,
37
+ novelty: 0.55,
38
+ } as const;
39
+ const TOTAL_COMPONENT_WEIGHT = Object.values(COMPONENT_WEIGHTS).reduce((sum, weight) => sum + weight, 0);
40
+ const FAILURE_MODE_ALIASES: Partial<Record<FailureMode, FailureMode>> = {
41
+ repeated_tool_calls: "repeated_failing_root_cause",
42
+ };
43
+ const WORKFLOW_SHAPE_SCORES = {
44
+ chat_only: 0.2,
45
+ tool_other: 0.5,
46
+ shell_only: 0.62,
47
+ editor_only: 0.68,
48
+ shell_and_editor: 0.9,
49
+ } as const;
50
+ const LENGTH_BUCKET_SCORES = {
51
+ short: 0.35,
52
+ medium: 0.68,
53
+ long: 0.92,
54
+ } as const;
55
+ const TOOL_DENSITY_SCORES = {
56
+ none: 0.2,
57
+ light: 0.45,
58
+ medium: 0.72,
59
+ heavy: 0.9,
60
+ } as const;
61
+ const FAILURE_JUDGE_VERDICT_SCORES: Record<string, number> = {
62
+ confirmed_failure: 0.92,
63
+ unclear: 0.55,
64
+ false_positive: 0.18,
65
+ };
66
+ const FAILURE_JUDGE_AGREEMENT_ADJUSTMENT: Record<string, number> = {
67
+ agree: 0.08,
68
+ partial: 0.03,
69
+ disagree: -0.08,
70
+ };
71
+
72
+ interface NormalizationStats {
73
+ exchangeCount: number;
74
+ actionCount: number;
75
+ uniqueToolKinds: number;
76
+ }
77
+
78
+ function hasContextLimitEvidence(trace: NormalizedTrace): boolean {
79
+ const totalTokens = (trace.total_input_tokens ?? 0) + (trace.total_output_tokens ?? 0);
80
+ if (totalTokens > CONTEXT_LIMIT_TOKEN_THRESHOLD) {
81
+ return true;
82
+ }
6
83
 
7
- // tool_call_failure: any tool_result with is_error=true
8
- const hasToolError = allBlocks.some(
9
- (b) => b.type === "tool_result" && b.is_error
84
+ return trace.turns.some((turn) =>
85
+ turn.content.some(
86
+ (block) => block.type === "text" && CONTEXT_LIMIT_REGEX.test(block.text),
87
+ ),
10
88
  );
11
- if (hasToolError) modes.add("tool_call_failure");
89
+ }
90
+
91
+ function normalizeErrorText(text: string | null | undefined, limit = 140): string | null {
92
+ if (!text) return null;
93
+
94
+ const candidateLines: string[] = [];
95
+ for (const rawLine of text.split(/\r?\n/)) {
96
+ let cleaned = rawLine.replace(/\s+/g, " ").trim().toLowerCase();
97
+ if (!cleaned || GENERIC_ERROR_LINE_REGEX.test(cleaned)) {
98
+ continue;
99
+ }
100
+
101
+ cleaned = cleaned
102
+ .replace(/\/[^\s]+/g, "<path>")
103
+ .replace(/\b[\w./-]+\.[a-z]{1,6}\b/g, "<file>")
104
+ .replace(/\b0x[a-f0-9]+\b/g, "<hex>")
105
+ .replace(/\b\d+\b/g, "<num>");
106
+
107
+ if (cleaned === "fail" || cleaned === "error" || LOW_SIGNAL_ERROR_LINE_REGEX.test(cleaned)) {
108
+ continue;
109
+ }
110
+
111
+ candidateLines.push(cleaned);
112
+ }
113
+
114
+ const normalized = candidateLines[0] ?? null;
115
+ if (!normalized || normalized.length < 12) {
116
+ return null;
117
+ }
12
118
 
13
- // repeated_tool_calls: same tool_name 3+ times in a row
14
- const toolUses = allBlocks.filter((b) => b.type === "tool_use") as Array<{
15
- type: "tool_use";
16
- tool_call_id: string;
17
- tool_name: string;
18
- tool_input: Record<string, unknown>;
19
- }>;
20
- let streak = 1;
21
- for (let i = 1; i < toolUses.length; i++) {
22
- if (toolUses[i].tool_name === toolUses[i - 1].tool_name) {
23
- streak++;
24
- if (streak >= 3) {
25
- modes.add("repeated_tool_calls");
26
- break;
119
+ return normalized.slice(0, limit);
120
+ }
121
+
122
+ function buildErrorSignature(
123
+ toolName: string | null | undefined,
124
+ resultContent: string | null | undefined,
125
+ exitCode: number | null | undefined,
126
+ ): string | null {
127
+ const normalizedText = normalizeErrorText(resultContent);
128
+ if (!normalizedText) return null;
129
+
130
+ const toolSegment = toolName?.trim().toLowerCase() || "unknown";
131
+ const exitSegment = exitCode == null ? "" : `:${exitCode}`;
132
+ return `${toolSegment}${exitSegment}|${normalizedText}`;
133
+ }
134
+
135
+ function hasRepeatedFailingRootCause(trace: NormalizedTrace): boolean {
136
+ let exchangeIndex = -1;
137
+ const toolNames = new Map<string, string>();
138
+ const signatureToExchanges = new Map<string, Set<number>>();
139
+
140
+ for (const turn of trace.turns) {
141
+ if (turn.role === "user") {
142
+ exchangeIndex += 1;
143
+ continue;
144
+ }
145
+
146
+ if (exchangeIndex < 0) {
147
+ exchangeIndex = 0;
148
+ }
149
+
150
+ for (const block of turn.content) {
151
+ if (block.type === "tool_use") {
152
+ toolNames.set(block.tool_call_id, block.tool_name);
153
+ continue;
154
+ }
155
+
156
+ if (block.type !== "tool_result") {
157
+ continue;
158
+ }
159
+
160
+ const isError = block.is_error || (block.exit_code != null && block.exit_code !== 0);
161
+ if (!isError) {
162
+ continue;
163
+ }
164
+
165
+ const signature = buildErrorSignature(
166
+ toolNames.get(block.tool_call_id),
167
+ block.result_content,
168
+ block.exit_code,
169
+ );
170
+ if (!signature) {
171
+ continue;
27
172
  }
28
- } else {
29
- streak = 1;
173
+
174
+ const exchanges = signatureToExchanges.get(signature) ?? new Set<number>();
175
+ exchanges.add(exchangeIndex);
176
+ signatureToExchanges.set(signature, exchanges);
30
177
  }
31
178
  }
32
179
 
33
- // context_limit_approached: text mentioning context/limit
34
- const contextLimitRegex = /context.*(limit|window|maximum)|context limit/i;
35
- const hasContextLimit = trace.turns.some((t) =>
36
- t.content.some(
37
- (b) =>
38
- b.type === "text" && contextLimitRegex.test(b.text)
39
- )
180
+ return Array.from(signatureToExchanges.values()).some(
181
+ (exchangeIndexes) => exchangeIndexes.size >= REPEATED_ROOT_CAUSE_EXCHANGE_THRESHOLD,
40
182
  );
41
- if (hasContextLimit) modes.add("context_limit_approached");
42
-
43
- // graceful_recovery: tool errors followed by recovery text
44
- if (hasToolError) {
45
- const recoveryRegex = /let me try|instead|alternative|another approach|different way/i;
46
- const laterTurns = trace.turns.slice(Math.floor(trace.turns.length / 2));
47
- const hasRecovery = laterTurns.some((t) =>
48
- t.content.some(
49
- (b) => b.type === "text" && recoveryRegex.test(b.text)
50
- )
51
- );
52
- if (hasRecovery) modes.add("graceful_recovery");
183
+ }
184
+
185
+ function hasGracefulRecovery(exchanges: FailureExchange[]): boolean {
186
+ for (let index = 0; index < exchanges.length - 1; index += 1) {
187
+ if (
188
+ exchanges[index]?.outcome === "failure" &&
189
+ ["success", "success_after_retry"].includes(exchanges[index + 1]?.outcome ?? "")
190
+ ) {
191
+ return true;
192
+ }
53
193
  }
54
194
 
55
- // repeated_tool_calls → graceful_recovery if later success
56
- if (modes.has("repeated_tool_calls")) {
57
- const lastTurn = trace.turns[trace.turns.length - 1];
58
- if (lastTurn?.role === "assistant") {
59
- const hasSuccessText = lastTurn.content.some(
60
- (b) => b.type === "text" && b.text.length > 50
61
- );
62
- if (hasSuccessText) modes.add("graceful_recovery");
195
+ return false;
196
+ }
197
+
198
+ function clamp01(value: number): number {
199
+ return Math.max(0, Math.min(1, value));
200
+ }
201
+
202
+ function round4(value: number): number {
203
+ return Number(value.toFixed(4));
204
+ }
205
+
206
+ function normalizeLinear(value: number, minValue: number, maxValue: number): number {
207
+ if (maxValue <= minValue) {
208
+ return value > minValue ? 1 : 0;
209
+ }
210
+
211
+ return clamp01((value - minValue) / (maxValue - minValue));
212
+ }
213
+
214
+ function normalizeLogScale(value: number, minValue: number, maxValue: number): number {
215
+ if (value <= minValue) return 0;
216
+ if (value >= maxValue) return 1;
217
+
218
+ const numerator = Math.log1p(value) - Math.log1p(minValue);
219
+ const denominator = Math.log1p(maxValue) - Math.log1p(minValue);
220
+ if (denominator <= 0) return 0;
221
+ return clamp01(numerator / denominator);
222
+ }
223
+
224
+ function countToolUses(trace: NormalizedTrace): number {
225
+ let count = 0;
226
+ for (const turn of trace.turns) {
227
+ for (const block of turn.content) {
228
+ if (block.type === "tool_use") {
229
+ count += 1;
230
+ }
63
231
  }
64
232
  }
233
+ return count;
234
+ }
65
235
 
66
- // catastrophic_failure: last 3+ turns are all errors with no recovery
67
- const lastTurns = trace.turns.slice(-3);
68
- const allLastAreErrors =
69
- lastTurns.length >= 2 &&
70
- lastTurns.every((t) =>
71
- t.content.some((b) => b.type === "tool_result" && b.is_error)
72
- );
73
- if (allLastAreErrors && !modes.has("graceful_recovery")) {
236
+ function collectNormalizationStats(normalization: TraceNormalization): NormalizationStats {
237
+ const uniqueToolKinds = new Set<string>();
238
+ let actionCount = 0;
239
+
240
+ for (const exchange of normalization.exchanges) {
241
+ actionCount += exchange.actions.length;
242
+ for (const action of exchange.actions) {
243
+ uniqueToolKinds.add(action.normalizedToolId);
244
+ }
245
+ }
246
+
247
+ return {
248
+ exchangeCount: normalization.exchanges.length,
249
+ actionCount,
250
+ uniqueToolKinds: uniqueToolKinds.size,
251
+ };
252
+ }
253
+
254
+ function normalizeFailureModes(
255
+ failureModes: FailureMode[] | null | undefined,
256
+ ): FailureMode[] {
257
+ const normalized: FailureMode[] = [];
258
+ for (const mode of failureModes ?? []) {
259
+ const canonical = FAILURE_MODE_ALIASES[mode] ?? mode;
260
+ if (!normalized.includes(canonical)) {
261
+ normalized.push(canonical);
262
+ }
263
+ }
264
+ return normalized.length > 0 ? normalized : ["no_failure"];
265
+ }
266
+
267
+ function resolveFailureModes(
268
+ trace: NormalizedTrace,
269
+ normalization: TraceNormalization,
270
+ overrideModes?: FailureMode[],
271
+ ): FailureMode[] {
272
+ if (overrideModes && overrideModes.length > 0) {
273
+ return normalizeFailureModes(overrideModes);
274
+ }
275
+ return normalizeFailureModes(detectFailureModes(trace, normalization));
276
+ }
277
+
278
+ function scoreCompletenessValue(
279
+ completeness: TraceScore["completeness"],
280
+ ): number {
281
+ if (completeness === "complete") return 1;
282
+ if (completeness === "incomplete") return 0.45;
283
+ return 0.02;
284
+ }
285
+
286
+ function scoreFidelityValue(contentFidelity: NormalizedTrace["content_fidelity"]): number {
287
+ return contentFidelity === "full" ? 1 : 0.4;
288
+ }
289
+
290
+ function scoreExecutionDepthValue(
291
+ trace: NormalizedTrace,
292
+ normalizationStats: NormalizationStats,
293
+ ): number {
294
+ const toolCallCount = trace.tool_call_count ?? countToolUses(trace);
295
+ const toolCallScore = normalizeLogScale(toolCallCount, 0, 12);
296
+ const exchangeScore = normalizeLinear(normalizationStats.exchangeCount, 0, 6);
297
+ const actionScore = normalizeLogScale(normalizationStats.actionCount, 0, 18);
298
+
299
+ let modalityBonus = 0;
300
+ if (trace.has_shell_commands) modalityBonus += 0.18;
301
+ if (trace.has_file_changes) modalityBonus += 0.18;
302
+ if (trace.has_thinking_blocks) modalityBonus += 0.08;
303
+
304
+ const base = toolCallCount > 0 ? 0.12 : 0.04;
305
+ return clamp01(
306
+ base
307
+ + toolCallScore * 0.34
308
+ + exchangeScore * 0.22
309
+ + actionScore * 0.18
310
+ + modalityBonus,
311
+ );
312
+ }
313
+
314
+ function scoreFailureSalienceValue(failureModes: FailureMode[]): number {
315
+ let score = failureModes.includes("no_failure") ? 0.12 : 0.18;
316
+ if (failureModes.includes("tool_call_failure")) score += 0.18;
317
+ if (failureModes.includes("repeated_failing_root_cause")) score += 0.24;
318
+ if (failureModes.includes("context_limit_approached")) score += 0.12;
319
+ if (failureModes.includes("graceful_recovery")) score += 0.22;
320
+ if (failureModes.includes("catastrophic_failure")) score += 0.18;
321
+ return clamp01(score);
322
+ }
323
+
324
+ function scoreComplexityValue(
325
+ trace: NormalizedTrace,
326
+ normalizationStats: NormalizationStats,
327
+ ): number {
328
+ const totalTokens = (trace.total_input_tokens ?? 0) + (trace.total_output_tokens ?? 0);
329
+ const turnCount = trace.turn_count ?? trace.turns.length;
330
+
331
+ const tokenScore = normalizeLogScale(totalTokens, 800, 80_000);
332
+ const turnScore = normalizeLinear(turnCount, 2, 24);
333
+ const exchangeScore = normalizeLinear(normalizationStats.exchangeCount, 1, 8);
334
+ const toolVarietyScore = normalizeLinear(normalizationStats.uniqueToolKinds, 1, 6);
335
+
336
+ return clamp01(
337
+ tokenScore * 0.38
338
+ + turnScore * 0.23
339
+ + exchangeScore * 0.17
340
+ + toolVarietyScore * 0.22,
341
+ );
342
+ }
343
+
344
+ function scoreMappedLabelValue(
345
+ value: string | null | undefined,
346
+ scoreMap: Record<string, number>,
347
+ ): number | null {
348
+ if (!value) return null;
349
+ return value in scoreMap ? scoreMap[value] : null;
350
+ }
351
+
352
+ function scoreFailureJudgeValue(context: TraceScoreContext): number | null {
353
+ if (!context.failure_judge_verdict) {
354
+ return null;
355
+ }
356
+
357
+ const verdictScore = FAILURE_JUDGE_VERDICT_SCORES[context.failure_judge_verdict];
358
+ if (verdictScore == null) {
359
+ return null;
360
+ }
361
+
362
+ const agreementAdjustment = context.failure_judge_agreement
363
+ ? (FAILURE_JUDGE_AGREEMENT_ADJUSTMENT[context.failure_judge_agreement] ?? 0)
364
+ : 0;
365
+ const confidence = context.failure_judge_confidence == null
366
+ ? null
367
+ : clamp01(context.failure_judge_confidence);
368
+ const confidenceMultiplier = confidence == null ? 1 : 0.7 + (confidence * 0.3);
369
+
370
+ return clamp01((verdictScore + agreementAdjustment) * confidenceMultiplier);
371
+ }
372
+
373
+ function scoreNoveltyValue(anomalyScore: number | null | undefined): number | null {
374
+ if (anomalyScore == null || !Number.isFinite(anomalyScore) || anomalyScore <= 0) {
375
+ return null;
376
+ }
377
+
378
+ return clamp01(Math.log1p(anomalyScore) / Math.log1p(4));
379
+ }
380
+
381
+ function buildComponent(
382
+ key: string,
383
+ label: string,
384
+ score: number,
385
+ weight: number,
386
+ source: TraceScoreComponent["source"],
387
+ reason: string,
388
+ evidence: JsonValue | null,
389
+ ): TraceScoreComponent {
390
+ return {
391
+ key,
392
+ label,
393
+ score: round4(clamp01(score)),
394
+ weight: round4(weight),
395
+ source,
396
+ available: true,
397
+ reason,
398
+ evidence,
399
+ };
400
+ }
401
+
402
+ function buildScoreBreakdown(components: TraceScoreComponent[]): TraceScoreBreakdown {
403
+ const availableWeight = components.reduce((sum, component) => sum + component.weight, 0);
404
+
405
+ return {
406
+ aggregation: "weighted_average",
407
+ component_count: components.length,
408
+ available_weight: round4(availableWeight),
409
+ total_weight: round4(TOTAL_COMPONENT_WEIGHT),
410
+ components,
411
+ };
412
+ }
413
+
414
+ export function deriveQualityTier(total: number): QualityTier {
415
+ if (total >= GOLD_SCORE_THRESHOLD) return "gold";
416
+ if (total >= SILVER_SCORE_THRESHOLD) return "silver";
417
+ return "bronze";
418
+ }
419
+
420
+ export function detectFailureModes(
421
+ trace: NormalizedTrace,
422
+ normalization?: TraceNormalization,
423
+ ): FailureMode[] {
424
+ const modes = new Set<FailureMode>();
425
+ const exchanges = normalization
426
+ ? normalization.exchanges.map((exchange) => ({
427
+ toolTokens: exchange.toolTokens,
428
+ hasError: exchange.hasError,
429
+ outcome: exchange.outcome,
430
+ }))
431
+ : extractFailureExchanges(trace);
432
+
433
+ const hasToolError = exchanges.some((exchange) => exchange.hasError);
434
+ if (hasToolError) modes.add("tool_call_failure");
435
+
436
+ if (hasRepeatedFailingRootCause(trace)) modes.add("repeated_failing_root_cause");
437
+
438
+ if (hasContextLimitEvidence(trace)) modes.add("context_limit_approached");
439
+
440
+ if (hasGracefulRecovery(exchanges)) modes.add("graceful_recovery");
441
+
442
+ const finalOutcome = exchanges[exchanges.length - 1]?.outcome;
443
+ if (finalOutcome === "failure" && !modes.has("graceful_recovery")) {
74
444
  modes.add("catastrophic_failure");
75
445
  }
76
446
 
@@ -103,47 +473,175 @@ export function checkCompleteness(
103
473
  return "incomplete";
104
474
  }
105
475
 
106
- export function scoreTrace(trace: NormalizedTrace): TraceScore {
476
+ export function scoreTrace(
477
+ trace: NormalizedTrace,
478
+ normalization: TraceNormalization = normalizeTraceForEvaluation(trace),
479
+ context: TraceScoreContext = {},
480
+ ): TraceScore {
107
481
  const completeness = checkCompleteness(trace);
108
- const failureModes = detectFailureModes(trace);
109
-
110
- const fidelityBase = trace.content_fidelity === "full" ? 0.4 : 0.15;
111
-
112
- let interestBonus = 0;
113
- if (failureModes.includes("graceful_recovery")) interestBonus += 0.3;
114
- if (failureModes.includes("repeated_tool_calls")) interestBonus += 0.2;
115
- if (failureModes.includes("catastrophic_failure")) interestBonus += 0.15;
116
- if (
117
- failureModes.includes("tool_call_failure") &&
118
- !failureModes.includes("graceful_recovery")
119
- )
120
- interestBonus += 0.1;
121
-
122
- const totalTokens =
123
- (trace.total_input_tokens ?? 0) + (trace.total_output_tokens ?? 0);
124
- const lengthBonus = Math.min(
125
- 0.15,
126
- Math.log10(Math.max(1, totalTokens / 1000)) * 0.05
127
- );
482
+ const failureModes = resolveFailureModes(trace, normalization, context.failure_modes_override);
483
+ const normalizationStats = collectNormalizationStats(normalization);
484
+
485
+ const components: TraceScoreComponent[] = [
486
+ buildComponent(
487
+ "completeness",
488
+ "Completeness",
489
+ scoreCompletenessValue(completeness),
490
+ COMPONENT_WEIGHTS.completeness,
491
+ "trace",
492
+ completeness,
493
+ { completeness },
494
+ ),
495
+ buildComponent(
496
+ "fidelity",
497
+ "Content Fidelity",
498
+ scoreFidelityValue(trace.content_fidelity),
499
+ COMPONENT_WEIGHTS.fidelity,
500
+ "trace",
501
+ trace.content_fidelity === "full" ? "full transcript" : "chat-only transcript",
502
+ { content_fidelity: trace.content_fidelity },
503
+ ),
504
+ buildComponent(
505
+ "execution_depth",
506
+ "Execution Depth",
507
+ scoreExecutionDepthValue(trace, normalizationStats),
508
+ COMPONENT_WEIGHTS.executionDepth,
509
+ "normalization",
510
+ "tool/exchange richness",
511
+ {
512
+ tool_call_count: trace.tool_call_count ?? countToolUses(trace),
513
+ exchange_count: normalizationStats.exchangeCount,
514
+ action_count: normalizationStats.actionCount,
515
+ has_shell_commands: trace.has_shell_commands ?? false,
516
+ has_file_changes: trace.has_file_changes ?? false,
517
+ has_thinking_blocks: trace.has_thinking_blocks ?? false,
518
+ },
519
+ ),
520
+ buildComponent(
521
+ "failure_salience",
522
+ "Failure Salience",
523
+ scoreFailureSalienceValue(failureModes),
524
+ COMPONENT_WEIGHTS.failureSalience,
525
+ "trace",
526
+ failureModes.join(", "),
527
+ { failure_modes: failureModes },
528
+ ),
529
+ buildComponent(
530
+ "complexity",
531
+ "Complexity",
532
+ scoreComplexityValue(trace, normalizationStats),
533
+ COMPONENT_WEIGHTS.complexity,
534
+ "normalization",
535
+ "token/turn/tool complexity",
536
+ {
537
+ total_tokens: (trace.total_input_tokens ?? 0) + (trace.total_output_tokens ?? 0),
538
+ turn_count: trace.turn_count ?? trace.turns.length,
539
+ exchange_count: normalizationStats.exchangeCount,
540
+ unique_tool_kinds: normalizationStats.uniqueToolKinds,
541
+ },
542
+ ),
543
+ ];
544
+
545
+ const workflowShapeScore = scoreMappedLabelValue(context.workflow_shape, WORKFLOW_SHAPE_SCORES);
546
+ if (workflowShapeScore != null) {
547
+ components.push(
548
+ buildComponent(
549
+ "workflow_shape",
550
+ "Workflow Shape",
551
+ workflowShapeScore,
552
+ COMPONENT_WEIGHTS.workflowShape,
553
+ "label",
554
+ context.workflow_shape ?? "unknown",
555
+ { workflow_shape: context.workflow_shape ?? null },
556
+ ),
557
+ );
558
+ }
559
+
560
+ const lengthBucketScore = scoreMappedLabelValue(context.length_bucket, LENGTH_BUCKET_SCORES);
561
+ if (lengthBucketScore != null) {
562
+ components.push(
563
+ buildComponent(
564
+ "length_bucket",
565
+ "Length Bucket",
566
+ lengthBucketScore,
567
+ COMPONENT_WEIGHTS.lengthBucket,
568
+ "label",
569
+ context.length_bucket ?? "unknown",
570
+ { length_bucket: context.length_bucket ?? null },
571
+ ),
572
+ );
573
+ }
574
+
575
+ const toolDensityScore = scoreMappedLabelValue(context.tool_density, TOOL_DENSITY_SCORES);
576
+ if (toolDensityScore != null) {
577
+ components.push(
578
+ buildComponent(
579
+ "tool_density",
580
+ "Tool Density",
581
+ toolDensityScore,
582
+ COMPONENT_WEIGHTS.toolDensity,
583
+ "label",
584
+ context.tool_density ?? "unknown",
585
+ { tool_density: context.tool_density ?? null },
586
+ ),
587
+ );
588
+ }
589
+
590
+ const failureJudgeScore = scoreFailureJudgeValue(context);
591
+ if (failureJudgeScore != null) {
592
+ components.push(
593
+ buildComponent(
594
+ "failure_judge",
595
+ "Failure Judge",
596
+ failureJudgeScore,
597
+ COMPONENT_WEIGHTS.failureJudge,
598
+ "judge",
599
+ context.failure_judge_verdict ?? "unknown",
600
+ {
601
+ verdict: context.failure_judge_verdict ?? null,
602
+ agreement: context.failure_judge_agreement ?? null,
603
+ confidence: context.failure_judge_confidence ?? null,
604
+ },
605
+ ),
606
+ );
607
+ }
608
+
609
+ const noveltyScore = scoreNoveltyValue(context.anomaly_score);
610
+ if (noveltyScore != null) {
611
+ components.push(
612
+ buildComponent(
613
+ "novelty",
614
+ "Novelty",
615
+ noveltyScore,
616
+ COMPONENT_WEIGHTS.novelty,
617
+ "corpus",
618
+ "corpus anomaly signal",
619
+ { anomaly_score: round4(context.anomaly_score ?? 0) },
620
+ ),
621
+ );
622
+ }
128
623
 
129
- const total = Math.min(1.0, fidelityBase + interestBonus + lengthBonus);
624
+ const breakdown = buildScoreBreakdown(components);
625
+ const weightedTotal = components.reduce((sum, component) => sum + (component.score * component.weight), 0);
626
+ const total = round4(components.length > 0 ? weightedTotal / breakdown.available_weight : 0);
130
627
  const payoutCents = Math.min(500, Math.round(total * 500));
131
628
 
132
629
  return {
133
630
  completeness,
134
631
  failure_modes: failureModes,
135
632
  has_error_recovery: failureModes.includes("graceful_recovery"),
136
- has_repeated_calls: failureModes.includes("repeated_tool_calls"),
633
+ has_repeated_calls: failureModes.includes("repeated_failing_root_cause"),
137
634
  content_fidelity: trace.content_fidelity,
138
635
  total,
139
636
  payout_cents: payoutCents,
637
+ breakdown,
140
638
  failure_taxonomy_label: null,
141
639
  failure_taxonomy_explanation: null,
142
- rarity_score: null,
640
+ rarity_score: noveltyScore == null ? null : round4(noveltyScore),
143
641
  cluster_id: null,
144
642
  is_duplicate: false,
145
643
  duplicate_of: null,
146
644
  scored_at: new Date().toISOString(),
147
- scorer_version: "v0-heuristic",
645
+ scorer_version: SCORE_VERSION,
148
646
  };
149
647
  }