@tracemarketplace/shared 0.0.10 → 0.0.11

This diff shows the changes between two published versions of this package as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (82)
  1. package/dist/extractor-claude-code.test.js +53 -0
  2. package/dist/extractor-claude-code.test.js.map +1 -1
  3. package/dist/extractor-codex.test.js +5 -0
  4. package/dist/extractor-codex.test.js.map +1 -1
  5. package/dist/extractors/claude-code.d.ts.map +1 -1
  6. package/dist/extractors/claude-code.js +4 -4
  7. package/dist/extractors/claude-code.js.map +1 -1
  8. package/dist/extractors/codex.d.ts.map +1 -1
  9. package/dist/extractors/codex.js +2 -0
  10. package/dist/extractors/codex.js.map +1 -1
  11. package/dist/extractors/common.d.ts +1 -2
  12. package/dist/extractors/common.d.ts.map +1 -1
  13. package/dist/extractors/common.js +2 -37
  14. package/dist/extractors/common.js.map +1 -1
  15. package/dist/extractors/common.test.d.ts +2 -0
  16. package/dist/extractors/common.test.d.ts.map +1 -0
  17. package/dist/extractors/common.test.js +17 -0
  18. package/dist/extractors/common.test.js.map +1 -0
  19. package/dist/extractors/cursor.d.ts.map +1 -1
  20. package/dist/extractors/cursor.js +8 -0
  21. package/dist/extractors/cursor.js.map +1 -1
  22. package/dist/index.d.ts +1 -0
  23. package/dist/index.d.ts.map +1 -1
  24. package/dist/index.js +1 -0
  25. package/dist/index.js.map +1 -1
  26. package/dist/redact.d.ts.map +1 -1
  27. package/dist/redact.js +3 -1
  28. package/dist/redact.js.map +1 -1
  29. package/dist/redact.test.js +9 -0
  30. package/dist/redact.test.js.map +1 -1
  31. package/dist/scoring.d.ts +5 -3
  32. package/dist/scoring.d.ts.map +1 -1
  33. package/dist/scoring.fixtures.test.d.ts +2 -0
  34. package/dist/scoring.fixtures.test.d.ts.map +1 -0
  35. package/dist/scoring.fixtures.test.js +47 -0
  36. package/dist/scoring.fixtures.test.js.map +1 -0
  37. package/dist/scoring.js +381 -62
  38. package/dist/scoring.js.map +1 -1
  39. package/dist/scoring.test.js +125 -26
  40. package/dist/scoring.test.js.map +1 -1
  41. package/dist/tool-normalization.d.ts +66 -0
  42. package/dist/tool-normalization.d.ts.map +1 -0
  43. package/dist/tool-normalization.generated.d.ts +181 -0
  44. package/dist/tool-normalization.generated.d.ts.map +1 -0
  45. package/dist/tool-normalization.generated.js +261 -0
  46. package/dist/tool-normalization.generated.js.map +1 -0
  47. package/dist/tool-normalization.js +463 -0
  48. package/dist/tool-normalization.js.map +1 -0
  49. package/dist/tool-normalization.test.d.ts +2 -0
  50. package/dist/tool-normalization.test.d.ts.map +1 -0
  51. package/dist/tool-normalization.test.js +188 -0
  52. package/dist/tool-normalization.test.js.map +1 -0
  53. package/dist/types.d.ts +38 -1
  54. package/dist/types.d.ts.map +1 -1
  55. package/dist/validators.d.ts +23 -6
  56. package/dist/validators.d.ts.map +1 -1
  57. package/dist/validators.js +4 -0
  58. package/dist/validators.js.map +1 -1
  59. package/dist/validators.test.js +7 -0
  60. package/dist/validators.test.js.map +1 -1
  61. package/package.json +5 -5
  62. package/scripts/generate-tool-normalization.mjs +16 -0
  63. package/src/extractor-claude-code.test.ts +59 -0
  64. package/src/extractor-codex.test.ts +5 -0
  65. package/src/extractors/claude-code.ts +8 -4
  66. package/src/extractors/codex.ts +2 -0
  67. package/src/extractors/common.test.ts +21 -0
  68. package/src/extractors/common.ts +15 -49
  69. package/src/extractors/cursor.ts +9 -0
  70. package/src/index.ts +1 -0
  71. package/src/redact.test.ts +9 -0
  72. package/src/redact.ts +3 -1
  73. package/src/scoring.fixtures.test.ts +71 -0
  74. package/src/scoring.test.ts +151 -26
  75. package/src/scoring.ts +582 -84
  76. package/src/tool-normalization.generated.ts +262 -0
  77. package/src/tool-normalization.spec.json +205 -0
  78. package/src/tool-normalization.test.ts +221 -0
  79. package/src/tool-normalization.ts +670 -0
  80. package/src/types.ts +50 -0
  81. package/src/validators.test.ts +8 -0
  82. package/src/validators.ts +8 -0
package/src/extractors/claude-code.ts CHANGED
@@ -9,6 +9,7 @@ import type {
9
9
  TokenUsage,
10
10
  } from "../types.js";
11
11
  import { deriveTurnActors } from "../turn-actors.js";
12
+ import { pushUniqueTextBlock } from "./common.js";
12
13
 
13
14
  export async function extractClaudeCode(
14
15
  sessionFilePath: string,
@@ -146,10 +147,11 @@ export async function extractClaudeCode(
146
147
  if (block.type === "text") {
147
148
  contentBlocks.push({ type: "text", text: block.text ?? "" });
148
149
  } else if (block.type === "thinking") {
149
- contentBlocks.push({
150
- type: "thinking",
151
- text: block.thinking ?? block.text ?? "",
152
- });
150
+ pushUniqueTextBlock(
151
+ contentBlocks,
152
+ "thinking",
153
+ block.thinking ?? block.text ?? "",
154
+ );
153
155
  } else if (block.type === "tool_use") {
154
156
  contentBlocks.push({
155
157
  type: "tool_use",
@@ -278,6 +280,8 @@ export async function extractClaudeCode(
278
280
  extraction_method: "passive",
279
281
  },
280
282
  score: null,
283
+ raw_json: { lines } as NonNullable<NormalizedTrace["raw_json"]>,
284
+ raw_json_format: "claude_code.jsonl",
281
285
  raw_r2_key: "",
282
286
  normalized_r2_key: "",
283
287
  };
package/src/extractors/codex.ts CHANGED
@@ -229,6 +229,8 @@ export async function extractCodex(
229
229
  content_fidelity: "full",
230
230
  env_state: createPassiveEnvState(),
231
231
  score: null,
232
+ raw_json: { events } as NonNullable<NormalizedTrace["raw_json"]>,
233
+ raw_json_format: "codex_cli.jsonl",
232
234
  raw_r2_key: "",
233
235
  normalized_r2_key: "",
234
236
  };
package/src/extractors/common.test.ts ADDED
@@ -0,0 +1,21 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { isFileMutationTool, isShellToolName, isWriteShapedShellCommand } from "./common.js";
3
+
4
+ describe("common extractor helpers", () => {
5
+ it("treats codex shell aliases as shell tools", () => {
6
+ expect(isShellToolName("bash_command")).toBe(true);
7
+ expect(isShellToolName("run_shell_command")).toBe(true);
8
+ expect(isShellToolName("functions.exec_command")).toBe(true);
9
+ });
10
+
11
+ it("treats replace as a file mutation tool", () => {
12
+ expect(isFileMutationTool("replace", { input: "/app/ars.R" })).toBe(true);
13
+ expect(isFileMutationTool("write_file", { input: "/app/ars.R" })).toBe(true);
14
+ });
15
+
16
+ it("unwraps wrapped shell commands for write-shape detection", () => {
17
+ expect(
18
+ isWriteShapedShellCommand('bash -lc "cd /app && sort -o tmp/out.txt src/input.txt"'),
19
+ ).toBe(true);
20
+ });
21
+ });
package/src/extractors/common.ts CHANGED
@@ -1,54 +1,20 @@
1
+ import {
2
+ isFileMutationTool,
3
+ isShellToolName,
4
+ } from "../tool-normalization.js";
1
5
  import type { ContentBlock, EnvState, Turn } from "../types.js";
2
6
 
3
- const SHELL_TOOL_NAMES = ["exec_command", "bash", "shell", "write_stdin"];
4
- const FILE_MUTATION_TOOL_NAMES = [
5
- "apply_patch",
6
- "write_file",
7
- "create_file",
8
- "delete_file",
9
- "rename_file",
10
- "move_file",
11
- "file_change",
12
- "edit",
13
- "edit_file",
14
- "multiedit",
15
- "write",
16
- ];
17
-
18
- function normalizeToolName(toolName: string): string {
19
- return toolName.trim().toLowerCase();
20
- }
21
-
22
- function toolNameMatches(toolName: string, candidate: string): boolean {
23
- const normalized = normalizeToolName(toolName);
24
- return normalized === candidate || normalized.endsWith(`.${candidate}`);
25
- }
26
-
27
- export function isShellToolName(toolName: string): boolean {
28
- return SHELL_TOOL_NAMES.some((candidate) => toolNameMatches(toolName, candidate));
29
- }
30
-
31
- export function isFileMutationTool(
32
- toolName: string,
33
- toolInput: Record<string, unknown>,
34
- ): boolean {
35
- if (
36
- FILE_MUTATION_TOOL_NAMES.some((candidate) => toolNameMatches(toolName, candidate))
37
- ) {
38
- return true;
39
- }
40
-
41
- if (!isShellToolName(toolName)) return false;
42
-
43
- const command =
44
- typeof toolInput.cmd === "string"
45
- ? toolInput.cmd
46
- : typeof toolInput.command === "string"
47
- ? toolInput.command
48
- : null;
49
-
50
- return command !== null && /\bapply_patch\b|\bsed\s+-i\b|\bperl\s+-pi\b/.test(command);
51
- }
7
+ export {
8
+ extractShellCommand,
9
+ getNormalizedToolHierarchy,
10
+ hasShellWriteRedirect,
11
+ isFileMutationTool,
12
+ isShellToolName,
13
+ isWriteShapedShellCommand,
14
+ listNormalizedToolsByFamily,
15
+ normalizeToolName,
16
+ toolNameMatches,
17
+ } from "../tool-normalization.js";
52
18
 
53
19
  export function collectTraceMetrics(turns: Turn[]) {
54
20
  const allBlocks = turns.flatMap((turn) => turn.content);
package/src/extractors/cursor.ts CHANGED
@@ -265,6 +265,7 @@ export async function extractCursor(
265
265
 
266
266
  const turns: Turn[] = [];
267
267
  const openFiles: string[] = [];
268
+ const rawBubbles: unknown[] = [];
268
269
  let totalInputTokens = 0;
269
270
  let totalOutputTokens = 0;
270
271
 
@@ -275,6 +276,8 @@ export async function extractCursor(
275
276
  const blob = readCursorBlob(db, sessionId, bubbleId);
276
277
  if (!blob) continue;
277
278
 
279
+ rawBubbles.push({ header, blob });
280
+
278
281
  const role = cursorRole(header, blob);
279
282
  const tokenUsage = extractCursorTokenUsage(blob);
280
283
  if (tokenUsage) {
@@ -344,6 +347,12 @@ export async function extractCursor(
344
347
  open_files_in_editor: openFiles.length > 0 ? Array.from(new Set(openFiles)) : null,
345
348
  }),
346
349
  score: null,
350
+ raw_json: {
351
+ composerData,
352
+ headers,
353
+ bubbles: rawBubbles,
354
+ } as NonNullable<NormalizedTrace["raw_json"]>,
355
+ raw_json_format: "cursor.composer_disk_kv",
347
356
  raw_r2_key: "",
348
357
  normalized_r2_key: "",
349
358
  };
package/src/index.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  export * from "./types.js";
2
2
  export * from "./hash.js";
3
3
  export * from "./scoring.js";
4
+ export * from "./tool-normalization.js";
4
5
  export * from "./utils.js";
5
6
  export * from "./validators.js";
6
7
  export * from "./redact.js";
package/src/redact.test.ts CHANGED
@@ -68,6 +68,11 @@ function makeTrace(): NormalizedTrace {
68
68
  extraction_method: "passive",
69
69
  },
70
70
  score: null,
71
+ raw_json: {
72
+ secret: "sk-proj-secret-secret-secret-secret",
73
+ path: "/Users/tester/project/.env",
74
+ },
75
+ raw_json_format: "claude_code.jsonl",
71
76
  raw_r2_key: "",
72
77
  normalized_r2_key: "",
73
78
  };
@@ -93,6 +98,10 @@ describe("redactTraceWithStats", () => {
93
98
  }
94
99
  expect(toolResult.result_content).toBe("Bearer [BEARER_TOKEN]");
95
100
  expect(result.trace.env_state?.inferred_file_tree?.[0]).toBe("~/project/.env");
101
+ expect(result.trace.raw_json).toEqual({
102
+ secret: "[OPENAI_KEY]",
103
+ path: "~/project/.env",
104
+ });
96
105
  expect(result.stats.changed).toBe(true);
97
106
  expect(result.stats.piiMatches).toBe(0);
98
107
  expect(result.stats.totalMatches).toBeGreaterThanOrEqual(3);
package/src/redact.ts CHANGED
@@ -230,6 +230,7 @@ export function redactTraceWithStats(
230
230
  const envFileTree = trace.env_state?.inferred_file_tree?.map((path) => redactStringWithStats(path, home)) ?? [];
231
231
  const envChangedFiles = trace.env_state?.inferred_changed_files?.map((path) => redactStringWithStats(path, home)) ?? [];
232
232
  const envErrorFiles = trace.env_state?.inferred_error_files?.map((path) => redactStringWithStats(path, home)) ?? [];
233
+ const rawJsonResult = redactUnknown(trace.raw_json ?? null, home);
233
234
  const envStats = mergeStats(
234
235
  ...envFileTree.map((entry) => entry.stats),
235
236
  ...envChangedFiles.map((entry) => entry.stats),
@@ -241,6 +242,7 @@ export function redactTraceWithStats(
241
242
  ...trace,
242
243
  submitted_by: "[redacted]",
243
244
  turns: turnResults.map((result) => result.turn),
245
+ raw_json: rawJsonResult.value as NormalizedTrace["raw_json"],
244
246
  env_state: trace.env_state
245
247
  ? {
246
248
  ...trace.env_state,
@@ -250,6 +252,6 @@ export function redactTraceWithStats(
250
252
  }
251
253
  : null,
252
254
  },
253
- stats: mergeStats(...turnResults.map((result) => result.stats), envStats),
255
+ stats: mergeStats(...turnResults.map((result) => result.stats), envStats, rawJsonResult.stats),
254
256
  };
255
257
  }
package/src/scoring.fixtures.test.ts ADDED
@@ -0,0 +1,71 @@
1
+ import { readFileSync } from "node:fs";
2
+ import { describe, expect, it } from "vitest";
3
+ import { scoreTrace } from "./scoring.js";
4
+ import type { FailureMode, NormalizedTrace, SourceTool, TraceScore } from "./types.js";
5
+
6
+ interface FixtureManifestEntry {
7
+ fixture_id: string;
8
+ trace_path: string;
9
+ description: string;
10
+ source_tool: SourceTool;
11
+ ts_completeness?: TraceScore["completeness"];
12
+ ts_failure_modes: FailureMode[];
13
+ }
14
+
15
+ interface FixtureManifest {
16
+ schema_version: number;
17
+ fixtures: FixtureManifestEntry[];
18
+ }
19
+
20
+ function loadManifest(): FixtureManifest {
21
+ return JSON.parse(
22
+ readFileSync(new URL("../../../fixtures/pipeline/manifest.json", import.meta.url), "utf8"),
23
+ ) as FixtureManifest;
24
+ }
25
+
26
+ function loadTrace(tracePath: string): NormalizedTrace {
27
+ return JSON.parse(
28
+ readFileSync(new URL(`../../../fixtures/pipeline/${tracePath}`, import.meta.url), "utf8"),
29
+ ) as NormalizedTrace;
30
+ }
31
+
32
+ describe("shared pipeline fixtures", () => {
33
+ const manifest = loadManifest();
34
+
35
+ it("covers the expected first-pass parity pack", () => {
36
+ expect(manifest.schema_version).toBe(1);
37
+ expect(manifest.fixtures.map((fixture) => fixture.fixture_id)).toEqual([
38
+ "eligibility_gap",
39
+ "bronze_floor_eligibility",
40
+ "matching_trigger_success",
41
+ "matching_trigger_failure",
42
+ "incomplete_trace_ineligible",
43
+ "bash_category_drift",
44
+ "sed_read_semantics",
45
+ "bash_text_filter_taxonomy",
46
+ "bash_text_filter_write_edges",
47
+ "recovery_without_recovery_text",
48
+ "long_context_without_mention",
49
+ "catastrophic_failure_boundary",
50
+ "codex_exec_command_collapse",
51
+ "codex_write_stdin_followup",
52
+ "codex_write_stdin_control_sequences",
53
+ "codex_write_stdin_interrupt_with_error_output",
54
+ "cursor_bronze_cap",
55
+ "text_only_no_exchange",
56
+ ]);
57
+ });
58
+
59
+ for (const fixture of manifest.fixtures) {
60
+ it(`${fixture.fixture_id} matches TypeScript failure expectations`, () => {
61
+ const trace = loadTrace(fixture.trace_path);
62
+ const score = scoreTrace(trace);
63
+
64
+ expect(trace.source_tool).toBe(fixture.source_tool);
65
+ expect([...score.failure_modes].sort()).toEqual([...fixture.ts_failure_modes].sort());
66
+ if (fixture.ts_completeness) {
67
+ expect(score.completeness).toBe(fixture.ts_completeness);
68
+ }
69
+ });
70
+ }
71
+ });
package/src/scoring.test.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { describe, it, expect } from "vitest";
2
- import { detectFailureModes, checkCompleteness, scoreTrace } from "./scoring.js";
2
+ import { detectFailureModes, checkCompleteness, deriveQualityTier, scoreTrace } from "./scoring.js";
3
3
  import type { NormalizedTrace, Turn, ContentBlock } from "./types.js";
4
4
 
5
5
  function makeTrace(overrides: Partial<NormalizedTrace> = {}): NormalizedTrace {
@@ -49,32 +49,62 @@ function makeTurn(role: "user" | "assistant", content: ContentBlock[]): Turn {
49
49
  };
50
50
  }
51
51
 
52
+ function makeToolUse(
53
+ tool_call_id: string,
54
+ tool_name: string,
55
+ tool_input: Record<string, unknown>,
56
+ ): ContentBlock {
57
+ return { type: "tool_use", tool_call_id, tool_name, tool_input };
58
+ }
59
+
60
+ function makeToolResult(
61
+ tool_call_id: string,
62
+ is_error: boolean,
63
+ result_content: string,
64
+ exit_code: number | null,
65
+ ): ContentBlock {
66
+ return { type: "tool_result", tool_call_id, is_error, result_content, exit_code };
67
+ }
68
+
52
69
  describe("detectFailureModes", () => {
53
70
  it("empty turns → no_failure", () => {
54
71
  const result = detectFailureModes(makeTrace({ turns: [] }));
55
72
  expect(result).toEqual(["no_failure"]);
56
73
  });
57
74
 
58
- it("tool_result with is_error → tool_call_failure", () => {
75
+ it("failed exchange → tool_call_failure", () => {
59
76
  const turns = [
60
- makeTurn("user", [{ type: "tool_result", tool_call_id: "t1", is_error: true, result_content: "err", exit_code: 1 }]),
77
+ makeTurn("user", [{ type: "text", text: "Run the tests." }]),
78
+ makeTurn("assistant", [
79
+ makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
80
+ makeToolResult("t1", true, "test failure", 1),
81
+ ]),
61
82
  ];
62
83
  const result = detectFailureModes(makeTrace({ turns }));
63
84
  expect(result).toContain("tool_call_failure");
64
85
  });
65
86
 
66
- it("same tool consecutive repeated_tool_calls", () => {
67
- const toolUse = (n: number): ContentBlock => ({
68
- type: "tool_use",
69
- tool_call_id: `t${n}`,
70
- tool_name: "bash",
71
- tool_input: {},
72
- });
87
+ it("repeated failing root cause across three exchanges is detected", () => {
73
88
  const turns = [
74
- makeTurn("assistant", [toolUse(1), toolUse(2), toolUse(3)]),
89
+ makeTurn("user", [{ type: "text", text: "Run the tests." }]),
90
+ makeTurn("assistant", [
91
+ makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
92
+ makeToolResult("t1", true, "FAIL src/app.test.ts\nTypeError: config is undefined", 1),
93
+ ]),
94
+ makeTurn("user", [{ type: "text", text: "Try again with a focused rerun." }]),
95
+ makeTurn("assistant", [
96
+ makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
97
+ makeToolResult("t2", true, "FAIL src/app.test.ts\nTypeError: config is undefined", 1),
98
+ ]),
99
+ makeTurn("user", [{ type: "text", text: "One more focused attempt." }]),
100
+ makeTurn("assistant", [
101
+ makeToolUse("t3", "functions.exec_command", { cmd: "vitest src/app.test.ts" }),
102
+ makeToolResult("t3", true, "FAIL src/app.test.ts\nTypeError: config is undefined", 1),
103
+ ]),
75
104
  ];
76
105
  const result = detectFailureModes(makeTrace({ turns }));
77
- expect(result).toContain("repeated_tool_calls");
106
+ expect(result).toContain("repeated_failing_root_cause");
107
+ expect(result).toContain("catastrophic_failure");
78
108
  });
79
109
 
80
110
  it("context window text → context_limit_approached", () => {
@@ -85,24 +115,34 @@ describe("detectFailureModes", () => {
85
115
  expect(result).toContain("context_limit_approached");
86
116
  });
87
117
 
88
- it("final turns all errors → catastrophic_failure", () => {
89
- const errResult: ContentBlock = { type: "tool_result", tool_call_id: "t1", is_error: true, result_content: "fail", exit_code: 1 };
118
+ it("final failed exchange → catastrophic_failure", () => {
90
119
  const turns = [
91
- makeTurn("user", [errResult]),
92
- makeTurn("user", [errResult]),
120
+ makeTurn("user", [{ type: "text", text: "Run the failing command." }]),
121
+ makeTurn("assistant", [
122
+ makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
123
+ makeToolResult("t1", true, "fail", 1),
124
+ ]),
93
125
  ];
94
126
  const result = detectFailureModes(makeTrace({ turns }));
95
127
  expect(result).toContain("catastrophic_failure");
96
128
  });
97
129
 
98
- it("tool errors + later recovery text → graceful_recovery", () => {
99
- const errResult: ContentBlock = { type: "tool_result", tool_call_id: "t1", is_error: true, result_content: "fail", exit_code: 1 };
130
+ it("failed exchange followed by successful exchange → graceful_recovery", () => {
100
131
  const turns = [
101
- makeTurn("user", [errResult]),
102
- makeTurn("assistant", [{ type: "text", text: "Let me try a different approach instead." }]),
132
+ makeTurn("user", [{ type: "text", text: "Run the tests." }]),
133
+ makeTurn("assistant", [
134
+ makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
135
+ makeToolResult("t1", true, "fail", 1),
136
+ ]),
137
+ makeTurn("user", [{ type: "text", text: "Try a safer command." }]),
138
+ makeTurn("assistant", [
139
+ makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
140
+ makeToolResult("t2", false, "ok", 0),
141
+ ]),
103
142
  ];
104
143
  const result = detectFailureModes(makeTrace({ turns }));
105
144
  expect(result).toContain("graceful_recovery");
145
+ expect(result).toContain("tool_call_failure");
106
146
  });
107
147
  });
108
148
 
@@ -135,14 +175,22 @@ describe("scoreTrace", () => {
135
175
  const score = scoreTrace(makeTrace({ turns: [], content_fidelity: "chat_only" }));
136
176
  expect(score.completeness).toBe("malformed");
137
177
  expect(score.payout_cents).toBeLessThan(100);
178
+ expect(score.breakdown.component_count).toBeGreaterThan(0);
138
179
  });
139
180
 
140
181
  it("graceful_recovery + tool_call_failure → bonuses stack", () => {
141
182
  const baseScore = scoreTrace(makeTrace({ turns: [], content_fidelity: "full" }));
142
- const errResult: ContentBlock = { type: "tool_result", tool_call_id: "t1", is_error: true, result_content: "fail", exit_code: 1 };
143
183
  const turns = [
144
- makeTurn("user", [errResult]),
145
- makeTurn("assistant", [{ type: "text", text: "Let me try a different approach instead." }]),
184
+ makeTurn("user", [{ type: "text", text: "Run the tests." }]),
185
+ makeTurn("assistant", [
186
+ makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
187
+ makeToolResult("t1", true, "fail", 1),
188
+ ]),
189
+ makeTurn("user", [{ type: "text", text: "Try again." }]),
190
+ makeTurn("assistant", [
191
+ makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
192
+ makeToolResult("t2", false, "ok", 0),
193
+ ]),
146
194
  ];
147
195
  const bonusScore = scoreTrace(makeTrace({ turns, content_fidelity: "full" }));
148
196
  expect(bonusScore.payout_cents).toBeGreaterThan(baseScore.payout_cents);
@@ -151,10 +199,17 @@ describe("scoreTrace", () => {
151
199
  });
152
200
 
153
201
  it("total clamps to [0, 1]", () => {
154
- const errResult: ContentBlock = { type: "tool_result", tool_call_id: "t1", is_error: true, result_content: "fail", exit_code: 1 };
155
202
  const turns = [
156
- makeTurn("user", [errResult]),
157
- makeTurn("assistant", [{ type: "text", text: "Let me try a different approach instead." }]),
203
+ makeTurn("user", [{ type: "text", text: "Run the tests." }]),
204
+ makeTurn("assistant", [
205
+ makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
206
+ makeToolResult("t1", true, "fail", 1),
207
+ ]),
208
+ makeTurn("user", [{ type: "text", text: "Try again." }]),
209
+ makeTurn("assistant", [
210
+ makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
211
+ makeToolResult("t2", false, "ok", 0),
212
+ ]),
158
213
  ];
159
214
  const score = scoreTrace(makeTrace({ turns, content_fidelity: "full", total_input_tokens: 1000000, total_output_tokens: 1000000 }));
160
215
  expect(score.total).toBeGreaterThanOrEqual(0);
@@ -170,4 +225,74 @@ describe("scoreTrace", () => {
170
225
  const expected = Math.min(500, Math.round(score.total * 500));
171
226
  expect(score.payout_cents).toBe(expected);
172
227
  });
228
+
229
+ it("uses async label and novelty context when provided", () => {
230
+ const turns = [
231
+ makeTurn("user", [{ type: "text", text: "Run the tests." }]),
232
+ makeTurn("assistant", [
233
+ makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
234
+ makeToolResult("t1", true, "fail", 1),
235
+ ]),
236
+ makeTurn("user", [{ type: "text", text: "Try again with a fix." }]),
237
+ makeTurn("assistant", [
238
+ makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
239
+ makeToolResult("t2", false, "ok", 0),
240
+ ]),
241
+ ];
242
+
243
+ const baseScore = scoreTrace(makeTrace({ turns, content_fidelity: "full" }));
244
+ const enrichedScore = scoreTrace(
245
+ makeTrace({ turns, content_fidelity: "full" }),
246
+ undefined,
247
+ {
248
+ anomaly_score: 2.4,
249
+ workflow_shape: "shell_and_editor",
250
+ length_bucket: "medium",
251
+ tool_density: "medium",
252
+ failure_judge_verdict: "confirmed_failure",
253
+ failure_judge_agreement: "agree",
254
+ failure_judge_confidence: 0.94,
255
+ },
256
+ );
257
+
258
+ expect(enrichedScore.total).toBeGreaterThan(baseScore.total);
259
+ expect(enrichedScore.breakdown.components.some((component) => component.key === "novelty")).toBe(true);
260
+ expect(enrichedScore.breakdown.components.some((component) => component.key === "failure_judge")).toBe(true);
261
+ });
262
+
263
+ it("respects failure mode overrides from downstream labels", () => {
264
+ const turns = [
265
+ makeTurn("user", [{ type: "text", text: "Run the tests." }]),
266
+ makeTurn("assistant", [
267
+ makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
268
+ makeToolResult("t1", true, "fail", 1),
269
+ ]),
270
+ ];
271
+
272
+ const baseScore = scoreTrace(makeTrace({ turns, content_fidelity: "full" }));
273
+ const overriddenScore = scoreTrace(
274
+ makeTrace({ turns, content_fidelity: "full" }),
275
+ undefined,
276
+ {
277
+ failure_modes_override: ["no_failure"],
278
+ failure_judge_verdict: "false_positive",
279
+ failure_judge_agreement: "disagree",
280
+ failure_judge_confidence: 0.9,
281
+ },
282
+ );
283
+
284
+ expect(baseScore.failure_modes).toContain("tool_call_failure");
285
+ expect(overriddenScore.failure_modes).toEqual(["no_failure"]);
286
+ expect(overriddenScore.total).toBeLessThan(baseScore.total);
287
+ });
288
+ });
289
+
290
+ describe("deriveQualityTier", () => {
291
+ it("applies the new score band thresholds", () => {
292
+ expect(deriveQualityTier(0.2)).toBe("bronze");
293
+ expect(deriveQualityTier(0.63)).toBe("bronze");
294
+ expect(deriveQualityTier(0.64)).toBe("silver");
295
+ expect(deriveQualityTier(0.81)).toBe("silver");
296
+ expect(deriveQualityTier(0.82)).toBe("gold");
297
+ });
173
298
  });