@tracemarketplace/shared 0.0.9 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/extractor-claude-code.test.js +53 -0
- package/dist/extractor-claude-code.test.js.map +1 -1
- package/dist/extractor-codex.test.js +5 -0
- package/dist/extractor-codex.test.js.map +1 -1
- package/dist/extractors/claude-code.d.ts.map +1 -1
- package/dist/extractors/claude-code.js +4 -4
- package/dist/extractors/claude-code.js.map +1 -1
- package/dist/extractors/codex.d.ts.map +1 -1
- package/dist/extractors/codex.js +2 -0
- package/dist/extractors/codex.js.map +1 -1
- package/dist/extractors/common.d.ts +1 -2
- package/dist/extractors/common.d.ts.map +1 -1
- package/dist/extractors/common.js +2 -37
- package/dist/extractors/common.js.map +1 -1
- package/dist/extractors/common.test.d.ts +2 -0
- package/dist/extractors/common.test.d.ts.map +1 -0
- package/dist/extractors/common.test.js +17 -0
- package/dist/extractors/common.test.js.map +1 -0
- package/dist/extractors/cursor.d.ts.map +1 -1
- package/dist/extractors/cursor.js +8 -0
- package/dist/extractors/cursor.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -1
- package/dist/redact.d.ts.map +1 -1
- package/dist/redact.js +3 -1
- package/dist/redact.js.map +1 -1
- package/dist/redact.test.js +9 -0
- package/dist/redact.test.js.map +1 -1
- package/dist/scoring.d.ts +5 -3
- package/dist/scoring.d.ts.map +1 -1
- package/dist/scoring.fixtures.test.d.ts +2 -0
- package/dist/scoring.fixtures.test.d.ts.map +1 -0
- package/dist/scoring.fixtures.test.js +47 -0
- package/dist/scoring.fixtures.test.js.map +1 -0
- package/dist/scoring.js +381 -62
- package/dist/scoring.js.map +1 -1
- package/dist/scoring.test.js +125 -26
- package/dist/scoring.test.js.map +1 -1
- package/dist/tool-normalization.d.ts +66 -0
- package/dist/tool-normalization.d.ts.map +1 -0
- package/dist/tool-normalization.generated.d.ts +181 -0
- package/dist/tool-normalization.generated.d.ts.map +1 -0
- package/dist/tool-normalization.generated.js +261 -0
- package/dist/tool-normalization.generated.js.map +1 -0
- package/dist/tool-normalization.js +463 -0
- package/dist/tool-normalization.js.map +1 -0
- package/dist/tool-normalization.test.d.ts +2 -0
- package/dist/tool-normalization.test.d.ts.map +1 -0
- package/dist/tool-normalization.test.js +188 -0
- package/dist/tool-normalization.test.js.map +1 -0
- package/dist/turn-actors.d.ts +1 -0
- package/dist/turn-actors.d.ts.map +1 -1
- package/dist/turn-actors.js.map +1 -1
- package/dist/types.d.ts +38 -1
- package/dist/types.d.ts.map +1 -1
- package/dist/validators.d.ts +23 -6
- package/dist/validators.d.ts.map +1 -1
- package/dist/validators.js +4 -0
- package/dist/validators.js.map +1 -1
- package/dist/validators.test.js +7 -0
- package/dist/validators.test.js.map +1 -1
- package/package.json +5 -6
- package/scripts/generate-tool-normalization.mjs +16 -0
- package/src/extractor-claude-code.test.ts +59 -0
- package/src/extractor-codex.test.ts +5 -0
- package/src/extractors/claude-code.ts +8 -4
- package/src/extractors/codex.ts +2 -0
- package/src/extractors/common.test.ts +21 -0
- package/src/extractors/common.ts +15 -49
- package/src/extractors/cursor.ts +9 -0
- package/src/index.ts +1 -0
- package/src/redact.test.ts +9 -0
- package/src/redact.ts +3 -1
- package/src/scoring.fixtures.test.ts +71 -0
- package/src/scoring.test.ts +151 -26
- package/src/scoring.ts +582 -84
- package/src/tool-normalization.generated.ts +262 -0
- package/src/tool-normalization.spec.json +205 -0
- package/src/tool-normalization.test.ts +221 -0
- package/src/tool-normalization.ts +670 -0
- package/src/turn-actors.ts +2 -0
- package/src/types.ts +50 -0
- package/src/validators.test.ts +8 -0
- package/src/validators.ts +8 -0
|
@@ -108,6 +108,10 @@ describe("extractCodex", () => {
|
|
|
108
108
|
expect(trace.has_shell_commands).toBe(true);
|
|
109
109
|
expect(trace.has_tool_calls).toBe(true);
|
|
110
110
|
expect(trace.cwd_hash).toBeTruthy();
|
|
111
|
+
expect(trace.raw_json_format).toBe("codex_cli.jsonl");
|
|
112
|
+
expect(trace.raw_json).toMatchObject({
|
|
113
|
+
events: expect.any(Array),
|
|
114
|
+
});
|
|
111
115
|
|
|
112
116
|
expect(trace.turns[0]).toMatchObject({
|
|
113
117
|
role: "user",
|
|
@@ -221,5 +225,6 @@ describe("extractCodex", () => {
|
|
|
221
225
|
is_error: true,
|
|
222
226
|
exit_code: 2,
|
|
223
227
|
});
|
|
228
|
+
expect(trace.raw_json_format).toBe("codex_cli.jsonl");
|
|
224
229
|
});
|
|
225
230
|
});
|
|
@@ -9,6 +9,7 @@ import type {
|
|
|
9
9
|
TokenUsage,
|
|
10
10
|
} from "../types.js";
|
|
11
11
|
import { deriveTurnActors } from "../turn-actors.js";
|
|
12
|
+
import { pushUniqueTextBlock } from "./common.js";
|
|
12
13
|
|
|
13
14
|
export async function extractClaudeCode(
|
|
14
15
|
sessionFilePath: string,
|
|
@@ -146,10 +147,11 @@ export async function extractClaudeCode(
|
|
|
146
147
|
if (block.type === "text") {
|
|
147
148
|
contentBlocks.push({ type: "text", text: block.text ?? "" });
|
|
148
149
|
} else if (block.type === "thinking") {
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
150
|
+
pushUniqueTextBlock(
|
|
151
|
+
contentBlocks,
|
|
152
|
+
"thinking",
|
|
153
|
+
block.thinking ?? block.text ?? "",
|
|
154
|
+
);
|
|
153
155
|
} else if (block.type === "tool_use") {
|
|
154
156
|
contentBlocks.push({
|
|
155
157
|
type: "tool_use",
|
|
@@ -278,6 +280,8 @@ export async function extractClaudeCode(
|
|
|
278
280
|
extraction_method: "passive",
|
|
279
281
|
},
|
|
280
282
|
score: null,
|
|
283
|
+
raw_json: { lines } as NonNullable<NormalizedTrace["raw_json"]>,
|
|
284
|
+
raw_json_format: "claude_code.jsonl",
|
|
281
285
|
raw_r2_key: "",
|
|
282
286
|
normalized_r2_key: "",
|
|
283
287
|
};
|
package/src/extractors/codex.ts
CHANGED
|
@@ -229,6 +229,8 @@ export async function extractCodex(
|
|
|
229
229
|
content_fidelity: "full",
|
|
230
230
|
env_state: createPassiveEnvState(),
|
|
231
231
|
score: null,
|
|
232
|
+
raw_json: { events } as NonNullable<NormalizedTrace["raw_json"]>,
|
|
233
|
+
raw_json_format: "codex_cli.jsonl",
|
|
232
234
|
raw_r2_key: "",
|
|
233
235
|
normalized_r2_key: "",
|
|
234
236
|
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { isFileMutationTool, isShellToolName, isWriteShapedShellCommand } from "./common.js";
|
|
3
|
+
|
|
4
|
+
describe("common extractor helpers", () => {
|
|
5
|
+
it("treats codex shell aliases as shell tools", () => {
|
|
6
|
+
expect(isShellToolName("bash_command")).toBe(true);
|
|
7
|
+
expect(isShellToolName("run_shell_command")).toBe(true);
|
|
8
|
+
expect(isShellToolName("functions.exec_command")).toBe(true);
|
|
9
|
+
});
|
|
10
|
+
|
|
11
|
+
it("treats replace as a file mutation tool", () => {
|
|
12
|
+
expect(isFileMutationTool("replace", { input: "/app/ars.R" })).toBe(true);
|
|
13
|
+
expect(isFileMutationTool("write_file", { input: "/app/ars.R" })).toBe(true);
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
it("unwraps wrapped shell commands for write-shape detection", () => {
|
|
17
|
+
expect(
|
|
18
|
+
isWriteShapedShellCommand('bash -lc "cd /app && sort -o tmp/out.txt src/input.txt"'),
|
|
19
|
+
).toBe(true);
|
|
20
|
+
});
|
|
21
|
+
});
|
package/src/extractors/common.ts
CHANGED
|
@@ -1,54 +1,20 @@
|
|
|
1
|
+
import {
|
|
2
|
+
isFileMutationTool,
|
|
3
|
+
isShellToolName,
|
|
4
|
+
} from "../tool-normalization.js";
|
|
1
5
|
import type { ContentBlock, EnvState, Turn } from "../types.js";
|
|
2
6
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
"multiedit",
|
|
15
|
-
"write",
|
|
16
|
-
];
|
|
17
|
-
|
|
18
|
-
function normalizeToolName(toolName: string): string {
|
|
19
|
-
return toolName.trim().toLowerCase();
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
function toolNameMatches(toolName: string, candidate: string): boolean {
|
|
23
|
-
const normalized = normalizeToolName(toolName);
|
|
24
|
-
return normalized === candidate || normalized.endsWith(`.${candidate}`);
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
export function isShellToolName(toolName: string): boolean {
|
|
28
|
-
return SHELL_TOOL_NAMES.some((candidate) => toolNameMatches(toolName, candidate));
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
export function isFileMutationTool(
|
|
32
|
-
toolName: string,
|
|
33
|
-
toolInput: Record<string, unknown>,
|
|
34
|
-
): boolean {
|
|
35
|
-
if (
|
|
36
|
-
FILE_MUTATION_TOOL_NAMES.some((candidate) => toolNameMatches(toolName, candidate))
|
|
37
|
-
) {
|
|
38
|
-
return true;
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
if (!isShellToolName(toolName)) return false;
|
|
42
|
-
|
|
43
|
-
const command =
|
|
44
|
-
typeof toolInput.cmd === "string"
|
|
45
|
-
? toolInput.cmd
|
|
46
|
-
: typeof toolInput.command === "string"
|
|
47
|
-
? toolInput.command
|
|
48
|
-
: null;
|
|
49
|
-
|
|
50
|
-
return command !== null && /\bapply_patch\b|\bsed\s+-i\b|\bperl\s+-pi\b/.test(command);
|
|
51
|
-
}
|
|
7
|
+
export {
|
|
8
|
+
extractShellCommand,
|
|
9
|
+
getNormalizedToolHierarchy,
|
|
10
|
+
hasShellWriteRedirect,
|
|
11
|
+
isFileMutationTool,
|
|
12
|
+
isShellToolName,
|
|
13
|
+
isWriteShapedShellCommand,
|
|
14
|
+
listNormalizedToolsByFamily,
|
|
15
|
+
normalizeToolName,
|
|
16
|
+
toolNameMatches,
|
|
17
|
+
} from "../tool-normalization.js";
|
|
52
18
|
|
|
53
19
|
export function collectTraceMetrics(turns: Turn[]) {
|
|
54
20
|
const allBlocks = turns.flatMap((turn) => turn.content);
|
package/src/extractors/cursor.ts
CHANGED
|
@@ -265,6 +265,7 @@ export async function extractCursor(
|
|
|
265
265
|
|
|
266
266
|
const turns: Turn[] = [];
|
|
267
267
|
const openFiles: string[] = [];
|
|
268
|
+
const rawBubbles: unknown[] = [];
|
|
268
269
|
let totalInputTokens = 0;
|
|
269
270
|
let totalOutputTokens = 0;
|
|
270
271
|
|
|
@@ -275,6 +276,8 @@ export async function extractCursor(
|
|
|
275
276
|
const blob = readCursorBlob(db, sessionId, bubbleId);
|
|
276
277
|
if (!blob) continue;
|
|
277
278
|
|
|
279
|
+
rawBubbles.push({ header, blob });
|
|
280
|
+
|
|
278
281
|
const role = cursorRole(header, blob);
|
|
279
282
|
const tokenUsage = extractCursorTokenUsage(blob);
|
|
280
283
|
if (tokenUsage) {
|
|
@@ -344,6 +347,12 @@ export async function extractCursor(
|
|
|
344
347
|
open_files_in_editor: openFiles.length > 0 ? Array.from(new Set(openFiles)) : null,
|
|
345
348
|
}),
|
|
346
349
|
score: null,
|
|
350
|
+
raw_json: {
|
|
351
|
+
composerData,
|
|
352
|
+
headers,
|
|
353
|
+
bubbles: rawBubbles,
|
|
354
|
+
} as NonNullable<NormalizedTrace["raw_json"]>,
|
|
355
|
+
raw_json_format: "cursor.composer_disk_kv",
|
|
347
356
|
raw_r2_key: "",
|
|
348
357
|
normalized_r2_key: "",
|
|
349
358
|
};
|
package/src/index.ts
CHANGED
package/src/redact.test.ts
CHANGED
|
@@ -68,6 +68,11 @@ function makeTrace(): NormalizedTrace {
|
|
|
68
68
|
extraction_method: "passive",
|
|
69
69
|
},
|
|
70
70
|
score: null,
|
|
71
|
+
raw_json: {
|
|
72
|
+
secret: "sk-proj-secret-secret-secret-secret",
|
|
73
|
+
path: "/Users/tester/project/.env",
|
|
74
|
+
},
|
|
75
|
+
raw_json_format: "claude_code.jsonl",
|
|
71
76
|
raw_r2_key: "",
|
|
72
77
|
normalized_r2_key: "",
|
|
73
78
|
};
|
|
@@ -93,6 +98,10 @@ describe("redactTraceWithStats", () => {
|
|
|
93
98
|
}
|
|
94
99
|
expect(toolResult.result_content).toBe("Bearer [BEARER_TOKEN]");
|
|
95
100
|
expect(result.trace.env_state?.inferred_file_tree?.[0]).toBe("~/project/.env");
|
|
101
|
+
expect(result.trace.raw_json).toEqual({
|
|
102
|
+
secret: "[OPENAI_KEY]",
|
|
103
|
+
path: "~/project/.env",
|
|
104
|
+
});
|
|
96
105
|
expect(result.stats.changed).toBe(true);
|
|
97
106
|
expect(result.stats.piiMatches).toBe(0);
|
|
98
107
|
expect(result.stats.totalMatches).toBeGreaterThanOrEqual(3);
|
package/src/redact.ts
CHANGED
|
@@ -230,6 +230,7 @@ export function redactTraceWithStats(
|
|
|
230
230
|
const envFileTree = trace.env_state?.inferred_file_tree?.map((path) => redactStringWithStats(path, home)) ?? [];
|
|
231
231
|
const envChangedFiles = trace.env_state?.inferred_changed_files?.map((path) => redactStringWithStats(path, home)) ?? [];
|
|
232
232
|
const envErrorFiles = trace.env_state?.inferred_error_files?.map((path) => redactStringWithStats(path, home)) ?? [];
|
|
233
|
+
const rawJsonResult = redactUnknown(trace.raw_json ?? null, home);
|
|
233
234
|
const envStats = mergeStats(
|
|
234
235
|
...envFileTree.map((entry) => entry.stats),
|
|
235
236
|
...envChangedFiles.map((entry) => entry.stats),
|
|
@@ -241,6 +242,7 @@ export function redactTraceWithStats(
|
|
|
241
242
|
...trace,
|
|
242
243
|
submitted_by: "[redacted]",
|
|
243
244
|
turns: turnResults.map((result) => result.turn),
|
|
245
|
+
raw_json: rawJsonResult.value as NormalizedTrace["raw_json"],
|
|
244
246
|
env_state: trace.env_state
|
|
245
247
|
? {
|
|
246
248
|
...trace.env_state,
|
|
@@ -250,6 +252,6 @@ export function redactTraceWithStats(
|
|
|
250
252
|
}
|
|
251
253
|
: null,
|
|
252
254
|
},
|
|
253
|
-
stats: mergeStats(...turnResults.map((result) => result.stats), envStats),
|
|
255
|
+
stats: mergeStats(...turnResults.map((result) => result.stats), envStats, rawJsonResult.stats),
|
|
254
256
|
};
|
|
255
257
|
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
import { describe, expect, it } from "vitest";
|
|
3
|
+
import { scoreTrace } from "./scoring.js";
|
|
4
|
+
import type { FailureMode, NormalizedTrace, SourceTool, TraceScore } from "./types.js";
|
|
5
|
+
|
|
6
|
+
interface FixtureManifestEntry {
|
|
7
|
+
fixture_id: string;
|
|
8
|
+
trace_path: string;
|
|
9
|
+
description: string;
|
|
10
|
+
source_tool: SourceTool;
|
|
11
|
+
ts_completeness?: TraceScore["completeness"];
|
|
12
|
+
ts_failure_modes: FailureMode[];
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
interface FixtureManifest {
|
|
16
|
+
schema_version: number;
|
|
17
|
+
fixtures: FixtureManifestEntry[];
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
function loadManifest(): FixtureManifest {
|
|
21
|
+
return JSON.parse(
|
|
22
|
+
readFileSync(new URL("../../../fixtures/pipeline/manifest.json", import.meta.url), "utf8"),
|
|
23
|
+
) as FixtureManifest;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
function loadTrace(tracePath: string): NormalizedTrace {
|
|
27
|
+
return JSON.parse(
|
|
28
|
+
readFileSync(new URL(`../../../fixtures/pipeline/${tracePath}`, import.meta.url), "utf8"),
|
|
29
|
+
) as NormalizedTrace;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
describe("shared pipeline fixtures", () => {
|
|
33
|
+
const manifest = loadManifest();
|
|
34
|
+
|
|
35
|
+
it("covers the expected first-pass parity pack", () => {
|
|
36
|
+
expect(manifest.schema_version).toBe(1);
|
|
37
|
+
expect(manifest.fixtures.map((fixture) => fixture.fixture_id)).toEqual([
|
|
38
|
+
"eligibility_gap",
|
|
39
|
+
"bronze_floor_eligibility",
|
|
40
|
+
"matching_trigger_success",
|
|
41
|
+
"matching_trigger_failure",
|
|
42
|
+
"incomplete_trace_ineligible",
|
|
43
|
+
"bash_category_drift",
|
|
44
|
+
"sed_read_semantics",
|
|
45
|
+
"bash_text_filter_taxonomy",
|
|
46
|
+
"bash_text_filter_write_edges",
|
|
47
|
+
"recovery_without_recovery_text",
|
|
48
|
+
"long_context_without_mention",
|
|
49
|
+
"catastrophic_failure_boundary",
|
|
50
|
+
"codex_exec_command_collapse",
|
|
51
|
+
"codex_write_stdin_followup",
|
|
52
|
+
"codex_write_stdin_control_sequences",
|
|
53
|
+
"codex_write_stdin_interrupt_with_error_output",
|
|
54
|
+
"cursor_bronze_cap",
|
|
55
|
+
"text_only_no_exchange",
|
|
56
|
+
]);
|
|
57
|
+
});
|
|
58
|
+
|
|
59
|
+
for (const fixture of manifest.fixtures) {
|
|
60
|
+
it(`${fixture.fixture_id} matches TypeScript failure expectations`, () => {
|
|
61
|
+
const trace = loadTrace(fixture.trace_path);
|
|
62
|
+
const score = scoreTrace(trace);
|
|
63
|
+
|
|
64
|
+
expect(trace.source_tool).toBe(fixture.source_tool);
|
|
65
|
+
expect([...score.failure_modes].sort()).toEqual([...fixture.ts_failure_modes].sort());
|
|
66
|
+
if (fixture.ts_completeness) {
|
|
67
|
+
expect(score.completeness).toBe(fixture.ts_completeness);
|
|
68
|
+
}
|
|
69
|
+
});
|
|
70
|
+
}
|
|
71
|
+
});
|
package/src/scoring.test.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { describe, it, expect } from "vitest";
|
|
2
|
-
import { detectFailureModes, checkCompleteness, scoreTrace } from "./scoring.js";
|
|
2
|
+
import { detectFailureModes, checkCompleteness, deriveQualityTier, scoreTrace } from "./scoring.js";
|
|
3
3
|
import type { NormalizedTrace, Turn, ContentBlock } from "./types.js";
|
|
4
4
|
|
|
5
5
|
function makeTrace(overrides: Partial<NormalizedTrace> = {}): NormalizedTrace {
|
|
@@ -49,32 +49,62 @@ function makeTurn(role: "user" | "assistant", content: ContentBlock[]): Turn {
|
|
|
49
49
|
};
|
|
50
50
|
}
|
|
51
51
|
|
|
52
|
+
function makeToolUse(
|
|
53
|
+
tool_call_id: string,
|
|
54
|
+
tool_name: string,
|
|
55
|
+
tool_input: Record<string, unknown>,
|
|
56
|
+
): ContentBlock {
|
|
57
|
+
return { type: "tool_use", tool_call_id, tool_name, tool_input };
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
function makeToolResult(
|
|
61
|
+
tool_call_id: string,
|
|
62
|
+
is_error: boolean,
|
|
63
|
+
result_content: string,
|
|
64
|
+
exit_code: number | null,
|
|
65
|
+
): ContentBlock {
|
|
66
|
+
return { type: "tool_result", tool_call_id, is_error, result_content, exit_code };
|
|
67
|
+
}
|
|
68
|
+
|
|
52
69
|
describe("detectFailureModes", () => {
|
|
53
70
|
it("empty turns → no_failure", () => {
|
|
54
71
|
const result = detectFailureModes(makeTrace({ turns: [] }));
|
|
55
72
|
expect(result).toEqual(["no_failure"]);
|
|
56
73
|
});
|
|
57
74
|
|
|
58
|
-
it("
|
|
75
|
+
it("failed exchange → tool_call_failure", () => {
|
|
59
76
|
const turns = [
|
|
60
|
-
makeTurn("user", [{ type: "
|
|
77
|
+
makeTurn("user", [{ type: "text", text: "Run the tests." }]),
|
|
78
|
+
makeTurn("assistant", [
|
|
79
|
+
makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
|
|
80
|
+
makeToolResult("t1", true, "test failure", 1),
|
|
81
|
+
]),
|
|
61
82
|
];
|
|
62
83
|
const result = detectFailureModes(makeTrace({ turns }));
|
|
63
84
|
expect(result).toContain("tool_call_failure");
|
|
64
85
|
});
|
|
65
86
|
|
|
66
|
-
it("
|
|
67
|
-
const toolUse = (n: number): ContentBlock => ({
|
|
68
|
-
type: "tool_use",
|
|
69
|
-
tool_call_id: `t${n}`,
|
|
70
|
-
tool_name: "bash",
|
|
71
|
-
tool_input: {},
|
|
72
|
-
});
|
|
87
|
+
it("repeated failing root cause across three exchanges is detected", () => {
|
|
73
88
|
const turns = [
|
|
74
|
-
makeTurn("
|
|
89
|
+
makeTurn("user", [{ type: "text", text: "Run the tests." }]),
|
|
90
|
+
makeTurn("assistant", [
|
|
91
|
+
makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
|
|
92
|
+
makeToolResult("t1", true, "FAIL src/app.test.ts\nTypeError: config is undefined", 1),
|
|
93
|
+
]),
|
|
94
|
+
makeTurn("user", [{ type: "text", text: "Try again with a focused rerun." }]),
|
|
95
|
+
makeTurn("assistant", [
|
|
96
|
+
makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
|
|
97
|
+
makeToolResult("t2", true, "FAIL src/app.test.ts\nTypeError: config is undefined", 1),
|
|
98
|
+
]),
|
|
99
|
+
makeTurn("user", [{ type: "text", text: "One more focused attempt." }]),
|
|
100
|
+
makeTurn("assistant", [
|
|
101
|
+
makeToolUse("t3", "functions.exec_command", { cmd: "vitest src/app.test.ts" }),
|
|
102
|
+
makeToolResult("t3", true, "FAIL src/app.test.ts\nTypeError: config is undefined", 1),
|
|
103
|
+
]),
|
|
75
104
|
];
|
|
76
105
|
const result = detectFailureModes(makeTrace({ turns }));
|
|
77
|
-
expect(result).toContain("
|
|
106
|
+
expect(result).toContain("repeated_failing_root_cause");
|
|
107
|
+
expect(result).toContain("catastrophic_failure");
|
|
78
108
|
});
|
|
79
109
|
|
|
80
110
|
it("context window text → context_limit_approached", () => {
|
|
@@ -85,24 +115,34 @@ describe("detectFailureModes", () => {
|
|
|
85
115
|
expect(result).toContain("context_limit_approached");
|
|
86
116
|
});
|
|
87
117
|
|
|
88
|
-
it("final
|
|
89
|
-
const errResult: ContentBlock = { type: "tool_result", tool_call_id: "t1", is_error: true, result_content: "fail", exit_code: 1 };
|
|
118
|
+
it("final failed exchange → catastrophic_failure", () => {
|
|
90
119
|
const turns = [
|
|
91
|
-
makeTurn("user", [
|
|
92
|
-
makeTurn("
|
|
120
|
+
makeTurn("user", [{ type: "text", text: "Run the failing command." }]),
|
|
121
|
+
makeTurn("assistant", [
|
|
122
|
+
makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
|
|
123
|
+
makeToolResult("t1", true, "fail", 1),
|
|
124
|
+
]),
|
|
93
125
|
];
|
|
94
126
|
const result = detectFailureModes(makeTrace({ turns }));
|
|
95
127
|
expect(result).toContain("catastrophic_failure");
|
|
96
128
|
});
|
|
97
129
|
|
|
98
|
-
it("
|
|
99
|
-
const errResult: ContentBlock = { type: "tool_result", tool_call_id: "t1", is_error: true, result_content: "fail", exit_code: 1 };
|
|
130
|
+
it("failed exchange followed by successful exchange → graceful_recovery", () => {
|
|
100
131
|
const turns = [
|
|
101
|
-
makeTurn("user", [
|
|
102
|
-
makeTurn("assistant", [
|
|
132
|
+
makeTurn("user", [{ type: "text", text: "Run the tests." }]),
|
|
133
|
+
makeTurn("assistant", [
|
|
134
|
+
makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
|
|
135
|
+
makeToolResult("t1", true, "fail", 1),
|
|
136
|
+
]),
|
|
137
|
+
makeTurn("user", [{ type: "text", text: "Try a safer command." }]),
|
|
138
|
+
makeTurn("assistant", [
|
|
139
|
+
makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
|
|
140
|
+
makeToolResult("t2", false, "ok", 0),
|
|
141
|
+
]),
|
|
103
142
|
];
|
|
104
143
|
const result = detectFailureModes(makeTrace({ turns }));
|
|
105
144
|
expect(result).toContain("graceful_recovery");
|
|
145
|
+
expect(result).toContain("tool_call_failure");
|
|
106
146
|
});
|
|
107
147
|
});
|
|
108
148
|
|
|
@@ -135,14 +175,22 @@ describe("scoreTrace", () => {
|
|
|
135
175
|
const score = scoreTrace(makeTrace({ turns: [], content_fidelity: "chat_only" }));
|
|
136
176
|
expect(score.completeness).toBe("malformed");
|
|
137
177
|
expect(score.payout_cents).toBeLessThan(100);
|
|
178
|
+
expect(score.breakdown.component_count).toBeGreaterThan(0);
|
|
138
179
|
});
|
|
139
180
|
|
|
140
181
|
it("graceful_recovery + tool_call_failure → bonuses stack", () => {
|
|
141
182
|
const baseScore = scoreTrace(makeTrace({ turns: [], content_fidelity: "full" }));
|
|
142
|
-
const errResult: ContentBlock = { type: "tool_result", tool_call_id: "t1", is_error: true, result_content: "fail", exit_code: 1 };
|
|
143
183
|
const turns = [
|
|
144
|
-
makeTurn("user", [
|
|
145
|
-
makeTurn("assistant", [
|
|
184
|
+
makeTurn("user", [{ type: "text", text: "Run the tests." }]),
|
|
185
|
+
makeTurn("assistant", [
|
|
186
|
+
makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
|
|
187
|
+
makeToolResult("t1", true, "fail", 1),
|
|
188
|
+
]),
|
|
189
|
+
makeTurn("user", [{ type: "text", text: "Try again." }]),
|
|
190
|
+
makeTurn("assistant", [
|
|
191
|
+
makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
|
|
192
|
+
makeToolResult("t2", false, "ok", 0),
|
|
193
|
+
]),
|
|
146
194
|
];
|
|
147
195
|
const bonusScore = scoreTrace(makeTrace({ turns, content_fidelity: "full" }));
|
|
148
196
|
expect(bonusScore.payout_cents).toBeGreaterThan(baseScore.payout_cents);
|
|
@@ -151,10 +199,17 @@ describe("scoreTrace", () => {
|
|
|
151
199
|
});
|
|
152
200
|
|
|
153
201
|
it("total clamps to [0, 1]", () => {
|
|
154
|
-
const errResult: ContentBlock = { type: "tool_result", tool_call_id: "t1", is_error: true, result_content: "fail", exit_code: 1 };
|
|
155
202
|
const turns = [
|
|
156
|
-
makeTurn("user", [
|
|
157
|
-
makeTurn("assistant", [
|
|
203
|
+
makeTurn("user", [{ type: "text", text: "Run the tests." }]),
|
|
204
|
+
makeTurn("assistant", [
|
|
205
|
+
makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
|
|
206
|
+
makeToolResult("t1", true, "fail", 1),
|
|
207
|
+
]),
|
|
208
|
+
makeTurn("user", [{ type: "text", text: "Try again." }]),
|
|
209
|
+
makeTurn("assistant", [
|
|
210
|
+
makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
|
|
211
|
+
makeToolResult("t2", false, "ok", 0),
|
|
212
|
+
]),
|
|
158
213
|
];
|
|
159
214
|
const score = scoreTrace(makeTrace({ turns, content_fidelity: "full", total_input_tokens: 1000000, total_output_tokens: 1000000 }));
|
|
160
215
|
expect(score.total).toBeGreaterThanOrEqual(0);
|
|
@@ -170,4 +225,74 @@ describe("scoreTrace", () => {
|
|
|
170
225
|
const expected = Math.min(500, Math.round(score.total * 500));
|
|
171
226
|
expect(score.payout_cents).toBe(expected);
|
|
172
227
|
});
|
|
228
|
+
|
|
229
|
+
it("uses async label and novelty context when provided", () => {
|
|
230
|
+
const turns = [
|
|
231
|
+
makeTurn("user", [{ type: "text", text: "Run the tests." }]),
|
|
232
|
+
makeTurn("assistant", [
|
|
233
|
+
makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
|
|
234
|
+
makeToolResult("t1", true, "fail", 1),
|
|
235
|
+
]),
|
|
236
|
+
makeTurn("user", [{ type: "text", text: "Try again with a fix." }]),
|
|
237
|
+
makeTurn("assistant", [
|
|
238
|
+
makeToolUse("t2", "functions.exec_command", { cmd: "pnpm test --runInBand" }),
|
|
239
|
+
makeToolResult("t2", false, "ok", 0),
|
|
240
|
+
]),
|
|
241
|
+
];
|
|
242
|
+
|
|
243
|
+
const baseScore = scoreTrace(makeTrace({ turns, content_fidelity: "full" }));
|
|
244
|
+
const enrichedScore = scoreTrace(
|
|
245
|
+
makeTrace({ turns, content_fidelity: "full" }),
|
|
246
|
+
undefined,
|
|
247
|
+
{
|
|
248
|
+
anomaly_score: 2.4,
|
|
249
|
+
workflow_shape: "shell_and_editor",
|
|
250
|
+
length_bucket: "medium",
|
|
251
|
+
tool_density: "medium",
|
|
252
|
+
failure_judge_verdict: "confirmed_failure",
|
|
253
|
+
failure_judge_agreement: "agree",
|
|
254
|
+
failure_judge_confidence: 0.94,
|
|
255
|
+
},
|
|
256
|
+
);
|
|
257
|
+
|
|
258
|
+
expect(enrichedScore.total).toBeGreaterThan(baseScore.total);
|
|
259
|
+
expect(enrichedScore.breakdown.components.some((component) => component.key === "novelty")).toBe(true);
|
|
260
|
+
expect(enrichedScore.breakdown.components.some((component) => component.key === "failure_judge")).toBe(true);
|
|
261
|
+
});
|
|
262
|
+
|
|
263
|
+
it("respects failure mode overrides from downstream labels", () => {
|
|
264
|
+
const turns = [
|
|
265
|
+
makeTurn("user", [{ type: "text", text: "Run the tests." }]),
|
|
266
|
+
makeTurn("assistant", [
|
|
267
|
+
makeToolUse("t1", "functions.exec_command", { cmd: "pnpm test" }),
|
|
268
|
+
makeToolResult("t1", true, "fail", 1),
|
|
269
|
+
]),
|
|
270
|
+
];
|
|
271
|
+
|
|
272
|
+
const baseScore = scoreTrace(makeTrace({ turns, content_fidelity: "full" }));
|
|
273
|
+
const overriddenScore = scoreTrace(
|
|
274
|
+
makeTrace({ turns, content_fidelity: "full" }),
|
|
275
|
+
undefined,
|
|
276
|
+
{
|
|
277
|
+
failure_modes_override: ["no_failure"],
|
|
278
|
+
failure_judge_verdict: "false_positive",
|
|
279
|
+
failure_judge_agreement: "disagree",
|
|
280
|
+
failure_judge_confidence: 0.9,
|
|
281
|
+
},
|
|
282
|
+
);
|
|
283
|
+
|
|
284
|
+
expect(baseScore.failure_modes).toContain("tool_call_failure");
|
|
285
|
+
expect(overriddenScore.failure_modes).toEqual(["no_failure"]);
|
|
286
|
+
expect(overriddenScore.total).toBeLessThan(baseScore.total);
|
|
287
|
+
});
|
|
288
|
+
});
|
|
289
|
+
|
|
290
|
+
describe("deriveQualityTier", () => {
|
|
291
|
+
it("applies the new score band thresholds", () => {
|
|
292
|
+
expect(deriveQualityTier(0.2)).toBe("bronze");
|
|
293
|
+
expect(deriveQualityTier(0.63)).toBe("bronze");
|
|
294
|
+
expect(deriveQualityTier(0.64)).toBe("silver");
|
|
295
|
+
expect(deriveQualityTier(0.81)).toBe("silver");
|
|
296
|
+
expect(deriveQualityTier(0.82)).toBe("gold");
|
|
297
|
+
});
|
|
173
298
|
});
|