@jean.gnc/harness-kit 0.12.8 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -0
- package/dist/cli.js +13 -1
- package/dist/cli.js.map +1 -1
- package/dist/eval/capture.d.ts +23 -0
- package/dist/eval/capture.d.ts.map +1 -0
- package/dist/eval/capture.js +79 -0
- package/dist/eval/capture.js.map +1 -0
- package/dist/eval/cases.d.ts +10 -2
- package/dist/eval/cases.d.ts.map +1 -1
- package/dist/eval/cases.js +9 -3
- package/dist/eval/cases.js.map +1 -1
- package/dist/eval/detect.d.ts +1 -0
- package/dist/eval/detect.d.ts.map +1 -1
- package/dist/eval/detect.js +1 -1
- package/dist/eval/detect.js.map +1 -1
- package/dist/eval/grade-deterministic.d.ts +9 -0
- package/dist/eval/grade-deterministic.d.ts.map +1 -0
- package/dist/eval/grade-deterministic.js +87 -0
- package/dist/eval/grade-deterministic.js.map +1 -0
- package/dist/eval/grade-judge.d.ts +12 -0
- package/dist/eval/grade-judge.d.ts.map +1 -0
- package/dist/eval/grade-judge.js +14 -0
- package/dist/eval/grade-judge.js.map +1 -0
- package/dist/eval/grade.d.ts +5 -0
- package/dist/eval/grade.d.ts.map +1 -0
- package/dist/eval/grade.js +25 -0
- package/dist/eval/grade.js.map +1 -0
- package/dist/eval/index.d.ts +4 -0
- package/dist/eval/index.d.ts.map +1 -1
- package/dist/eval/index.js +27 -5
- package/dist/eval/index.js.map +1 -1
- package/dist/eval/judge.d.ts +26 -0
- package/dist/eval/judge.d.ts.map +1 -0
- package/dist/eval/judge.js +55 -0
- package/dist/eval/judge.js.map +1 -0
- package/dist/eval/report.d.ts +5 -1
- package/dist/eval/report.d.ts.map +1 -1
- package/dist/eval/report.js +66 -13
- package/dist/eval/report.js.map +1 -1
- package/dist/eval/runner.d.ts +13 -5
- package/dist/eval/runner.d.ts.map +1 -1
- package/dist/eval/runner.js +105 -31
- package/dist/eval/runner.js.map +1 -1
- package/dist/eval/schema.d.ts +644 -29
- package/dist/eval/schema.d.ts.map +1 -1
- package/dist/eval/schema.js +57 -6
- package/dist/eval/schema.js.map +1 -1
- package/dist/eval/score.d.ts +8 -0
- package/dist/eval/score.d.ts.map +1 -1
- package/dist/eval/score.js +17 -0
- package/dist/eval/score.js.map +1 -1
- package/package.json +2 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAyBxB,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAKf,CAAC;AAEH,QAAA,MAAM,
|
|
1
|
+
{"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAyBxB,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAKf,CAAC;AAEH,QAAA,MAAM,SAAS;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAmBb,CAAC;AAOH,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;IAGf,CAAC;AAEH,QAAA,MAAM,MAAM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAGV,CAAC;AAWH,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAAkE,CAAC;AAEpF,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAON,CAAC;AAEZ,eAAO,MAAM,KAAK,iCAAkC,CAAC;AAcrD,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAA2D,CAAC;AAEvF,MAAM,MAAM,IAAI,GAAG,CAAC,OAAO,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC;AAC1C,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,SAAS,CAAC,CAAC;AAClD,MAAM,MAAM,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,MAAM,CAAC,CAAC;AAC5C,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,WAAW,GAAG,WAAW,CAAC;AACjD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAEtD,wBAAgB,cAAc,CAAC,WAAW,EAAE,WAAW,GAAG,SAAS,MAAM,EAAE,CAM1E;AAED,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,SAAS,MAAM,EAAE,CAGxE"}
|
package/dist/eval/schema.js
CHANGED
|
@@ -22,21 +22,67 @@ const Expectation = z.union([
|
|
|
22
22
|
PathExpectation,
|
|
23
23
|
NoSkillExpectation,
|
|
24
24
|
]);
|
|
25
|
-
const
|
|
25
|
+
const Assertion = z.discriminatedUnion("kind", [
|
|
26
|
+
z.object({
|
|
27
|
+
kind: z.literal("outputMatches"),
|
|
28
|
+
pattern: z.string().min(1),
|
|
29
|
+
regex: z.boolean().default(false),
|
|
30
|
+
}),
|
|
31
|
+
z.object({
|
|
32
|
+
kind: z.literal("outputExcludes"),
|
|
33
|
+
pattern: z.string().min(1),
|
|
34
|
+
regex: z.boolean().default(false),
|
|
35
|
+
}),
|
|
36
|
+
z.object({ kind: z.literal("usedTool"), tool: z.string().min(1) }),
|
|
37
|
+
z.object({ kind: z.literal("didNotUseTool"), tool: z.string().min(1) }),
|
|
38
|
+
z.object({
|
|
39
|
+
kind: z.literal("wroteFile"),
|
|
40
|
+
path: z.string().min(1),
|
|
41
|
+
contentMatches: z.string().min(1).optional(),
|
|
42
|
+
regex: z.boolean().default(false),
|
|
43
|
+
}),
|
|
44
|
+
]);
|
|
45
|
+
const Dimension = z.object({
|
|
46
|
+
dimension: z.string().min(1),
|
|
47
|
+
criterion: z.string().min(1),
|
|
48
|
+
});
|
|
49
|
+
const CombineRule = z.discriminatedUnion("combine", [
|
|
50
|
+
z.object({ combine: z.literal("all") }),
|
|
51
|
+
z.object({ combine: z.literal("fraction"), threshold: z.number().min(0).max(1) }),
|
|
52
|
+
]);
|
|
53
|
+
const Rubric = z.object({
|
|
54
|
+
dimensions: z.array(Dimension).min(1),
|
|
55
|
+
combine: CombineRule.default({ combine: "all" }),
|
|
56
|
+
});
|
|
57
|
+
const CommonCaseFields = {
|
|
26
58
|
id: z.string().min(1),
|
|
27
59
|
prompt: z.string().min(1),
|
|
28
|
-
expect: Expectation,
|
|
29
60
|
cwd: z.string().optional(),
|
|
30
61
|
runs: z.number().int().positive().optional(),
|
|
31
62
|
threshold: z.number().min(0).max(1).optional(),
|
|
32
63
|
note: z.string().optional(),
|
|
33
|
-
}
|
|
64
|
+
};
|
|
65
|
+
const RoutingCase = z.object({ ...CommonCaseFields, expect: Expectation }).strict();
|
|
66
|
+
const SolvingCase = z
|
|
67
|
+
.object({
|
|
68
|
+
...CommonCaseFields,
|
|
69
|
+
expectSkill: FqId.optional(),
|
|
70
|
+
assert: z.array(Assertion).default([]),
|
|
71
|
+
rubric: Rubric.optional(),
|
|
72
|
+
})
|
|
73
|
+
.strict();
|
|
34
74
|
export const TIERS = ["routing", "solving"];
|
|
35
|
-
|
|
75
|
+
const RoutingFile = z.object({
|
|
76
|
+
suite: z.string().min(1),
|
|
77
|
+
tier: z.literal("routing"),
|
|
78
|
+
cases: z.array(RoutingCase).min(1),
|
|
79
|
+
});
|
|
80
|
+
const SolvingFile = z.object({
|
|
36
81
|
suite: z.string().min(1),
|
|
37
|
-
tier: z.
|
|
38
|
-
cases: z.array(
|
|
82
|
+
tier: z.literal("solving"),
|
|
83
|
+
cases: z.array(SolvingCase).min(1),
|
|
39
84
|
});
|
|
85
|
+
export const CaseFileSchema = z.discriminatedUnion("tier", [RoutingFile, SolvingFile]);
|
|
40
86
|
export function expectedSkills(expectation) {
|
|
41
87
|
if ("noSkill" in expectation)
|
|
42
88
|
return [];
|
|
@@ -47,4 +93,9 @@ export function expectedSkills(expectation) {
|
|
|
47
93
|
return [...expectation.anyOf, ...forbidden];
|
|
48
94
|
return [...expectation.path, ...forbidden];
|
|
49
95
|
}
|
|
96
|
+
export function caseExpectedSkills(evalCase) {
|
|
97
|
+
if ("expect" in evalCase)
|
|
98
|
+
return expectedSkills(evalCase.expect);
|
|
99
|
+
return evalCase.expectSkill ? [evalCase.expectSkill] : [];
|
|
100
|
+
}
|
|
50
101
|
//# sourceMappingURL=schema.js.map
|
package/dist/eval/schema.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,EAAE,KAAK,EAAE,MAAM,WAAW,CAAC;AAElC,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,EAAE,4BAA4B,CAAC,CAAC;AAEnE,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,IAAI;IACX,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC3B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,kBAAkB,GAAG,CAAC,CAAC,MAAM,CAAC;IAClC,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC;CACzB,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC;IAC1B,gBAAgB;IAChB,gBAAgB;IAChB,eAAe;IACf,kBAAkB;CACnB,CAAC,CAAC;AAEH,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,EAAE,KAAK,EAAE,MAAM,WAAW,CAAC;AAElC,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,EAAE,4BAA4B,CAAC,CAAC;AAEnE,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,IAAI;IACX,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC3B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,kBAAkB,GAAG,CAAC,CAAC,MAAM,CAAC;IAClC,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC;CACzB,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC;IAC1B,gBAAgB;IAChB,gBAAgB;IAChB,eAAe;IACf,kBAAkB;CACnB,CAAC,CAAC;AAEH,MAAM,SAAS,GAAG,CAAC,CAAC,kBAAkB,CAAC,MAAM,EAAE;IAC7C,CAAC,CAAC,MAAM,CAAC;QACP,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,eAAe,CAAC;QAChC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1B,KAAK,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC;KAClC,CAAC;IACF,CAAC,CAAC,MAAM,CAAC;QACP,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,gBAAgB,CAAC;QACjC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1B,KAAK,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC;KAClC,CAAC;IACF,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;IAClE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,eAAe,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;IACvE,CAAC,CAAC,MAAM,CAAC;QACP,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,WAAW,CAAC;QAC5B,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QACvB,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;QAC5C,KAAK,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC;KAClC,CAAC;CACH,CAAC,CAAC;AAEH,MAAM,SAAS,GAAG,CAAC,CAAC,MAAM,CAAC;IACzB,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5B,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;CAC7B,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAC,kBAAkB,CAAC,SAAS,EAAE;IAClD,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;CAClF,CAAC,CAAC;AAEH,MAAM,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC;IACtB,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACrC,OAAO,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;CACjD,CAAC,CAAC;AAEH,MAAM,gBAAgB,GAAG;IACvB,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACrB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACzB,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC1B,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IAC5C,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC9C,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;CAC5B,CAAC;AAEF,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,gBAAgB,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC;AAEpF,MAAM,WAAW,GAAG,CAAC;KAClB,MAAM,CAAC;IACN,GAAG,gBAAgB;IACnB,WAAW,EAAE,IAAI,CAAC,QAAQ,EAAE;IAC5B,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC;IACtC,MAAM,EAAE,MAAM,CAAC,QAAQ,EAAE;CAC1B,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,MAAM,CAAC,MAAM,KAAK,GAAG,CAAC,SAAS,EAAE,SAAS,CAAU,CAAC;AAErD,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC;IAC3B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACxB,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC;IAC1B,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CACnC,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC;IAC3B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACxB,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC;IAC1B,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CACnC,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,kBAAkB,CAAC,MAAM,EAAE,CAAC,WAAW,EAAE,WAAW,CAAC,CAAC,CAAC;AAYvF,MAAM,UAAU,cAAc,CAAC,WAAwB;IACrD,IAAI,SAAS,IAAI,WAAW;QAAE,OAAO,EAAE,CAAC;IACxC,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,IAAI,EAAE,CAAC;IACxC,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,CAAC,WAAW,CAAC,KAAK,EAAE,GAAG,SAAS,CAAC,CAAC;IACrE,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,CAAC,GAAG,WAAW,CAAC,KAAK,EAAE,GAAG,SAAS,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,WAAW,CAAC,IAAI,EAAE,GAAG,SAAS,CAAC,CAAC;AAC7C,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,QAAkB;IACnD,IAAI,QAAQ,IAAI,QAAQ;QAAE,OAAO,cAAc,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IACjE,OAAO,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;AAC5D,CAAC"}
|
package/dist/eval/score.d.ts
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import type { DetectionResult } from "./detect.js";
|
|
2
|
+
import type { AssertionResult } from "./grade-deterministic.js";
|
|
3
|
+
import type { RubricResult } from "./grade-judge.js";
|
|
2
4
|
import type { Expectation } from "./schema.js";
|
|
3
5
|
export interface CaseScore {
|
|
4
6
|
readonly matched: number;
|
|
@@ -10,4 +12,10 @@ export interface CaseScore {
|
|
|
10
12
|
}
|
|
11
13
|
export declare function matchesExpectation(expectation: Expectation, run: DetectionResult): boolean;
|
|
12
14
|
export declare function scoreCase(expectation: Expectation, runs: readonly DetectionResult[], threshold?: number): CaseScore;
|
|
15
|
+
export interface SolvingRunResult {
|
|
16
|
+
readonly assertions: readonly AssertionResult[];
|
|
17
|
+
readonly rubric: RubricResult | null;
|
|
18
|
+
}
|
|
19
|
+
export declare function solvingRunPassed(run: SolvingRunResult): boolean;
|
|
20
|
+
export declare function scoreSolving(perRun: readonly SolvingRunResult[], threshold?: number): CaseScore;
|
|
13
21
|
//# sourceMappingURL=score.d.ts.map
|
package/dist/eval/score.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"score.d.ts","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACnD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAI/C,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACjD;AAED,wBAAgB,kBAAkB,CAAC,WAAW,EAAE,WAAW,EAAE,GAAG,EAAE,eAAe,GAAG,OAAO,CAa1F;AAED,wBAAgB,SAAS,CACvB,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,SAAS,eAAe,EAAE,EAChC,SAAS,SAAoB,GAC5B,SAAS,CAWX"}
|
|
1
|
+
{"version":3,"file":"score.d.ts","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAChE,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAI/C,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACjD;AAED,wBAAgB,kBAAkB,CAAC,WAAW,EAAE,WAAW,EAAE,GAAG,EAAE,eAAe,GAAG,OAAO,CAa1F;AAED,wBAAgB,SAAS,CACvB,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,SAAS,eAAe,EAAE,EAChC,SAAS,SAAoB,GAC5B,SAAS,CAWX;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,UAAU,EAAE,SAAS,eAAe,EAAE,CAAC;IAChD,QAAQ,CAAC,MAAM,EAAE,YAAY,GAAG,IAAI,CAAC;CACtC;AAED,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,gBAAgB,GAAG,OAAO,CAI/D;AAED,wBAAgB,YAAY,CAC1B,MAAM,EAAE,SAAS,gBAAgB,EAAE,EACnC,SAAS,SAAoB,GAC5B,SAAS,CAWX"}
|
package/dist/eval/score.js
CHANGED
|
@@ -25,6 +25,23 @@ export function scoreCase(expectation, runs, threshold = DEFAULT_THRESHOLD) {
|
|
|
25
25
|
histogram: histogramOf(runs),
|
|
26
26
|
};
|
|
27
27
|
}
|
|
28
|
+
export function solvingRunPassed(run) {
|
|
29
|
+
const assertionsPass = run.assertions.every((a) => a.pass);
|
|
30
|
+
const rubricPass = run.rubric === null || run.rubric.pass;
|
|
31
|
+
return assertionsPass && rubricPass;
|
|
32
|
+
}
|
|
33
|
+
export function scoreSolving(perRun, threshold = DEFAULT_THRESHOLD) {
|
|
34
|
+
const matched = perRun.filter(solvingRunPassed).length;
|
|
35
|
+
const triggerRate = perRun.length === 0 ? 0 : matched / perRun.length;
|
|
36
|
+
return {
|
|
37
|
+
matched,
|
|
38
|
+
runs: perRun.length,
|
|
39
|
+
triggerRate,
|
|
40
|
+
threshold,
|
|
41
|
+
pass: triggerRate >= threshold,
|
|
42
|
+
histogram: new Map(),
|
|
43
|
+
};
|
|
44
|
+
}
|
|
28
45
|
function violatesNot(expectation, run) {
|
|
29
46
|
const forbidden = "noSkill" in expectation ? undefined : expectation.not;
|
|
30
47
|
if (!forbidden || forbidden.length === 0)
|
package/dist/eval/score.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"score.js","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"score.js","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAKA,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAW9B,MAAM,UAAU,kBAAkB,CAAC,WAAwB,EAAE,GAAoB;IAC/E,IAAI,WAAW,CAAC,WAAW,EAAE,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IAEhD,IAAI,SAAS,IAAI,WAAW,EAAE,CAAC;QAC7B,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,CAAC;IACjC,CAAC;IACD,IAAI,OAAO,IAAI,WAAW,EAAE,CAAC;QAC3B,OAAO,GAAG,CAAC,UAAU,KAAK,WAAW,CAAC,KAAK,CAAC;IAC9C,CAAC;IACD,IAAI,OAAO,IAAI,WAAW,EAAE,CAAC;QAC3B,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,IAAI,WAAW,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAC/E,CAAC;IACD,OAAO,oBAAoB,CAAC,WAAW,CAAC,IAAI,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;AAC9D,CAAC;AAED,MAAM,UAAU,SAAS,CACvB,WAAwB,EACxB,IAAgC,EAChC,SAAS,GAAG,iBAAiB;IAE7B,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,kBAAkB,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;IAClF,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC;IAClE,OAAO;QACL,OAAO;QACP,IAAI,EAAE,IAAI,CAAC,MAAM;QACjB,WAAW;QACX,SAAS;QACT,IAAI,EAAE,WAAW,IAAI,SAAS;QAC9B,SAAS,EAAE,WAAW,CAAC,IAAI,CAAC;KAC7B,CAAC;AACJ,CAAC;AAOD,MAAM,UAAU,gBAAgB,CAAC,GAAqB;IACpD,MAAM,cAAc,GAAG,GAAG,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAC3D,MAAM,UAAU,GAAG,GAAG,CAAC,MAAM,KAAK,IAAI,IAAI,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;IAC1D,OAAO,cAAc,IAAI,UAAU,CAAC;AACtC,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,MAAmC,EACnC,SAAS,GAAG,iBAAiB;IAE7B,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC,MAAM,CAAC;IACvD,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;IACtE,OAAO;QACL,OAAO;QACP,IAAI,EAAE,MAAM,CAAC,MAAM;QACnB,WAAW;QACX,SAAS;QACT,IAAI,EAAE,WAAW,IAAI,SAAS;QAC9B,SAAS,EAAE,IAAI,GAAG,EAAE;KACrB,CAAC;AACJ,CAAC;AAED,SAAS,WAAW,CAAC,WAAwB,EAAE,GAAoB;IACjE,MAAM,SAAS,GAAG,SAAS,IAAI,WAAW,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC;IACzE,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACvD,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;AACvE,CAAC;AAED,SAAS,oBAAoB,CAAC,MAAyB,EAAE,QAA2B;IAClF,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,EAAE,KAAK,MAAM,CAAC,MAAM,CAAC;YAAE,MAAM,IAAI,CAAC,CAAC;QACvC,IAAI,MAAM,KAAK,MAAM,CAAC,MAAM;YAAE,OAAO,IAAI,CAAC;IAC5C,CAAC;IACD,OAAO,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC;AAClC,CAAC;AAED,SAAS,WAAW,CAAC,IAAgC;IACnD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,GAAG,CAAC,UAAU,IAAI,YAAY,CAAC;QAC3C,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jean.gnc/harness-kit",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.13.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Build your own multi-agent harness: typed toolkit for authoring plugins (skills, agents, commands, hooks) and shipping them to Claude Code and Codex from a single source tree.",
|
|
6
6
|
"license": "MIT",
|
|
@@ -68,6 +68,7 @@
|
|
|
68
68
|
]
|
|
69
69
|
},
|
|
70
70
|
"dependencies": {
|
|
71
|
+
"@anthropic-ai/sdk": "^0.100.1",
|
|
71
72
|
"citty": "^0.2.2",
|
|
72
73
|
"js-yaml": "^4.1.1",
|
|
73
74
|
"markdownlint-cli2": "^0.18.1",
|