@jean.gnc/harness-kit 0.12.8 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +43 -0
  2. package/dist/cli.js +13 -1
  3. package/dist/cli.js.map +1 -1
  4. package/dist/eval/capture.d.ts +23 -0
  5. package/dist/eval/capture.d.ts.map +1 -0
  6. package/dist/eval/capture.js +79 -0
  7. package/dist/eval/capture.js.map +1 -0
  8. package/dist/eval/cases.d.ts +10 -2
  9. package/dist/eval/cases.d.ts.map +1 -1
  10. package/dist/eval/cases.js +9 -3
  11. package/dist/eval/cases.js.map +1 -1
  12. package/dist/eval/detect.d.ts +1 -0
  13. package/dist/eval/detect.d.ts.map +1 -1
  14. package/dist/eval/detect.js +1 -1
  15. package/dist/eval/detect.js.map +1 -1
  16. package/dist/eval/grade-deterministic.d.ts +9 -0
  17. package/dist/eval/grade-deterministic.d.ts.map +1 -0
  18. package/dist/eval/grade-deterministic.js +87 -0
  19. package/dist/eval/grade-deterministic.js.map +1 -0
  20. package/dist/eval/grade-judge.d.ts +12 -0
  21. package/dist/eval/grade-judge.d.ts.map +1 -0
  22. package/dist/eval/grade-judge.js +14 -0
  23. package/dist/eval/grade-judge.js.map +1 -0
  24. package/dist/eval/grade.d.ts +5 -0
  25. package/dist/eval/grade.d.ts.map +1 -0
  26. package/dist/eval/grade.js +25 -0
  27. package/dist/eval/grade.js.map +1 -0
  28. package/dist/eval/index.d.ts +4 -0
  29. package/dist/eval/index.d.ts.map +1 -1
  30. package/dist/eval/index.js +27 -5
  31. package/dist/eval/index.js.map +1 -1
  32. package/dist/eval/judge.d.ts +26 -0
  33. package/dist/eval/judge.d.ts.map +1 -0
  34. package/dist/eval/judge.js +55 -0
  35. package/dist/eval/judge.js.map +1 -0
  36. package/dist/eval/report.d.ts +5 -1
  37. package/dist/eval/report.d.ts.map +1 -1
  38. package/dist/eval/report.js +66 -13
  39. package/dist/eval/report.js.map +1 -1
  40. package/dist/eval/runner.d.ts +13 -5
  41. package/dist/eval/runner.d.ts.map +1 -1
  42. package/dist/eval/runner.js +105 -31
  43. package/dist/eval/runner.js.map +1 -1
  44. package/dist/eval/schema.d.ts +644 -29
  45. package/dist/eval/schema.d.ts.map +1 -1
  46. package/dist/eval/schema.js +57 -6
  47. package/dist/eval/schema.js.map +1 -1
  48. package/dist/eval/score.d.ts +8 -0
  49. package/dist/eval/score.d.ts.map +1 -1
  50. package/dist/eval/score.js +17 -0
  51. package/dist/eval/score.js.map +1 -1
  52. package/package.json +2 -1
@@ -1 +1 @@
1
- {"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAyBxB,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAKf,CAAC;AAEH,QAAA,MAAM,IAAI;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAQR,CAAC;AAEH,eAAO,MAAM,KAAK,iCAAkC,CAAC;AAErD,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAIzB,CAAC;AAEH,MAAM,MAAM,IAAI,GAAG,CAAC,OAAO,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC;AAC1C,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,IAAI,CAAC,CAAC;AAC5C,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAEtD,wBAAgB,cAAc,CAAC,WAAW,EAAE,WAAW,GAAG,SAAS,MAAM,EAAE,CAM1E"}
1
+ {"version":3,"file":"schema.d.ts","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAyBxB,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAKf,CAAC;AAEH,QAAA,MAAM,SAAS;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAmBb,CAAC;AAOH,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;IAGf,CAAC;AAEH,QAAA,MAAM,MAAM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAGV,CAAC;AAWH,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAAkE,CAAC;AAEpF,QAAA,MAAM,WAAW;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAON,CAAC;AAEZ,eAAO,MAAM,KAAK,iCAAkC,CAAC;AAcrD,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;IAA2D,CAAC;AAEvF,MAAM,MAAM,IAAI,GAAG,CAAC,OAAO,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC;AAC1C,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,SAAS,CAAC,CAAC;AAClD,MAAM,MAAM,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,MAAM,CAAC,CAAC;AAC5C,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AACtD,MAAM,MAAM,QAAQ,GAAG,WAAW,GAAG,WAAW,CAAC;AACjD,MAAM,MAAM,QAAQ,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC,CAAC;AAEtD,wBAAgB,cAAc,CAAC,WAAW,EAAE,WAAW,GAAG,SAAS,MAAM,EAAE,CAM1E;AAED,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,QAAQ,GAAG,SAAS,MAAM,EAAE,CAGxE"}
@@ -22,21 +22,67 @@ const Expectation = z.union([
22
22
  PathExpectation,
23
23
  NoSkillExpectation,
24
24
  ]);
25
- const Case = z.object({
25
+ const Assertion = z.discriminatedUnion("kind", [
26
+ z.object({
27
+ kind: z.literal("outputMatches"),
28
+ pattern: z.string().min(1),
29
+ regex: z.boolean().default(false),
30
+ }),
31
+ z.object({
32
+ kind: z.literal("outputExcludes"),
33
+ pattern: z.string().min(1),
34
+ regex: z.boolean().default(false),
35
+ }),
36
+ z.object({ kind: z.literal("usedTool"), tool: z.string().min(1) }),
37
+ z.object({ kind: z.literal("didNotUseTool"), tool: z.string().min(1) }),
38
+ z.object({
39
+ kind: z.literal("wroteFile"),
40
+ path: z.string().min(1),
41
+ contentMatches: z.string().min(1).optional(),
42
+ regex: z.boolean().default(false),
43
+ }),
44
+ ]);
45
+ const Dimension = z.object({
46
+ dimension: z.string().min(1),
47
+ criterion: z.string().min(1),
48
+ });
49
+ const CombineRule = z.discriminatedUnion("combine", [
50
+ z.object({ combine: z.literal("all") }),
51
+ z.object({ combine: z.literal("fraction"), threshold: z.number().min(0).max(1) }),
52
+ ]);
53
+ const Rubric = z.object({
54
+ dimensions: z.array(Dimension).min(1),
55
+ combine: CombineRule.default({ combine: "all" }),
56
+ });
57
+ const CommonCaseFields = {
26
58
  id: z.string().min(1),
27
59
  prompt: z.string().min(1),
28
- expect: Expectation,
29
60
  cwd: z.string().optional(),
30
61
  runs: z.number().int().positive().optional(),
31
62
  threshold: z.number().min(0).max(1).optional(),
32
63
  note: z.string().optional(),
33
- });
64
+ };
65
+ const RoutingCase = z.object({ ...CommonCaseFields, expect: Expectation }).strict();
66
+ const SolvingCase = z
67
+ .object({
68
+ ...CommonCaseFields,
69
+ expectSkill: FqId.optional(),
70
+ assert: z.array(Assertion).default([]),
71
+ rubric: Rubric.optional(),
72
+ })
73
+ .strict();
34
74
  export const TIERS = ["routing", "solving"];
35
- export const CaseFileSchema = z.object({
75
+ const RoutingFile = z.object({
76
+ suite: z.string().min(1),
77
+ tier: z.literal("routing"),
78
+ cases: z.array(RoutingCase).min(1),
79
+ });
80
+ const SolvingFile = z.object({
36
81
  suite: z.string().min(1),
37
- tier: z.enum(TIERS),
38
- cases: z.array(Case).min(1),
82
+ tier: z.literal("solving"),
83
+ cases: z.array(SolvingCase).min(1),
39
84
  });
85
+ export const CaseFileSchema = z.discriminatedUnion("tier", [RoutingFile, SolvingFile]);
40
86
  export function expectedSkills(expectation) {
41
87
  if ("noSkill" in expectation)
42
88
  return [];
@@ -47,4 +93,9 @@ export function expectedSkills(expectation) {
47
93
  return [...expectation.anyOf, ...forbidden];
48
94
  return [...expectation.path, ...forbidden];
49
95
  }
96
+ export function caseExpectedSkills(evalCase) {
97
+ if ("expect" in evalCase)
98
+ return expectedSkills(evalCase.expect);
99
+ return evalCase.expectSkill ? [evalCase.expectSkill] : [];
100
+ }
50
101
  //# sourceMappingURL=schema.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,EAAE,KAAK,EAAE,MAAM,WAAW,CAAC;AAElC,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,EAAE,4BAA4B,CAAC,CAAC;AAEnE,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,IAAI;IACX,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC3B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,kBAAkB,GAAG,CAAC,CAAC,MAAM,CAAC;IAClC,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC;CACzB,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC;IAC1B,gBAAgB;IAChB,gBAAgB;IAChB,eAAe;IACf,kBAAkB;CACnB,CAAC,CAAC;AAEH,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,CAAC;IACpB,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACrB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACzB,MAAM,EAAE,WAAW;IACnB,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC1B,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IAC5C,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC9C,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;CAC5B,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,KAAK,GAAG,CAAC,SAAS,EAAE,SAAS,CAAU,CAAC;AAErD,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IACrC,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACxB,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC;IACnB,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CAC5B,CAAC,CAAC;AAOH,MAAM,UAAU,cAAc,CAAC,WAAwB;IACrD,IAAI,SAAS,IAAI,WAAW;QAAE,OAAO,EAAE,CAAC;IACxC,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,IAAI,EAAE,CAAC;IACxC,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,CAAC,WAAW,CAAC,KAAK,EAAE,GAAG,SAAS,CAAC,CAAC;IACrE,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,CAAC,GAAG,WAAW,CAAC,KAAK,EAAE,GAAG,SAAS,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,WAAW,CAAC,IAAI,EAAE,GAAG,SAAS,CAAC,CAAC;AAC7C,CAAC"}
1
+ {"version":3,"file":"schema.js","sourceRoot":"","sources":["../../src/eval/schema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,EAAE,KAAK,EAAE,MAAM,WAAW,CAAC;AAElC,MAAM,IAAI,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,KAAK,CAAC,KAAK,EAAE,4BAA4B,CAAC,CAAC;AAEnE,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,IAAI;IACX,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC3B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1B,GAAG,EAAE,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE;CAC9B,CAAC,CAAC;AAEH,MAAM,kBAAkB,GAAG,CAAC,CAAC,MAAM,CAAC;IAClC,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC;CACzB,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAC,KAAK,CAAC;IAC1B,gBAAgB;IAChB,gBAAgB;IAChB,eAAe;IACf,kBAAkB;CACnB,CAAC,CAAC;AAEH,MAAM,SAAS,GAAG,CAAC,CAAC,kBAAkB,CAAC,MAAM,EAAE;IAC7C,CAAC,CAAC,MAAM,CAAC;QACP,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,eAAe,CAAC;QAChC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1B,KAAK,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC;KAClC,CAAC;IACF,CAAC,CAAC,MAAM,CAAC;QACP,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,gBAAgB,CAAC;QACjC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QAC1B,KAAK,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC;KAClC,CAAC;IACF,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;IAClE,CAAC,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,eAAe,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;IACvE,CAAC,CAAC,MAAM,CAAC;QACP,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,WAAW,CAAC;QAC5B,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;QACvB,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;QAC5C,KAAK,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC;KAClC,CAAC;CACH,CAAC,CAAC;AAEH,MAAM,SAAS,GAAG,CAAC,CAAC,MAAM,CAAC;IACzB,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IAC5B,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;CAC7B,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAC,kBAAkB,CAAC,SAAS,EAAE;IAClD,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;IACvC,CAAC,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;CAClF,CAAC,CAAC;AAEH,MAAM,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC;IACtB,UAAU,EAAE,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACrC,OAAO,EAAE,WAAW,CAAC,OAAO,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;CACjD,CAAC,CAAC;AAEH,MAAM,gBAAgB,GAAG;IACvB,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACrB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACzB,GAAG,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC1B,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,EAAE;IAC5C,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,EAAE;IAC9C,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;CAC5B,CAAC;AAEF,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC,EAAE,GAAG,gBAAgB,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC;AAEpF,MAAM,WAAW,GAAG,CAAC;KAClB,MAAM,CAAC;IACN,GAAG,gBAAgB;IACnB,WAAW,EAAE,IAAI,CAAC,QAAQ,EAAE;IAC5B,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC;IACtC,MAAM,EAAE,MAAM,CAAC,QAAQ,EAAE;CAC1B,CAAC;KACD,MAAM,EAAE,CAAC;AAEZ,MAAM,CAAC,MAAM,KAAK,GAAG,CAAC,SAAS,EAAE,SAAS,CAAU,CAAC;AAErD,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC;IAC3B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACxB,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC;IAC1B,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CACnC,CAAC,CAAC;AAEH,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC;IAC3B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC;IACxB,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC;IAC1B,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;CACnC,CAAC,CAAC;AAEH,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,kBAAkB,CAAC,MAAM,EAAE,CAAC,WAAW,EAAE,WAAW,CAAC,CAAC,CAAC;AAYvF,MAAM,UAAU,cAAc,CAAC,WAAwB;IACrD,IAAI,SAAS,IAAI,WAAW;QAAE,OAAO,EAAE,CAAC;IACxC,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,IAAI,EAAE,CAAC;IACxC,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,CAAC,WAAW,CAAC,KAAK,EAAE,GAAG,SAAS,CAAC,CAAC;IACrE,IAAI,OAAO,IAAI,WAAW;QAAE,OAAO,CAAC,GAAG,WAAW,CAAC,KAAK,EAAE,GAAG,SAAS,CAAC,CAAC;IACxE,OAAO,CAAC,GAAG,WAAW,CAAC,IAAI,EAAE,GAAG,SAAS,CAAC,CAAC;AAC7C,CAAC;AAED,MAAM,UAAU,kBAAkB,CAAC,QAAkB;IACnD,IAAI,QAAQ,IAAI,QAAQ;QAAE,OAAO,cAAc,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;IACjE,OAAO,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;AAC5D,CAAC"}
@@ -1,4 +1,6 @@
1
1
  import type { DetectionResult } from "./detect.js";
2
+ import type { AssertionResult } from "./grade-deterministic.js";
3
+ import type { RubricResult } from "./grade-judge.js";
2
4
  import type { Expectation } from "./schema.js";
3
5
  export interface CaseScore {
4
6
  readonly matched: number;
@@ -10,4 +12,10 @@ export interface CaseScore {
10
12
  }
11
13
  export declare function matchesExpectation(expectation: Expectation, run: DetectionResult): boolean;
12
14
  export declare function scoreCase(expectation: Expectation, runs: readonly DetectionResult[], threshold?: number): CaseScore;
15
+ export interface SolvingRunResult {
16
+ readonly assertions: readonly AssertionResult[];
17
+ readonly rubric: RubricResult | null;
18
+ }
19
+ export declare function solvingRunPassed(run: SolvingRunResult): boolean;
20
+ export declare function scoreSolving(perRun: readonly SolvingRunResult[], threshold?: number): CaseScore;
13
21
  //# sourceMappingURL=score.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"score.d.ts","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACnD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAI/C,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACjD;AAED,wBAAgB,kBAAkB,CAAC,WAAW,EAAE,WAAW,EAAE,GAAG,EAAE,eAAe,GAAG,OAAO,CAa1F;AAED,wBAAgB,SAAS,CACvB,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,SAAS,eAAe,EAAE,EAChC,SAAS,SAAoB,GAC5B,SAAS,CAWX"}
1
+ {"version":3,"file":"score.d.ts","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AACnD,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAChE,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAI/C,MAAM,WAAW,SAAS;IACxB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACjD;AAED,wBAAgB,kBAAkB,CAAC,WAAW,EAAE,WAAW,EAAE,GAAG,EAAE,eAAe,GAAG,OAAO,CAa1F;AAED,wBAAgB,SAAS,CACvB,WAAW,EAAE,WAAW,EACxB,IAAI,EAAE,SAAS,eAAe,EAAE,EAChC,SAAS,SAAoB,GAC5B,SAAS,CAWX;AAED,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,UAAU,EAAE,SAAS,eAAe,EAAE,CAAC;IAChD,QAAQ,CAAC,MAAM,EAAE,YAAY,GAAG,IAAI,CAAC;CACtC;AAED,wBAAgB,gBAAgB,CAAC,GAAG,EAAE,gBAAgB,GAAG,OAAO,CAI/D;AAED,wBAAgB,YAAY,CAC1B,MAAM,EAAE,SAAS,gBAAgB,EAAE,EACnC,SAAS,SAAoB,GAC5B,SAAS,CAWX"}
@@ -25,6 +25,23 @@ export function scoreCase(expectation, runs, threshold = DEFAULT_THRESHOLD) {
25
25
  histogram: histogramOf(runs),
26
26
  };
27
27
  }
28
+ export function solvingRunPassed(run) {
29
+ const assertionsPass = run.assertions.every((a) => a.pass);
30
+ const rubricPass = run.rubric === null || run.rubric.pass;
31
+ return assertionsPass && rubricPass;
32
+ }
33
+ export function scoreSolving(perRun, threshold = DEFAULT_THRESHOLD) {
34
+ const matched = perRun.filter(solvingRunPassed).length;
35
+ const triggerRate = perRun.length === 0 ? 0 : matched / perRun.length;
36
+ return {
37
+ matched,
38
+ runs: perRun.length,
39
+ triggerRate,
40
+ threshold,
41
+ pass: triggerRate >= threshold,
42
+ histogram: new Map(),
43
+ };
44
+ }
28
45
  function violatesNot(expectation, run) {
29
46
  const forbidden = "noSkill" in expectation ? undefined : expectation.not;
30
47
  if (!forbidden || forbidden.length === 0)
@@ -1 +1 @@
1
- {"version":3,"file":"score.js","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAGA,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAW9B,MAAM,UAAU,kBAAkB,CAAC,WAAwB,EAAE,GAAoB;IAC/E,IAAI,WAAW,CAAC,WAAW,EAAE,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IAEhD,IAAI,SAAS,IAAI,WAAW,EAAE,CAAC;QAC7B,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,CAAC;IACjC,CAAC;IACD,IAAI,OAAO,IAAI,WAAW,EAAE,CAAC;QAC3B,OAAO,GAAG,CAAC,UAAU,KAAK,WAAW,CAAC,KAAK,CAAC;IAC9C,CAAC;IACD,IAAI,OAAO,IAAI,WAAW,EAAE,CAAC;QAC3B,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,IAAI,WAAW,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAC/E,CAAC;IACD,OAAO,oBAAoB,CAAC,WAAW,CAAC,IAAI,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;AAC9D,CAAC;AAED,MAAM,UAAU,SAAS,CACvB,WAAwB,EACxB,IAAgC,EAChC,SAAS,GAAG,iBAAiB;IAE7B,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,kBAAkB,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;IAClF,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC;IAClE,OAAO;QACL,OAAO;QACP,IAAI,EAAE,IAAI,CAAC,MAAM;QACjB,WAAW;QACX,SAAS;QACT,IAAI,EAAE,WAAW,IAAI,SAAS;QAC9B,SAAS,EAAE,WAAW,CAAC,IAAI,CAAC;KAC7B,CAAC;AACJ,CAAC;AAED,SAAS,WAAW,CAAC,WAAwB,EAAE,GAAoB;IACjE,MAAM,SAAS,GAAG,SAAS,IAAI,WAAW,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC;IACzE,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACvD,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;AACvE,CAAC;AAED,SAAS,oBAAoB,CAAC,MAAyB,EAAE,QAA2B;IAClF,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,EAAE,KAAK,MAAM,CAAC,MAAM,CAAC;YAAE,MAAM,IAAI,CAAC,CAAC;QACvC,IAAI,MAAM,KAAK,MAAM,CAAC,MAAM;YAAE,OAAO,IAAI,CAAC;IAC5C,CAAC;IACD,OAAO,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC;AAClC,CAAC;AAED,SAAS,WAAW,CAAC,IAAgC;IACnD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,GAAG,CAAC,UAAU,IAAI,YAAY,CAAC;QAC3C,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
1
+ {"version":3,"file":"score.js","sourceRoot":"","sources":["../../src/eval/score.ts"],"names":[],"mappings":"AAKA,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAW9B,MAAM,UAAU,kBAAkB,CAAC,WAAwB,EAAE,GAAoB;IAC/E,IAAI,WAAW,CAAC,WAAW,EAAE,GAAG,CAAC;QAAE,OAAO,KAAK,CAAC;IAEhD,IAAI,SAAS,IAAI,WAAW,EAAE,CAAC;QAC7B,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,CAAC;IACjC,CAAC;IACD,IAAI,OAAO,IAAI,WAAW,EAAE,CAAC;QAC3B,OAAO,GAAG,CAAC,UAAU,KAAK,WAAW,CAAC,KAAK,CAAC;IAC9C,CAAC;IACD,IAAI,OAAO,IAAI,WAAW,EAAE,CAAC;QAC3B,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,IAAI,WAAW,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;IAC/E,CAAC;IACD,OAAO,oBAAoB,CAAC,WAAW,CAAC,IAAI,EAAE,GAAG,CAAC,QAAQ,CAAC,CAAC;AAC9D,CAAC;AAED,MAAM,UAAU,SAAS,CACvB,WAAwB,EACxB,IAAgC,EAChC,SAAS,GAAG,iBAAiB;IAE7B,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,kBAAkB,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;IAClF,MAAM,WAAW,GAAG,IAAI,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,IAAI,CAAC,MAAM,CAAC;IAClE,OAAO;QACL,OAAO;QACP,IAAI,EAAE,IAAI,CAAC,MAAM;QACjB,WAAW;QACX,SAAS;QACT,IAAI,EAAE,WAAW,IAAI,SAAS;QAC9B,SAAS,EAAE,WAAW,CAAC,IAAI,CAAC;KAC7B,CAAC;AACJ,CAAC;AAOD,MAAM,UAAU,gBAAgB,CAAC,GAAqB;IACpD,MAAM,cAAc,GAAG,GAAG,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAC3D,MAAM,UAAU,GAAG,GAAG,CAAC,MAAM,KAAK,IAAI,IAAI,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC;IAC1D,OAAO,cAAc,IAAI,UAAU,CAAC;AACtC,CAAC;AAED,MAAM,UAAU,YAAY,CAC1B,MAAmC,EACnC,SAAS,GAAG,iBAAiB;IAE7B,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC,MAAM,CAAC;IACvD,MAAM,WAAW,GAAG,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;IACtE,OAAO;QACL,OAAO;QACP,IAAI,EAAE,MAAM,CAAC,MAAM;QACnB,WAAW;QACX,SAAS;QACT,IAAI,EAAE,WAAW,IAAI,SAAS;QAC9B,SAAS,EAAE,IAAI,GAAG,EAAE;KACrB,CAAC;AACJ,CAAC;AAED,SAAS,WAAW,CAAC,WAAwB,EAAE,GAAoB;IACjE,MAAM,SAAS,GAAG,SAAS,IAAI,WAAW,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC;IACzE,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACvD,OAAO,GAAG,CAAC,UAAU,KAAK,IAAI,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,UAAU,CAAC,CAAC;AACvE,CAAC;AAED,SAAS,oBAAoB,CAAC,MAAyB,EAAE,QAA2B;IAClF,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,IAAI,EAAE,KAAK,MAAM,CAAC,MAAM,CAAC;YAAE,MAAM,IAAI,CAAC,CAAC;QACvC,IAAI,MAAM,KAAK,MAAM,CAAC,MAAM;YAAE,OAAO,IAAI,CAAC;IAC5C,CAAC;IACD,OAAO,MAAM,KAAK,MAAM,CAAC,MAAM,CAAC;AAClC,CAAC;AAED,SAAS,WAAW,CAAC,IAAgC;IACnD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;IACzC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,GAAG,CAAC,UAAU,IAAI,YAAY,CAAC;QAC3C,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9C,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jean.gnc/harness-kit",
3
- "version": "0.12.8",
3
+ "version": "0.13.0",
4
4
  "type": "module",
5
5
  "description": "Build your own multi-agent harness: typed toolkit for authoring plugins (skills, agents, commands, hooks) and shipping them to Claude Code and Codex from a single source tree.",
6
6
  "license": "MIT",
@@ -68,6 +68,7 @@
68
68
  ]
69
69
  },
70
70
  "dependencies": {
71
+ "@anthropic-ai/sdk": "^0.100.1",
71
72
  "citty": "^0.2.2",
72
73
  "js-yaml": "^4.1.1",
73
74
  "markdownlint-cli2": "^0.18.1",