@skilljack/evals 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +327 -0
  3. package/action/action.yml +72 -0
  4. package/action/index.ts +78 -0
  5. package/dist/action/index.d.ts +8 -0
  6. package/dist/action/index.d.ts.map +1 -0
  7. package/dist/action/index.js +68 -0
  8. package/dist/action/index.js.map +1 -0
  9. package/dist/src/cli.d.ts +9 -0
  10. package/dist/src/cli.d.ts.map +1 -0
  11. package/dist/src/cli.js +264 -0
  12. package/dist/src/cli.js.map +1 -0
  13. package/dist/src/config.d.ts +52 -0
  14. package/dist/src/config.d.ts.map +1 -0
  15. package/dist/src/config.js +194 -0
  16. package/dist/src/config.js.map +1 -0
  17. package/dist/src/index.d.ts +24 -0
  18. package/dist/src/index.d.ts.map +1 -0
  19. package/dist/src/index.js +28 -0
  20. package/dist/src/index.js.map +1 -0
  21. package/dist/src/parser.d.ts +22 -0
  22. package/dist/src/parser.d.ts.map +1 -0
  23. package/dist/src/parser.js +205 -0
  24. package/dist/src/parser.js.map +1 -0
  25. package/dist/src/pipeline.d.ts +53 -0
  26. package/dist/src/pipeline.d.ts.map +1 -0
  27. package/dist/src/pipeline.js +185 -0
  28. package/dist/src/pipeline.js.map +1 -0
  29. package/dist/src/report/github-summary.d.ts +15 -0
  30. package/dist/src/report/github-summary.d.ts.map +1 -0
  31. package/dist/src/report/github-summary.js +77 -0
  32. package/dist/src/report/github-summary.js.map +1 -0
  33. package/dist/src/report/report.d.ts +23 -0
  34. package/dist/src/report/report.d.ts.map +1 -0
  35. package/dist/src/report/report.js +216 -0
  36. package/dist/src/report/report.js.map +1 -0
  37. package/dist/src/runner/runner.d.ts +29 -0
  38. package/dist/src/runner/runner.d.ts.map +1 -0
  39. package/dist/src/runner/runner.js +211 -0
  40. package/dist/src/runner/runner.js.map +1 -0
  41. package/dist/src/runner/security.d.ts +26 -0
  42. package/dist/src/runner/security.d.ts.map +1 -0
  43. package/dist/src/runner/security.js +34 -0
  44. package/dist/src/runner/security.js.map +1 -0
  45. package/dist/src/runner/skill-setup.d.ts +19 -0
  46. package/dist/src/runner/skill-setup.d.ts.map +1 -0
  47. package/dist/src/runner/skill-setup.js +72 -0
  48. package/dist/src/runner/skill-setup.js.map +1 -0
  49. package/dist/src/scorer/deterministic.d.ts +12 -0
  50. package/dist/src/scorer/deterministic.d.ts.map +1 -0
  51. package/dist/src/scorer/deterministic.js +149 -0
  52. package/dist/src/scorer/deterministic.js.map +1 -0
  53. package/dist/src/scorer/judge.d.ts +34 -0
  54. package/dist/src/scorer/judge.d.ts.map +1 -0
  55. package/dist/src/scorer/judge.js +226 -0
  56. package/dist/src/scorer/judge.js.map +1 -0
  57. package/dist/src/scorer/scorer.d.ts +25 -0
  58. package/dist/src/scorer/scorer.d.ts.map +1 -0
  59. package/dist/src/scorer/scorer.js +149 -0
  60. package/dist/src/scorer/scorer.js.map +1 -0
  61. package/dist/src/session/session-logger.d.ts +30 -0
  62. package/dist/src/session/session-logger.d.ts.map +1 -0
  63. package/dist/src/session/session-logger.js +157 -0
  64. package/dist/src/session/session-logger.js.map +1 -0
  65. package/dist/src/types.d.ts +227 -0
  66. package/dist/src/types.d.ts.map +1 -0
  67. package/dist/src/types.js +16 -0
  68. package/dist/src/types.js.map +1 -0
  69. package/package.json +44 -0
@@ -0,0 +1 @@
1
+ {"version":3,"file":"skill-setup.js","sourceRoot":"","sources":["../../../src/runner/skill-setup.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAE7B;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,eAAuB,EACvB,GAAW;IAEX,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;IACtD,MAAM,EAAE,CAAC,KAAK,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE/C,MAAM,UAAU,GAAa,EAAE,CAAC;IAEhC,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;QAE3E,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;YAC5B,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC;gBACxB,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;gBAC3D,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;gBAEtD,MAAM,OAAO,CAAC,WAAW,EAAE,YAAY,CAAC,CAAC;gBACzC,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAC9B,CAAC;iBAAM,IAAI,KAAK,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;gBACrC,kCAAkC;gBAClC,MAAM,SAAS,GAAG,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC;gBACjD,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,SAAS,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;gBACrE,MAAM,EAAE,CAAC,QAAQ,CACf,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,KAAK,CAAC,IAAI,CAAC,EACtC,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,SAAS,EAAE,UAAU,CAAC,CAC5C,CAAC;gBACF,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAC7B,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,IAAI,KAAK,CACb,qCAAqC,eAAe,KAAK,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAC5G,CAAC;IACJ,CAAC;IAED,OAAO,UAAU,CAAC;AACpB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,kBAAkB,CAAC,GAAW;IAClD,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC;IACtD,IAAI,CAAC;QACH,MAAM,EAAE,CAAC,EAAE,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IAC3D,CAAC;IAAC,MAAM,CAAC;QACP,wBAAwB;IAC1B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,OAAO,CAAC,GAAW,EAAE,IAAY;IAC9C,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC
1C,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;IAE/D,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;QAC3C,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;QAE7C,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC;YACxB,MAAM,OAAO,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;QACnC,CAAC;aAAM,CAAC;YACN,MAAM,EAAE,CAAC,QAAQ,CAAC,OAAO,EAAE,QAAQ,CAAC,CAAC;QACvC,CAAC;IACH,CAAC;AACH,CAAC"}
@@ -0,0 +1,12 @@
1
+ /**
2
+ * Deterministic scorer for skill evaluations.
3
+ *
4
+ * Performs fast, free checks based on tool call analysis and output markers.
5
+ * No LLM calls required — checks are purely based on the session data.
6
+ */
7
+ import type { EvalTask, TaskResult, DeterministicResult } from '../types.js';
8
+ /**
9
+ * Run the deterministic checks configured in `task.deterministic` against a task result; returns null when the task defines no deterministic check.
10
+ */
11
+ export declare function scoreDeterministic(task: EvalTask, result: TaskResult): DeterministicResult | null;
12
+ //# sourceMappingURL=deterministic.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"deterministic.d.ts","sourceRoot":"","sources":["../../../src/scorer/deterministic.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,QAAQ,EACR,UAAU,EACV,mBAAmB,EACpB,MAAM,aAAa,CAAC;AAqBrB;;GAEG;AACH,wBAAgB,kBAAkB,CAChC,IAAI,EAAE,QAAQ,EACd,MAAM,EAAE,UAAU,GACjB,mBAAmB,GAAG,IAAI,CAuH5B"}
@@ -0,0 +1,149 @@
1
+ /**
2
+ * Deterministic scorer for skill evaluations.
3
+ *
4
+ * Performs fast, free checks based on tool call analysis and output markers.
5
+ * No LLM calls required — checks are purely based on the session data.
6
+ */
/**
 * Check if a tool name is a skill activation tool.
 *
 * Matches the local `Skill` tool exactly, or any tool whose final
 * `__`-separated segment contains "skill" without containing
 * "skill-resource" (MCP tool names look like `mcp__<server>__<tool>`).
 *
 * Only the last segment is inspected so that a server whose *name* merely
 * contains "skill" does not turn every one of its tools into a skill tool.
 *
 * @param {string} toolName - Tool name as recorded on a tool call.
 * @returns {boolean} True when the tool counts as a skill activation tool.
 */
function isSkillTool(toolName) {
    // Local mode uses "Skill", MCP mode uses mcp__*__skill
    if (toolName === 'Skill') {
        return true;
    }
    // For names without "__" this is the whole name, as before.
    const toolSegment = toolName.split('__').pop();
    return toolSegment.includes('skill') && !toolSegment.includes('skill-resource');
}
/**
 * Extract skill name from a tool call input.
 *
 * Looks at the `skill`, `skill_name` and `name` properties, in that order,
 * and returns the first one holding a non-empty string. Non-string values
 * are ignored so callers can safely compare the result with an expected
 * skill name and interpolate it into messages.
 *
 * @param {unknown} input - Raw tool call input.
 * @returns {string | undefined} The skill name, or undefined when none is present.
 */
function extractSkillName(input) {
    if (typeof input !== 'object' || input === null) {
        return undefined;
    }
    // Local Skill tool uses 'skill', MCP uses 'name'
    for (const key of ['skill', 'skill_name', 'name']) {
        const value = input[key];
        if (typeof value === 'string' && value !== '') {
            return value;
        }
    }
    return undefined;
}
/**
 * Collect the names of every skill the agent activated during a run.
 *
 * Names come first from skill tool calls (in call order), then from the
 * `skillLoads` array (may be populated by runner).
 *
 * @param {object} result - Task result with optional `toolCalls` / `skillLoads`.
 * @returns {string[]} Activated skill names; empty when nothing was activated.
 */
function collectActivatedSkills(result) {
    const names = [];
    for (const call of result.toolCalls ?? []) {
        if (!isSkillTool(call.tool)) {
            continue;
        }
        const name = extractSkillName(call.input);
        if (name) {
            names.push(name);
        }
    }
    names.push(...(result.skillLoads ?? []));
    return names;
}
/**
 * Run deterministic checks on a task result.
 *
 * Checks, in order: skill activation, output marker (case-insensitive),
 * expected tool calls and forbidden tool calls. For positive tests
 * (`expectSkillActivation` truthy) every configured check must hold; for
 * negative tests only the absence of any skill activation is required.
 *
 * When `task.expectedSkillLoad` names a skill, activation counts as correct
 * if that skill appears anywhere among the activated skills, not only when
 * it happens to be the first one.
 *
 * @param {object} task - Evaluation task; reads `deterministic` and `expectedSkillLoad`.
 * @param {object} result - Task result; reads `isError`, `toolCalls`, `skillLoads`, `output`.
 * @returns {object | null} Deterministic result (`skillActivated`, `skillName`,
 *   `markerFound`, `expectedToolsCalled`, `unexpectedToolsCalled`, `passed`,
 *   `details`), or null when the task defines no deterministic check.
 */
export function scoreDeterministic(task, result) {
    const check = task.deterministic;
    if (!check) {
        return null;
    }
    const details = [];

    // 1. Check skill activation
    let activatedSkills = [];
    if (result.isError) {
        // NOTE(review): an errored run therefore passes a negative test below —
        // confirm that is the intended outcome.
        details.push('Task errored — treating as no activation');
    }
    else {
        activatedSkills = collectActivatedSkills(result);
    }
    let skillActivated = activatedSkills.length > 0;
    let activatedSkillName = activatedSkills[0];

    // Verify activation matches expectations
    if (check.expectSkillActivation) {
        const expectedSkill = task.expectedSkillLoad;
        if (!skillActivated) {
            details.push(`Expected skill activation but no skill was loaded`);
        }
        else if (!expectedSkill || expectedSkill === 'none') {
            details.push(`Skill activated: ${activatedSkillName}`);
        }
        else if (activatedSkills.includes(expectedSkill)) {
            activatedSkillName = expectedSkill;
            details.push(`Skill activated correctly: ${activatedSkillName}`);
        }
        else {
            details.push(`Wrong skill activated: expected '${expectedSkill}', got '${activatedSkills.join(', ')}'`);
            skillActivated = false; // Wrong skill doesn't count
        }
    }
    else if (skillActivated) {
        // Expect NO activation (false positive test)
        details.push(`Unexpected skill activation: ${activatedSkillName} (false positive)`);
    }
    else {
        details.push('Correctly did not activate any skill');
    }

    // 2. Check marker in output
    let markerFound = null;
    if (check.expectMarker) {
        const output = (result.output ?? '').toLowerCase();
        markerFound = output.includes(check.expectMarker.toLowerCase());
        details.push(markerFound
            ? `Marker found: "${check.expectMarker}"`
            : `Marker not found: "${check.expectMarker}"`);
    }

    // Shared by checks 3 and 4.
    const calledTools = new Set((result.toolCalls ?? []).map((call) => call.tool));

    // 3. Check expected tool calls
    let expectedToolsCalled = null;
    if (check.expectToolCalls && check.expectToolCalls.length > 0) {
        const missing = check.expectToolCalls.filter((tool) => !calledTools.has(tool));
        expectedToolsCalled = missing.length === 0;
        details.push(expectedToolsCalled
            ? `All expected tools called: ${check.expectToolCalls.join(', ')}`
            : `Missing expected tool calls: ${missing.join(', ')}`);
    }

    // 4. Check forbidden tool calls
    let unexpectedToolsCalled = null;
    if (check.expectNoToolCalls && check.expectNoToolCalls.length > 0) {
        const forbidden = check.expectNoToolCalls.filter((tool) => calledTools.has(tool));
        unexpectedToolsCalled = forbidden.length > 0;
        details.push(unexpectedToolsCalled
            ? `Forbidden tools were called: ${forbidden.join(', ')}`
            : `No forbidden tools called`);
    }

    // Compute overall pass/fail. A check left at null was not configured and
    // therefore cannot fail the task.
    const passed = check.expectSkillActivation
        // For positive tests: skill must be activated and no configured check may fail
        ? skillActivated &&
            markerFound !== false &&
            expectedToolsCalled !== false &&
            unexpectedToolsCalled !== true
        // For negative tests (false positive): skill must NOT be activated
        : !skillActivated;

    return {
        skillActivated,
        skillName: activatedSkillName,
        markerFound,
        expectedToolsCalled,
        unexpectedToolsCalled,
        passed,
        details,
    };
}
149
+ //# sourceMappingURL=deterministic.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"deterministic.js","sourceRoot":"","sources":["../../../src/scorer/deterministic.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAQH;;GAEG;AACH,SAAS,WAAW,CAAC,QAAgB;IACnC,uDAAuD;IACvD,OAAO,QAAQ,KAAK,OAAO;QACzB,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,CAAC;AACzE,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,KAAc;IACtC,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,KAAK,KAAK,IAAI;QAAE,OAAO,SAAS,CAAC;IAClE,MAAM,GAAG,GAAG,KAAgC,CAAC;IAC7C,iDAAiD;IACjD,OAAQ,GAAG,CAAC,KAAgB,IAAK,GAAG,CAAC,UAAqB,IAAK,GAAG,CAAC,IAAe,IAAI,SAAS,CAAC;AAClG,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAChC,IAAc,EACd,MAAkB;IAElB,MAAM,KAAK,GAAG,IAAI,CAAC,aAAa,CAAC;IACjC,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,MAAM,OAAO,GAAa,EAAE,CAAC;IAE7B,4BAA4B;IAC5B,IAAI,cAAc,GAAG,KAAK,CAAC;IAC3B,IAAI,kBAAsC,CAAC;IAE3C,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;QACnB,OAAO,CAAC,IAAI,CAAC,0CAA0C,CAAC,CAAC;IAC3D,CAAC;SAAM,CAAC;QACN,yCAAyC;QACzC,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;YACpC,IAAI,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC3B,MAAM,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBAC1C,IAAI,IAAI,EAAE,CAAC;oBACT,cAAc,GAAG,IAAI,CAAC;oBACtB,kBAAkB,GAAG,IAAI,CAAC;oBAC1B,MAAM;gBACR,CAAC;YACH,CAAC;QACH,CAAC;QAED,2DAA2D;QAC3D,IAAI,CAAC,cAAc,IAAI,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpD,cAAc,GAAG,IAAI,CAAC;YACtB,kBAAkB,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED,yCAAyC;IACzC,IAAI,KAAK,CAAC,qBAAqB,EAAE,CAAC;QAChC,IAAI,cAAc,EAAE,CAAC;YACnB,2CAA2C;YAC3C,IAAI,IAAI,CAAC,iBAAiB,IAAI,IAAI,CAAC,iBAAiB,KAAK,MAAM,EAAE,CAAC;gBAChE,IAAI,kBAAkB,KAAK,IAAI,CAAC,iBAAiB,EAAE,CAAC;oBAClD,OAAO,CAAC,IAAI,CAAC,8BAA8B,kBAAkB,EAAE,CAAC,CAAC;gBACnE,CAAC;qBAAM,CAAC;oBACN,OAAO,CAAC,IAAI,CAAC,oCAAoC,IAAI,CAAC,iBAAiB,WAAW,kBAAkB,GAAG,CAAC,CAAC;oBACzG,cAAc,GAAG,KAAK,CAAC,CAAC,4BAA4B;gBACtD,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,OAAO,CAAC,IAAI,CAAC,oBAAoB,kBAAkB,EAAE,CAAC,CAAC;YACzD,CAAC;QACH,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CAAC,mDAAmD,CAAC,CAAC;QACpE,CAAC;IACH,CAAC;SAAM,CAAC;QACN,6CAA6C;QA
C7C,IAAI,cAAc,EAAE,CAAC;YACnB,OAAO,CAAC,IAAI,CAAC,gCAAgC,kBAAkB,mBAAmB,CAAC,CAAC;QACtF,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CAAC,sCAAsC,CAAC,CAAC;QACvD,CAAC;IACH,CAAC;IAED,4BAA4B;IAC5B,IAAI,WAAW,GAAmB,IAAI,CAAC;IACvC,IAAI,KAAK,CAAC,YAAY,EAAE,CAAC;QACvB,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC;QAC3C,MAAM,MAAM,GAAG,KAAK,CAAC,YAAY,CAAC,WAAW,EAAE,CAAC;QAChD,WAAW,GAAG,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACtC,OAAO,CAAC,IAAI,CACV,WAAW;YACT,CAAC,CAAC,kBAAkB,KAAK,CAAC,YAAY,GAAG;YACzC,CAAC,CAAC,sBAAsB,KAAK,CAAC,YAAY,GAAG,CAChD,CAAC;IACJ,CAAC;IAED,+BAA+B;IAC/B,IAAI,mBAAmB,GAAmB,IAAI,CAAC;IAC/C,IAAI,KAAK,CAAC,eAAe,IAAI,KAAK,CAAC,eAAe,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC9D,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QACjE,MAAM,OAAO,GAAG,KAAK,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACzE,mBAAmB,GAAG,OAAO,CAAC,MAAM,KAAK,CAAC,CAAC;QAC3C,IAAI,mBAAmB,EAAE,CAAC;YACxB,OAAO,CAAC,IAAI,CAAC,8BAA8B,KAAK,CAAC,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACjF,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CAAC,gCAAgC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACrE,CAAC;IACH,CAAC;IAED,gCAAgC;IAChC,IAAI,qBAAqB,GAAmB,IAAI,CAAC;IACjD,IAAI,KAAK,CAAC,iBAAiB,IAAI,KAAK,CAAC,iBAAiB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClE,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;QACjE,MAAM,SAAS,GAAG,KAAK,CAAC,iBAAiB,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5E,qBAAqB,GAAG,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC;QAC7C,IAAI,qBAAqB,EAAE,CAAC;YAC1B,OAAO,CAAC,IAAI,CAAC,gCAAgC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACvE,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CAAC,2BAA2B,CAAC,CAAC;QAC5C,CAAC;IACH,CAAC;IAED,4BAA4B;IAC5B,IAAI,MAAe,CAAC;IACpB,IAAI,KAAK,CAAC,qBAAqB,EAAE,CAAC;QAChC,8CAA8C;QAC9C,MAAM,GAAG,cAAc,CAAC;QACxB,IAAI,WAAW,KAAK,IAAI;YAAE,MAAM,GAAG,MAAM,IAAI,WAAW,CAAC;QACzD,IAAI,mBAAmB,KAAK,IAAI;YAAE,M
AAM,GAAG,MAAM,IAAI,mBAAmB,CAAC;QACzE,IAAI,qBAAqB,KAAK,IAAI;YAAE,MAAM,GAAG,MAAM,IAAI,CAAC,qBAAqB,CAAC;IAChF,CAAC;SAAM,CAAC;QACN,mEAAmE;QACnE,MAAM,GAAG,CAAC,cAAc,CAAC;IAC3B,CAAC;IAED,OAAO;QACL,cAAc;QACd,SAAS,EAAE,kBAAkB;QAC7B,WAAW;QACX,mBAAmB;QACnB,qBAAqB;QACrB,MAAM;QACN,OAAO;KACR,CAAC;AACJ,CAAC"}
@@ -0,0 +1,34 @@
1
+ /**
2
+ * LLM-as-judge for scoring skill evaluation results.
3
+ *
4
+ * Uses Claude (via Agent SDK) to evaluate agent performance on three dimensions:
5
+ * - Discovery (0/1): Did agent load the expected skill?
6
+ * - Adherence (1-5): How well did agent follow skill instructions?
7
+ * - Output Quality (1-5): Does output meet task requirements?
8
+ */
9
+ import type { EvalTask, TaskResult, JudgeScore, JudgeOptions } from '../types.js';
10
+ /**
11
+ * LLM-as-judge for scoring skill evaluation results. The judge model and output-truncation length default to values from the loaded config when not given in the constructor options.
12
+ */
13
+ export declare class SkillJudge {
14
+ private options;
15
+ constructor(options?: JudgeOptions);
16
+ /**
17
+ * Build the prompt for the judge by filling the prompt template with the task prompt, expected skill, criteria, golden checklist, loaded skills and the (truncated) agent output.
18
+ */
19
+ private buildJudgePrompt;
20
+ /**
21
+ * Parse the judge's JSON response into a JudgeScore; when no valid JSON object can be extracted, an 'agent_error' score with weightedScore 0 is returned instead of throwing.
22
+ */
23
+ private parseJudgeResponse;
24
+ private createErrorScore;
25
+ /**
26
+ * Score a single evaluation result. Results flagged as errors are scored without calling the judge; if the judge call throws, a heuristic fallback score is returned rather than rejecting.
27
+ */
28
+ judgeResult(task: EvalTask, result: TaskResult): Promise<JudgeScore>;
29
+ /**
30
+ * Score all evaluation results one at a time; results[i] is scored against tasks[i].
31
+ */
32
+ judgeAll(tasks: EvalTask[], results: TaskResult[]): Promise<JudgeScore[]>;
33
+ }
34
+ //# sourceMappingURL=judge.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge.d.ts","sourceRoot":"","sources":["../../../src/scorer/judge.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAGH,OAAO,KAAK,EACV,QAAQ,EACR,UAAU,EACV,UAAU,EACV,YAAY,EAEb,MAAM,aAAa,CAAC;AAsErB;;GAEG;AACH,qBAAa,UAAU;IACrB,OAAO,CAAC,OAAO,CAAyB;gBAE5B,OAAO,GAAE,YAAiB;IAStC;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAyBxB;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAuC1B,OAAO,CAAC,gBAAgB;IAYxB;;OAEG;IACG,WAAW,CAAC,IAAI,EAAE,QAAQ,EAAE,MAAM,EAAE,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;IA+D1E;;OAEG;IACG,QAAQ,CAAC,KAAK,EAAE,QAAQ,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,GAAG,OAAO,CAAC,UAAU,EAAE,CAAC;CAShF"}
@@ -0,0 +1,226 @@
1
+ /**
2
+ * LLM-as-judge for scoring skill evaluation results.
3
+ *
4
+ * Uses Claude (via Agent SDK) to evaluate agent performance on three dimensions:
5
+ * - Discovery (0/1): Did agent load the expected skill?
6
+ * - Adherence (1-5): How well did agent follow skill instructions?
7
+ * - Output Quality (1-5): Does output meet task requirements?
8
+ */
9
+ import { query } from '@anthropic-ai/claude-agent-sdk';
10
+ import { isAssistantMessage, isResultMessage, isTextBlock, } from '../types.js';
11
+ import { loadConfigSync } from '../config.js';
12
+ const JUDGE_PROMPT_TEMPLATE = `You are an expert evaluator for AI agent skills. Score this skill evaluation result.
13
+
14
+ ## Task Information
15
+ **Prompt given to agent:** {prompt}
16
+
17
+ **Expected skill to load:** {expectedSkill}
18
+
19
+ **Criteria:**
20
+ {criteriaText}
21
+
22
+ **Golden checklist (expected behaviors):**
23
+ {checklistText}
24
+
25
+ ## Agent Result
26
+ **Skills that were loaded:** {skillLoads}
27
+
28
+ **Agent output:**
29
+ {output}
30
+
31
+ ## Scoring Instructions
32
+
33
+ Score the agent's performance on three dimensions:
34
+
35
+ 1. **Discovery (0 or 1)**: Did the agent load the expected skill "{expectedSkill}"?
36
+ - Score 1 if the expected skill was loaded
37
+ - Score 0 if it was not loaded
38
+ - If expected skill is "none", score 1 if NO skill was loaded, 0 if a skill was incorrectly loaded
39
+
40
+ 2. **Adherence (1-5)**: How well did the agent follow the skill's instructions?
41
+ - 5 = Perfectly followed all instructions
42
+ - 4 = Followed most instructions with minor deviations
43
+ - 3 = Followed core instructions but missed some details
44
+ - 2 = Partially followed instructions with significant gaps
45
+ - 1 = Did not follow the skill's instructions
46
+
47
+ 3. **Output Quality (1-5)**: Does the output meet the task requirements?
48
+ - 5 = Excellent output, meets all requirements
49
+ - 4 = Good output with minor issues
50
+ - 3 = Acceptable output, meets basic requirements
51
+ - 2 = Poor output, missing key requirements
52
+ - 1 = Unacceptable output
53
+
54
+ 4. **Failure Category** (if score < 4 on any dimension):
55
+ - "discovery_failure": Agent didn't load the skill when it should have
56
+ - "false_positive": Agent loaded a skill when it should NOT have
57
+ - "instruction_ambiguity": Agent misinterpreted skill instructions
58
+ - "missing_guidance": Skill didn't cover a needed case
59
+ - "agent_error": Agent made a mistake despite clear guidance
60
+ - "none": No significant failure
61
+
62
+ Respond with a JSON object:
63
+ \`\`\`json
64
+ {
65
+ "discovery": <0 or 1>,
66
+ "adherence": <1-5>,
67
+ "output_quality": <1-5>,
68
+ "failure_category": "<category or none>",
69
+ "reasoning": "<brief explanation of scores>"
70
+ }
71
+ \`\`\`
72
+ `; // Placeholders {prompt}, {expectedSkill}, {criteriaText}, {checklistText}, {skillLoads} and {output} are substituted by SkillJudge.buildJudgePrompt.
/**
 * LLM-as-judge for scoring skill evaluation results.
 *
 * The judge model and output-truncation length default to the loaded config
 * when they are not supplied through the constructor options.
 */
export class SkillJudge {
    options;

    /**
     * @param {object} [options] - Judge options.
     * @param {string} [options.model] - Judge model; defaults to `config.defaultJudgeModel`.
     * @param {number} [options.outputTruncation] - Length the agent output is
     *   sliced to before it is placed in the prompt; defaults to
     *   `config.judgeOutputTruncation`.
     */
    constructor(options = {}) {
        const config = loadConfigSync();
        this.options = {
            model: options.model ?? config.defaultJudgeModel,
            outputTruncation: options.outputTruncation ?? config.judgeOutputTruncation,
        };
    }

    /**
     * Build the prompt for the judge.
     *
     * All placeholders are filled in a single pass with a replacer function.
     * That keeps `$&`-style sequences in the task prompt or agent output
     * literal, and prevents placeholder-looking text inside an inserted value
     * (e.g. a prompt that mentions "{output}") from capturing a later
     * substitution.
     *
     * @param {object} task - Evaluation task.
     * @param {object} result - Task result.
     * @returns {string} The fully substituted judge prompt.
     */
    buildJudgePrompt(task, result) {
        const criteriaLines = task.criteria.map((c) => `- **${capitalize(c.dimension)}** (weight ${c.weight}): ${c.description}`);
        const values = {
            prompt: task.prompt,
            expectedSkill: task.expectedSkillLoad,
            criteriaText: criteriaLines.length > 0
                ? criteriaLines.join('\n')
                : '- No specific criteria defined',
            checklistText: task.goldenChecklist.length > 0
                ? task.goldenChecklist.map((item) => `- ${item}`).join('\n')
                : '- No checklist defined',
            skillLoads: result.skillLoads.length > 0
                ? result.skillLoads.join(', ')
                : 'None',
            output: result.output.slice(0, this.options.outputTruncation) || '(no output)',
        };
        return JUDGE_PROMPT_TEMPLATE.replace(/\{(prompt|expectedSkill|criteriaText|checklistText|skillLoads|output)\}/g, (_placeholder, key) => String(values[key]));
    }

    /**
     * Parse the judge's JSON response into a JudgeScore.
     *
     * Candidate JSON texts are tried from most to least specific: the contents
     * of a fenced code block, the span from the first `{` to the last `}`, and
     * finally the shortest `{…}` match. The widest span is tried before the
     * shortest so a `}` inside the `reasoning` string does not truncate the
     * object. Scores are clamped to their documented ranges (discovery 0/1,
     * adherence and output quality 1-5).
     *
     * @param {string} response - Raw text returned by the judge.
     * @param {string} taskId - Id of the task being scored.
     * @param {Map<string, number>} weights - Per-dimension weights; missing
     *   entries fall back to 0.3 / 0.4 / 0.3 (discovery / adherence / output).
     * @returns {object} The parsed score, or an error score when no JSON object
     *   can be extracted.
     */
    parseJudgeResponse(response, taskId, weights) {
        const firstBrace = response.indexOf('{');
        const lastBrace = response.lastIndexOf('}');
        if (firstBrace === -1 || lastBrace < firstBrace) {
            return this.createErrorScore(taskId, 'Failed to parse judge response');
        }
        const fenced = response.match(/```(?:json)?\s*([\s\S]*?)```/);
        const candidates = [
            fenced ? fenced[1] : null,
            response.slice(firstBrace, lastBrace + 1),
            response.match(/\{[\s\S]*?\}/)[0],
        ];
        let data = null;
        for (const candidate of candidates) {
            if (candidate === null) {
                continue;
            }
            try {
                const parsed = JSON.parse(candidate);
                if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
                    data = parsed;
                    break;
                }
            }
            catch {
                // Not valid JSON; fall through to the next candidate.
            }
        }
        if (data === null) {
            return this.createErrorScore(taskId, 'Invalid JSON in judge response');
        }
        const toRating = (value) => Math.min(5, Math.max(1, Number(value) || 1));
        const discovery = Number(data.discovery) >= 1 ? 1 : 0;
        const adherence = toRating(data.adherence);
        const outputQuality = toRating(data.output_quality);
        // Map the 1-5 ratings onto 0-1 before weighting.
        const adherenceNorm = (adherence - 1) / 4;
        const outputNorm = (outputQuality - 1) / 4;
        const weightedScore = (weights.get('discovery') ?? 0.3) * discovery +
            (weights.get('adherence') ?? 0.4) * adherenceNorm +
            (weights.get('output') ?? 0.3) * outputNorm;
        return {
            taskId,
            discovery,
            adherence,
            outputQuality,
            weightedScore,
            failureCategory: (data.failure_category || 'none'),
            reasoning: data.reasoning || '',
        };
    }

    /**
     * Build the lowest possible score, tagged as an agent error.
     *
     * @param {string} taskId - Id of the task being scored.
     * @param {string} reason - Explanation stored in `reasoning`.
     * @returns {object} Score with discovery 0, ratings 1 and weightedScore 0.
     */
    createErrorScore(taskId, reason) {
        return {
            taskId,
            discovery: 0,
            adherence: 1,
            outputQuality: 1,
            weightedScore: 0,
            failureCategory: 'agent_error',
            reasoning: reason,
        };
    }

    /**
     * Score a single evaluation result.
     *
     * Errored results get an error score without calling the judge. If the
     * judge call itself throws, a heuristic score is returned instead: neutral
     * ratings, with discovery derived from `result.skillLoads`. For negative
     * tests (`expectedSkillLoad === 'none'`) discovery is 1 only when no skill
     * was loaded.
     *
     * @param {object} task - Evaluation task.
     * @param {object} result - Task result.
     * @returns {Promise<object>} The judge score for this task.
     */
    async judgeResult(task, result) {
        if (result.isError) {
            return this.createErrorScore(task.id, `Task failed with error: ${result.errorMessage}`);
        }
        const weights = new Map();
        for (const c of task.criteria) {
            weights.set(c.dimension, c.weight);
        }
        const prompt = this.buildJudgePrompt(task, result);
        try {
            let responseText = '';
            for await (const message of query({
                prompt,
                options: {
                    model: this.options.model,
                    allowedTools: [],
                    permissionMode: 'bypassPermissions',
                },
            })) {
                if (isAssistantMessage(message)) {
                    for (const block of message.message.content) {
                        if (isTextBlock(block)) {
                            responseText += block.text;
                        }
                    }
                }
                // A non-empty final result replaces the accumulated assistant text.
                if (isResultMessage(message) && message.result) {
                    responseText = message.result;
                }
            }
            return this.parseJudgeResponse(responseText, task.id, weights);
        }
        catch (error) {
            // Fallback: heuristic scoring
            const expectsNoSkill = task.expectedSkillLoad === 'none';
            const discoveryOk = expectsNoSkill
                ? result.skillLoads.length === 0
                : result.skillLoads.includes(task.expectedSkillLoad);
            let failureCategory = 'none';
            if (!discoveryOk) {
                failureCategory = expectsNoSkill ? 'false_positive' : 'discovery_failure';
            }
            return {
                taskId: task.id,
                discovery: discoveryOk ? 1 : 0,
                adherence: 3,
                outputQuality: 3,
                weightedScore: 0.5,
                failureCategory,
                reasoning: `Heuristic scoring (judge error: ${error instanceof Error ? error.message : 'unknown'})`,
            };
        }
    }

    /**
     * Score all evaluation results.
     *
     * Tasks are judged one at a time, in order; `results[i]` is scored against
     * `tasks[i]`. A task with no corresponding result gets an error score
     * instead of aborting the whole run.
     *
     * @param {object[]} tasks - Evaluation tasks.
     * @param {object[]} results - Task results, parallel to `tasks`.
     * @returns {Promise<object[]>} One score per task, in task order.
     */
    async judgeAll(tasks, results) {
        const scores = [];
        for (const [index, task] of tasks.entries()) {
            console.log(`Judging task ${task.id}...`);
            const result = results[index];
            const score = result
                ? await this.judgeResult(task, result)
                : this.createErrorScore(task.id, 'No result available for task');
            scores.push(score);
        }
        return scores;
    }
}
/**
 * Return `s` with its first UTF-16 code unit upper-cased and the remainder
 * left untouched. An empty string is returned unchanged.
 */
function capitalize(s) {
    const head = s.charAt(0);
    const tail = s.slice(1);
    return `${head.toUpperCase()}${tail}`;
}
226
+ //# sourceMappingURL=judge.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge.js","sourceRoot":"","sources":["../../../src/scorer/judge.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,gCAAgC,CAAC;AAQvD,OAAO,EACL,kBAAkB,EAClB,eAAe,EACf,WAAW,GACZ,MAAM,aAAa,CAAC;AACrB,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAE9C,MAAM,qBAAqB,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA4D7B,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,UAAU;IACb,OAAO,CAAyB;IAExC,YAAY,UAAwB,EAAE;QACpC,MAAM,MAAM,GAAG,cAAc,EAAE,CAAC;QAEhC,IAAI,CAAC,OAAO,GAAG;YACb,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,MAAM,CAAC,iBAAiB;YAChD,gBAAgB,EAAE,OAAO,CAAC,gBAAgB,IAAI,MAAM,CAAC,qBAAqB;SAC3E,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,IAAc,EAAE,MAAkB;QACzD,MAAM,aAAa,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CACrC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,UAAU,CAAC,CAAC,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,WAAW,EAAE,CACjF,CAAC;QACF,MAAM,YAAY,GAAG,aAAa,CAAC,MAAM,GAAG,CAAC;YAC3C,CAAC,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC;YAC1B,CAAC,CAAC,gCAAgC,CAAC;QAErC,MAAM,aAAa,GAAG,IAAI,CAAC,eAAe,CAAC,MAAM,GAAG,CAAC;YACnD,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;YAC5D,CAAC,CAAC,wBAAwB,CAAC;QAE7B,MAAM,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC;YAC7C,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC;YAC9B,CAAC,CAAC,MAAM,CAAC;QAEX,OAAO,qBAAqB;aACzB,OAAO,CAAC,UAAU,EAAE,IAAI,CAAC,MAAM,CAAC;aAChC,OAAO,CAAC,kBAAkB,EAAE,IAAI,CAAC,iBAAiB,CAAC;aACnD,OAAO,CAAC,gBAAgB,EAAE,YAAY,CAAC;aACvC,OAAO,CAAC,iBAAiB,EAAE,aAAa,CAAC;aACzC,OAAO,CAAC,cAAc,EAAE,UAAU,CAAC;aACnC,OAAO,CAAC,UAAU,EAAE,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,OAAO,CAAC,gBAAgB,CAAC,IAAI,aAAa,CAAC,CAAC;IACjG,CAAC;IAED;;OAEG;IACK,kBAAkB,CACxB,QAAgB,EAChB,MAAc,EACd,OAA4B;QAE5B,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QACjD,IAAI,CAAC,SAAS,EAAE,CAAC;YACf,OAAO,IAAI,CAAC,gBAAgB,CAAC,MAAM,EAAE,gCAAgC,CAAC,CAAC;QACzE,CAAC;QAED,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;YAEtC,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,SAAS
,CAAC,IAAI,CAAC,CAAC;YAC9C,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;YAC9C,MAAM,aAAa,GAAG,MAAM,CAAC,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;YAEvD,MAAM,aAAa,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;YAC1C,MAAM,UAAU,GAAG,CAAC,aAAa,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;YAE3C,MAAM,aAAa,GACjB,CAAC,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,GAAG,SAAS;gBAC7C,CAAC,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,GAAG,aAAa;gBACjD,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,GAAG,UAAU,CAAC;YAE9C,OAAO;gBACL,MAAM;gBACN,SAAS;gBACT,SAAS;gBACT,aAAa;gBACb,aAAa;gBACb,eAAe,EAAE,CAAC,IAAI,CAAC,gBAAgB,IAAI,MAAM,CAAoB;gBACrE,SAAS,EAAE,IAAI,CAAC,SAAS,IAAI,EAAE;aAChC,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC,gBAAgB,CAAC,MAAM,EAAE,gCAAgC,CAAC,CAAC;QACzE,CAAC;IACH,CAAC;IAEO,gBAAgB,CAAC,MAAc,EAAE,MAAc;QACrD,OAAO;YACL,MAAM;YACN,SAAS,EAAE,CAAC;YACZ,SAAS,EAAE,CAAC;YACZ,aAAa,EAAE,CAAC;YAChB,aAAa,EAAE,CAAC;YAChB,eAAe,EAAE,aAAa;YAC9B,SAAS,EAAE,MAAM;SAClB,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW,CAAC,IAAc,EAAE,MAAkB;QAClD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YACnB,OAAO;gBACL,MAAM,EAAE,IAAI,CAAC,EAAE;gBACf,SAAS,EAAE,CAAC;gBACZ,SAAS,EAAE,CAAC;gBACZ,aAAa,EAAE,CAAC;gBAChB,aAAa,EAAE,CAAC;gBAChB,eAAe,EAAE,aAAa;gBAC9B,SAAS,EAAE,2BAA2B,MAAM,CAAC,YAAY,EAAE;aAC5D,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,GAAG,EAAkB,CAAC;QAC1C,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC9B,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC;QACrC,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,CAAC,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;QAEnD,IAAI,CAAC;YACH,IAAI,YAAY,GAAG,EAAE,CAAC;YAEtB,IAAI,KAAK,EAAE,MAAM,OAAO,IAAI,KAAK,CAAC;gBAChC,MAAM;gBACN,OAAO,EAAE;oBACP,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;oBACzB,YAAY,EAAE,EAAE;oBAChB,cAAc,EAAE,mBAAmB;iBACpC;aACF,CAAC,EAAE,CAAC;gBACH,IAAI,kBAAkB,CAAC,OAAO,CAAC,EAAE,CAAC;oBAChC,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC;oBACxC,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;wBAC5B,IAAI,WAAW,CAAC,KAAK,CAAC,EAAE,CAAC;4BACvB,YAAY,IAAI,KAAK,CAAC,IAAI,CAAC;wBAC7B,CAAC;oBACH,CAAC;gBACH,CAAC;gB
AED,IAAI,eAAe,CAAC,OAAO,CAAC,EAAE,CAAC;oBAC7B,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;wBACnB,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC;oBAChC,CAAC;gBACH,CAAC;YACH,CAAC;YAED,OAAO,IAAI,CAAC,kBAAkB,CAAC,YAAY,EAAE,IAAI,CAAC,EAAE,EAAE,OAAO,CAAC,CAAC;QACjE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,8BAA8B;YAC9B,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,IAAI,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC7E,OAAO;gBACL,MAAM,EAAE,IAAI,CAAC,EAAE;gBACf,SAAS;gBACT,SAAS,EAAE,CAAC;gBACZ,aAAa,EAAE,CAAC;gBAChB,aAAa,EAAE,GAAG;gBAClB,eAAe,EAAE,SAAS,KAAK,CAAC,CAAC,CAAC,CAAC,mBAAmB,CAAC,CAAC,CAAC,MAAM;gBAC/D,SAAS,EAAE,mCAAmC,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,GAAG;aACpG,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,KAAiB,EAAE,OAAqB;QACrD,MAAM,MAAM,GAAiB,EAAE,CAAC;QAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,OAAO,CAAC,GAAG,CAAC,gBAAgB,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;YAC9C,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;YAC3D,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;QACD,OAAO,MAAM,CAAC;IAChB,CAAC;CACF;AAED,SAAS,UAAU,CAAC,CAAS;IAC3B,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;AAChD,CAAC"}
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Scoring orchestrator that combines deterministic and LLM-as-judge scoring.
3
+ *
4
+ * Deterministic scoring runs first (free, fast), then LLM judge if configured.
5
+ * Results are merged with deterministic taking precedence for discovery.
6
+ */
7
+ import type { EvalTask, TaskResult, CombinedScore } from '../types.js';
8
+ import type { JudgeOptions } from '../types.js';
9
+ export interface ScorerOptions {
10
+ /** Skip deterministic scoring (it otherwise runs only for tasks that define `deterministic`) */
11
+ noDeterministic?: boolean;
12
+ /** Skip LLM judge scoring (it otherwise runs only for tasks with at least one criterion) */
13
+ noJudge?: boolean;
14
+ /** Options passed to the SkillJudge constructor */
15
+ judgeOptions?: JudgeOptions;
16
+ }
17
+ /**
18
+ * Score a single task result using both deterministic and LLM judge methods, then merge the two results into one CombinedScore. Either method is skipped when disabled through `options` or not configured on the task.
19
+ */
20
+ export declare function scoreTask(task: EvalTask, result: TaskResult, options?: ScorerOptions): Promise<CombinedScore>;
21
+ /**
22
+ * Score all task results. NOTE(review): the implementation is not visible here; presumably results[i] is scored against tasks[i] — confirm before relying on the pairing.
23
+ */
24
+ export declare function scoreAll(tasks: EvalTask[], results: TaskResult[], options?: ScorerOptions): Promise<CombinedScore[]>;
25
+ //# sourceMappingURL=scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scorer.d.ts","sourceRoot":"","sources":["../../../src/scorer/scorer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EACV,QAAQ,EACR,UAAU,EACV,aAAa,EAId,MAAM,aAAa,CAAC;AAGrB,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAGhD,MAAM,WAAW,aAAa;IAC5B,iCAAiC;IACjC,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,6BAA6B;IAC7B,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,oBAAoB;IACpB,YAAY,CAAC,EAAE,YAAY,CAAC;CAC7B;AAED;;GAEG;AACH,wBAAsB,SAAS,CAC7B,IAAI,EAAE,QAAQ,EACd,MAAM,EAAE,UAAU,EAClB,OAAO,GAAE,aAAkB,GAC1B,OAAO,CAAC,aAAa,CAAC,CAmBxB;AAED;;GAEG;AACH,wBAAsB,QAAQ,CAC5B,KAAK,EAAE,QAAQ,EAAE,EACjB,OAAO,EAAE,UAAU,EAAE,EACrB,OAAO,GAAE,aAAkB,GAC1B,OAAO,CAAC,aAAa,EAAE,CAAC,CAY1B"}
@@ -0,0 +1,149 @@
1
+ /**
2
+ * Scoring orchestrator that combines deterministic and LLM-as-judge scoring.
3
+ *
4
+ * Deterministic scoring runs first (free, fast), then LLM judge if configured.
5
+ * Results are merged with deterministic taking precedence for discovery.
6
+ */
7
+ import { scoreDeterministic } from './deterministic.js';
8
+ import { SkillJudge } from './judge.js';
9
+ import { loadConfigSync, getDefaultWeights } from '../config.js';
/**
 * Produce the combined score for one task result.
 *
 * The deterministic scorer runs first (only when the task defines
 * deterministic checks and `options.noDeterministic` is not set), then the
 * LLM judge (only when the task has at least one criterion and
 * `options.noJudge` is not set). Whatever results are available are handed
 * to `mergeScores` together with the configured weights.
 *
 * @param task - Task definition; reads `id`, `deterministic`, `criteria` and `expectedSkillLoad`.
 * @param result - The recorded result for that task.
 * @param options - Optional switches (`noDeterministic`, `noJudge`, `judgeOptions`).
 * @returns Promise resolving to the merged score object.
 */
export async function scoreTask(task, result, options = {}) {
    const weights = getDefaultWeights(loadConfigSync());

    // Deterministic pass: synchronous, evaluated before the judge is consulted.
    const deterministicResult = !options.noDeterministic && task.deterministic
        ? scoreDeterministic(task, result)
        : null;

    // Judge pass: only awaited when there is something to judge against.
    let judgeResult = null;
    const judgeEnabled = !options.noJudge && task.criteria.length > 0;
    if (judgeEnabled) {
        const skillJudge = new SkillJudge(options.judgeOptions);
        judgeResult = await skillJudge.judgeResult(task, result);
    }

    // A task expecting no skill load is a negative test: not activating is the good outcome.
    const expectsNoSkill = task.expectedSkillLoad === 'none';
    return mergeScores(task.id, deterministicResult, judgeResult, weights, expectsNoSkill);
}
/**
 * Score every task, pairing each task with the result at the same index.
 *
 * Tasks are processed strictly one after another (each score is awaited
 * before the next task starts) and a progress line is logged per task.
 *
 * @param tasks - Task definitions, in order.
 * @param results - Results, positionally aligned with `tasks`.
 * @param options - Scoring options forwarded unchanged to `scoreTask`.
 * @returns Promise resolving to the scores, in task order.
 */
export async function scoreAll(tasks, results, options = {}) {
    const collected = [];
    for (const [position, currentTask] of tasks.entries()) {
        const currentResult = results[position];
        console.log(`Scoring task ${currentTask.id}...`);
        collected.push(await scoreTask(currentTask, currentResult, options));
    }
    return collected;
}
/**
 * Compute the weighted score from a 0/1 discovery score and 1-5 ratings.
 *
 * Adherence and output quality are normalized from the 1-5 scale to 0-1
 * before weighting. Missing weights fall back to 0.3 (discovery),
 * 0.4 (adherence) and 0.3 (output).
 *
 * @param weights - Object exposing `get(name)` (e.g. a Map) keyed by 'discovery' | 'adherence' | 'output'.
 * @param discovery - Discovery score (0 or 1).
 * @param adherence - Adherence rating on the 1-5 scale.
 * @param outputQuality - Output quality rating on the 1-5 scale.
 * @returns The weighted sum of the three normalized components.
 */
function computeWeightedScore(weights, discovery, adherence, outputQuality) {
    const adherenceNorm = (adherence - 1) / 4;
    const outputNorm = (outputQuality - 1) / 4;
    return ((weights.get('discovery') ?? 0.3) * discovery +
        (weights.get('adherence') ?? 0.4) * adherenceNorm +
        (weights.get('output') ?? 0.3) * outputNorm);
}

/**
 * Merge deterministic and judge scores into a combined score.
 *
 * Merge rules:
 * - Discovery: deterministic is authoritative (checks actual tool calls)
 * - Adherence/output: from judge; if no judge, map deterministic pass→5, fail→1
 * - Failure category: determined from available evidence
 *
 * @param taskId - Identifier copied onto the returned score.
 * @param det - Deterministic result (`passed`, `skillActivated`, `details`) or null.
 * @param judge - Judge result (`discovery`, `adherence`, `outputQuality`,
 *   `weightedScore`, `failureCategory`, `reasoning`) or null.
 * @param weights - Weight lookup exposing `get(name)`; see computeWeightedScore.
 * @param isNegativeTest - True when the task expects the skill NOT to activate.
 * @returns Combined score object; when neither input is available, a
 *   zero score with failure category 'agent_error'.
 */
function mergeScores(taskId, det, judge, weights, isNegativeTest = false) {
    // For negative tests (expectedSkillLoad === 'none'):
    // discovery = 1 means correctly did NOT activate (good)
    // discovery = 0 means incorrectly activated (false positive)
    const computeDiscovery = (activated) => isNegativeTest ? (activated ? 0 : 1) : (activated ? 1 : 0);

    // Case 1: Both available — merge
    if (det && judge) {
        const discovery = computeDiscovery(det.skillActivated);
        const adherence = judge.adherence;
        const outputQuality = judge.outputQuality;
        const weightedScore = computeWeightedScore(weights, discovery, adherence, outputQuality);

        // Start from the judge's verdict and let deterministic evidence override it.
        let failureCategory = judge.failureCategory;
        // A non-activated skill is only a discovery failure when activation was
        // expected. On a negative test, non-activation is the correct outcome
        // (discovery === 1), so a deterministic failure there has another cause
        // and the judge's category must be kept.
        if (!isNegativeTest && !det.passed && det.skillActivated === false) {
            failureCategory = 'discovery_failure';
        }
        // Check for false positive via deterministic
        if (det.skillActivated && det.details.some((d) => d.includes('false positive'))) {
            failureCategory = 'false_positive';
        }

        const reasons = [];
        if (det.details.length > 0) {
            reasons.push(`Deterministic: ${det.details.join('; ')}`);
        }
        if (judge.reasoning) {
            reasons.push(`Judge: ${judge.reasoning}`);
        }

        return {
            taskId,
            deterministic: det,
            judge,
            discovery,
            adherence,
            outputQuality,
            weightedScore,
            failureCategory,
            reasoning: reasons.join(' | '),
        };
    }

    // Case 2: Deterministic only
    if (det) {
        const discovery = computeDiscovery(det.skillActivated);
        // Without a judge, a deterministic pass/fail stands in for both 1-5 ratings.
        const adherence = det.passed ? 5 : 1;
        const outputQuality = det.passed ? 5 : 1;
        const weightedScore = computeWeightedScore(weights, discovery, adherence, outputQuality);

        let failureCategory = 'none';
        if (!det.skillActivated && det.details.some((d) => d.includes('Expected skill activation'))) {
            failureCategory = 'discovery_failure';
        }
        if (det.details.some((d) => d.includes('false positive'))) {
            failureCategory = 'false_positive';
        }

        return {
            taskId,
            deterministic: det,
            judge: null,
            discovery,
            adherence,
            outputQuality,
            weightedScore,
            failureCategory,
            reasoning: `Deterministic only: ${det.details.join('; ')}`,
        };
    }

    // Case 3: Judge only — pass the judge's own numbers through unchanged
    if (judge) {
        return {
            taskId,
            deterministic: null,
            judge,
            discovery: judge.discovery,
            adherence: judge.adherence,
            outputQuality: judge.outputQuality,
            weightedScore: judge.weightedScore,
            failureCategory: judge.failureCategory,
            reasoning: judge.reasoning,
        };
    }

    // Case 4: No scoring available
    return {
        taskId,
        deterministic: null,
        judge: null,
        discovery: 0,
        adherence: 1,
        outputQuality: 1,
        weightedScore: 0,
        failureCategory: 'agent_error',
        reasoning: 'No scoring method available (no deterministic check or LLM judge criteria defined)',
    };
}
149
+ //# sourceMappingURL=scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"scorer.js","sourceRoot":"","sources":["../../../src/scorer/scorer.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAUH,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AACxD,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAExC,OAAO,EAAE,cAAc,EAAE,iBAAiB,EAAE,MAAM,cAAc,CAAC;AAWjE;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAC7B,IAAc,EACd,MAAkB,EAClB,UAAyB,EAAE;IAE3B,MAAM,MAAM,GAAG,cAAc,EAAE,CAAC;IAChC,MAAM,OAAO,GAAG,iBAAiB,CAAC,MAAM,CAAC,CAAC;IAE1C,4BAA4B;IAC5B,IAAI,mBAAmB,GAA+B,IAAI,CAAC;IAC3D,IAAI,CAAC,OAAO,CAAC,eAAe,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;QACnD,mBAAmB,GAAG,kBAAkB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACzD,CAAC;IAED,wBAAwB;IACxB,IAAI,WAAW,GAAsB,IAAI,CAAC;IAC1C,IAAI,CAAC,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACjD,MAAM,KAAK,GAAG,IAAI,UAAU,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;QACnD,WAAW,GAAG,MAAM,KAAK,CAAC,WAAW,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC;IACtD,CAAC;IAED,MAAM,cAAc,GAAG,IAAI,CAAC,iBAAiB,KAAK,MAAM,CAAC;IACzD,OAAO,WAAW,CAAC,IAAI,CAAC,EAAE,EAAE,mBAAmB,EAAE,WAAW,EAAE,OAAO,EAAE,cAAc,CAAC,CAAC;AACzF,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,KAAiB,EACjB,OAAqB,EACrB,UAAyB,EAAE;IAE3B,MAAM,MAAM,GAAoB,EAAE,CAAC;IAEnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,MAAM,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QAC1B,OAAO,CAAC,GAAG,CAAC,gBAAgB,IAAI,CAAC,EAAE,KAAK,CAAC,CAAC;QAC1C,MAAM,KAAK,GAAG,MAAM,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QACrD,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IACrB,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,WAAW,CAClB,MAAc,EACd,GAA+B,EAC/B,KAAwB,EACxB,OAA4B,EAC5B,cAAc,GAAG,KAAK;IAEtB,qDAAqD;IACrD,wDAAwD;IACxD,6DAA6D;IAC7D,MAAM,gBAAgB,GAAG,CAAC,SAAkB,EAAE,EAAE,CAC9C,cAAc,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAE7D,iCAAiC;IACjC,IAAI,GAAG,IAAI,KAAK,EAAE,CAAC;QACjB,MAAM,SAAS,GAAG,gBAAgB,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC;QACvD,MAAM,SAAS,GAAG,KAAK
,CAAC,SAAS,CAAC;QAClC,MAAM,aAAa,GAAG,KAAK,CAAC,aAAa,CAAC;QAE1C,MAAM,aAAa,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QAC1C,MAAM,UAAU,GAAG,CAAC,aAAa,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QAC3C,MAAM,aAAa,GACjB,CAAC,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,GAAG,SAAS;YAC7C,CAAC,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,GAAG,aAAa;YACjD,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,GAAG,UAAU,CAAC;QAE9C,6BAA6B;QAC7B,IAAI,eAAe,GAAG,KAAK,CAAC,eAAe,CAAC;QAC5C,IAAI,CAAC,GAAG,CAAC,MAAM,IAAI,GAAG,CAAC,cAAc,KAAK,KAAK,EAAE,CAAC;YAChD,eAAe,GAAG,mBAAmB,CAAC;QACxC,CAAC;QACD,6CAA6C;QAC7C,IAAI,GAAG,CAAC,cAAc,IAAI,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,EAAE,CAAC;YAChF,eAAe,GAAG,gBAAgB,CAAC;QACrC,CAAC;QAED,MAAM,OAAO,GAAa,EAAE,CAAC;QAC7B,IAAI,GAAG,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO,CAAC,IAAI,CAAC,kBAAkB,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACrF,IAAI,KAAK,CAAC,SAAS;YAAE,OAAO,CAAC,IAAI,CAAC,UAAU,KAAK,CAAC,SAAS,EAAE,CAAC,CAAC;QAE/D,OAAO;YACL,MAAM;YACN,aAAa,EAAE,GAAG;YAClB,KAAK;YACL,SAAS;YACT,SAAS;YACT,aAAa;YACb,aAAa;YACb,eAAe;YACf,SAAS,EAAE,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC;SAC/B,CAAC;IACJ,CAAC;IAED,6BAA6B;IAC7B,IAAI,GAAG,EAAE,CAAC;QACR,MAAM,SAAS,GAAG,gBAAgB,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC;QACvD,MAAM,SAAS,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACrC,MAAM,aAAa,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QAEzC,MAAM,aAAa,GAAG,CAAC,SAAS,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QAC1C,MAAM,UAAU,GAAG,CAAC,aAAa,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC;QAC3C,MAAM,aAAa,GACjB,CAAC,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,GAAG,SAAS;YAC7C,CAAC,OAAO,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,GAAG,CAAC,GAAG,aAAa;YACjD,CAAC,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,IAAI,GAAG,CAAC,GAAG,UAAU,CAAC;QAE9C,IAAI,eAAe,GAAoB,MAAM,CAAC;QAC9C,IAAI,CAAC,GAAG,CAAC,cAAc,IAAI,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,2BAA2B,CAAC,CAAC,EAAE,CAAC;YAC5F,eAAe,GAAG,mBAAmB,CAAC;QACxC,CAAC;QACD,IAAI,GAAG,CAAC,OAAO,CAAC,
IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,gBAAgB,CAAC,CAAC,EAAE,CAAC;YAC1D,eAAe,GAAG,gBAAgB,CAAC;QACrC,CAAC;QAED,OAAO;YACL,MAAM;YACN,aAAa,EAAE,GAAG;YAClB,KAAK,EAAE,IAAI;YACX,SAAS;YACT,SAAS;YACT,aAAa;YACb,aAAa;YACb,eAAe;YACf,SAAS,EAAE,uBAAuB,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;SAC3D,CAAC;IACJ,CAAC;IAED,qBAAqB;IACrB,IAAI,KAAK,EAAE,CAAC;QACV,OAAO;YACL,MAAM;YACN,aAAa,EAAE,IAAI;YACnB,KAAK;YACL,SAAS,EAAE,KAAK,CAAC,SAAS;YAC1B,SAAS,EAAE,KAAK,CAAC,SAAS;YAC1B,aAAa,EAAE,KAAK,CAAC,aAAa;YAClC,aAAa,EAAE,KAAK,CAAC,aAAa;YAClC,eAAe,EAAE,KAAK,CAAC,eAAe;YACtC,SAAS,EAAE,KAAK,CAAC,SAAS;SAC3B,CAAC;IACJ,CAAC;IAED,+BAA+B;IAC/B,OAAO;QACL,MAAM;QACN,aAAa,EAAE,IAAI;QACnB,KAAK,EAAE,IAAI;QACX,SAAS,EAAE,CAAC;QACZ,SAAS,EAAE,CAAC;QACZ,aAAa,EAAE,CAAC;QAChB,aAAa,EAAE,CAAC;QAChB,eAAe,EAAE,aAAa;QAC9B,SAAS,EAAE,oFAAoF;KAChG,CAAC;AACJ,CAAC"}