@kilnai/core 0.10.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/dist/engine/domain/eval-config.d.ts +2 -1
  2. package/dist/engine/domain/eval-config.d.ts.map +1 -1
  3. package/dist/engine/domain/eval-config.js +9 -0
  4. package/dist/engine/domain/eval-config.js.map +1 -1
  5. package/dist/engine/domain/knowledge-source.d.ts +5 -1
  6. package/dist/engine/domain/knowledge-source.d.ts.map +1 -1
  7. package/dist/engine/error-catalog.js +2 -2
  8. package/dist/engine/error-catalog.js.map +1 -1
  9. package/dist/engine/errors.d.ts +1 -1
  10. package/dist/engine/errors.d.ts.map +1 -1
  11. package/dist/engine/gateway/conversation-event.d.ts +9 -1
  12. package/dist/engine/gateway/conversation-event.d.ts.map +1 -1
  13. package/dist/engine/gateway/tenant-config.d.ts +14 -0
  14. package/dist/engine/gateway/tenant-config.d.ts.map +1 -1
  15. package/dist/engine/gateway/tenant-config.js +22 -0
  16. package/dist/engine/gateway/tenant-config.js.map +1 -1
  17. package/dist/engine/index.d.ts +1 -1
  18. package/dist/engine/index.d.ts.map +1 -1
  19. package/dist/engine/index.js.map +1 -1
  20. package/dist/eval/consistency-runner.d.ts +28 -0
  21. package/dist/eval/consistency-runner.d.ts.map +1 -0
  22. package/dist/eval/consistency-runner.js +43 -0
  23. package/dist/eval/consistency-runner.js.map +1 -0
  24. package/dist/eval/experiment-runner.d.ts.map +1 -1
  25. package/dist/eval/experiment-runner.js +1 -0
  26. package/dist/eval/experiment-runner.js.map +1 -1
  27. package/dist/eval/index.d.ts +13 -0
  28. package/dist/eval/index.d.ts.map +1 -1
  29. package/dist/eval/index.js +12 -0
  30. package/dist/eval/index.js.map +1 -1
  31. package/dist/eval/scorer-factory.d.ts.map +1 -1
  32. package/dist/eval/scorer-factory.js +33 -0
  33. package/dist/eval/scorer-factory.js.map +1 -1
  34. package/dist/eval/scorers/context-relevance-scorer.d.ts +8 -0
  35. package/dist/eval/scorers/context-relevance-scorer.d.ts.map +1 -0
  36. package/dist/eval/scorers/context-relevance-scorer.js +32 -0
  37. package/dist/eval/scorers/context-relevance-scorer.js.map +1 -0
  38. package/dist/eval/scorers/effort-scorer.d.ts +6 -0
  39. package/dist/eval/scorers/effort-scorer.d.ts.map +1 -0
  40. package/dist/eval/scorers/effort-scorer.js +15 -0
  41. package/dist/eval/scorers/effort-scorer.js.map +1 -0
  42. package/dist/eval/scorers/handoff-quality-scorer.d.ts +8 -0
  43. package/dist/eval/scorers/handoff-quality-scorer.d.ts.map +1 -0
  44. package/dist/eval/scorers/handoff-quality-scorer.js +65 -0
  45. package/dist/eval/scorers/handoff-quality-scorer.js.map +1 -0
  46. package/dist/eval/scorers/milestone-scorer.d.ts +6 -0
  47. package/dist/eval/scorers/milestone-scorer.d.ts.map +1 -0
  48. package/dist/eval/scorers/milestone-scorer.js +35 -0
  49. package/dist/eval/scorers/milestone-scorer.js.map +1 -0
  50. package/dist/eval/scorers/multi-turn-consistency-scorer.d.ts +8 -0
  51. package/dist/eval/scorers/multi-turn-consistency-scorer.d.ts.map +1 -0
  52. package/dist/eval/scorers/multi-turn-consistency-scorer.js +55 -0
  53. package/dist/eval/scorers/multi-turn-consistency-scorer.js.map +1 -0
  54. package/dist/eval/scorers/policy-adherence-scorer.d.ts +9 -0
  55. package/dist/eval/scorers/policy-adherence-scorer.d.ts.map +1 -0
  56. package/dist/eval/scorers/policy-adherence-scorer.js +34 -0
  57. package/dist/eval/scorers/policy-adherence-scorer.js.map +1 -0
  58. package/dist/eval/scorers/resolution-scorer.d.ts +6 -0
  59. package/dist/eval/scorers/resolution-scorer.d.ts.map +1 -0
  60. package/dist/eval/scorers/resolution-scorer.js +25 -0
  61. package/dist/eval/scorers/resolution-scorer.js.map +1 -0
  62. package/dist/eval/scorers/routing-accuracy-scorer.d.ts +6 -0
  63. package/dist/eval/scorers/routing-accuracy-scorer.d.ts.map +1 -0
  64. package/dist/eval/scorers/routing-accuracy-scorer.js +19 -0
  65. package/dist/eval/scorers/routing-accuracy-scorer.js.map +1 -0
  66. package/dist/eval/scorers/safety-preservation-scorer.d.ts +9 -0
  67. package/dist/eval/scorers/safety-preservation-scorer.d.ts.map +1 -0
  68. package/dist/eval/scorers/safety-preservation-scorer.js +54 -0
  69. package/dist/eval/scorers/safety-preservation-scorer.js.map +1 -0
  70. package/dist/eval/scorers/tool-calling-accuracy-scorer.d.ts +6 -0
  71. package/dist/eval/scorers/tool-calling-accuracy-scorer.d.ts.map +1 -0
  72. package/dist/eval/scorers/tool-calling-accuracy-scorer.js +81 -0
  73. package/dist/eval/scorers/tool-calling-accuracy-scorer.js.map +1 -0
  74. package/dist/eval/scorers/tool-trajectory-scorer.d.ts +8 -0
  75. package/dist/eval/scorers/tool-trajectory-scorer.d.ts.map +1 -0
  76. package/dist/eval/scorers/tool-trajectory-scorer.js +51 -0
  77. package/dist/eval/scorers/tool-trajectory-scorer.js.map +1 -0
  78. package/dist/eval/types.d.ts +1 -0
  79. package/dist/eval/types.d.ts.map +1 -1
  80. package/dist/events/index.d.ts +11 -1
  81. package/dist/events/index.d.ts.map +1 -1
  82. package/dist/events/index.js +1 -0
  83. package/dist/events/index.js.map +1 -1
  84. package/dist/index.d.ts +2 -2
  85. package/dist/index.d.ts.map +1 -1
  86. package/dist/index.js.map +1 -1
  87. package/dist/knowledge/infrastructure/composite-extractor.d.ts +2 -2
  88. package/dist/knowledge/infrastructure/composite-extractor.d.ts.map +1 -1
  89. package/dist/knowledge/infrastructure/composite-extractor.js +2 -2
  90. package/dist/knowledge/infrastructure/composite-extractor.js.map +1 -1
  91. package/dist/knowledge/infrastructure/file-extractor.d.ts +2 -2
  92. package/dist/knowledge/infrastructure/file-extractor.d.ts.map +1 -1
  93. package/dist/knowledge/infrastructure/file-extractor.js +1 -1
  94. package/dist/knowledge/infrastructure/file-extractor.js.map +1 -1
  95. package/dist/knowledge/infrastructure/pdf-extractor.d.ts +2 -2
  96. package/dist/knowledge/infrastructure/pdf-extractor.d.ts.map +1 -1
  97. package/dist/knowledge/infrastructure/pdf-extractor.js +4 -2
  98. package/dist/knowledge/infrastructure/pdf-extractor.js.map +1 -1
  99. package/dist/knowledge/infrastructure/url-extractor.d.ts +2 -2
  100. package/dist/knowledge/infrastructure/url-extractor.d.ts.map +1 -1
  101. package/dist/knowledge/infrastructure/url-extractor.js +5 -3
  102. package/dist/knowledge/infrastructure/url-extractor.js.map +1 -1
  103. package/dist/knowledge/source-manager.d.ts +2 -0
  104. package/dist/knowledge/source-manager.d.ts.map +1 -1
  105. package/dist/knowledge/source-manager.js +54 -1
  106. package/dist/knowledge/source-manager.js.map +1 -1
  107. package/dist/observability/span-mapper.d.ts.map +1 -1
  108. package/dist/observability/span-mapper.js +15 -0
  109. package/dist/observability/span-mapper.js.map +1 -1
  110. package/dist/package/yaml-parser.d.ts.map +1 -1
  111. package/dist/package/yaml-parser.js +1 -0
  112. package/dist/package/yaml-parser.js.map +1 -1
  113. package/dist/skill/index.d.ts +2 -4
  114. package/dist/skill/index.d.ts.map +1 -1
  115. package/dist/skill/index.js +1 -2
  116. package/dist/skill/index.js.map +1 -1
  117. package/dist/skill/md-parser.d.ts +21 -0
  118. package/dist/skill/md-parser.d.ts.map +1 -0
  119. package/dist/skill/md-parser.js +168 -0
  120. package/dist/skill/md-parser.js.map +1 -0
  121. package/dist/skill/skill-registry.d.ts +16 -8
  122. package/dist/skill/skill-registry.d.ts.map +1 -1
  123. package/dist/skill/skill-registry.js +77 -30
  124. package/dist/skill/skill-registry.js.map +1 -1
  125. package/dist/skill/types.d.ts +7 -3
  126. package/dist/skill/types.d.ts.map +1 -1
  127. package/package.json +1 -1
  128. package/dist/skill/yaml-parser.d.ts +0 -18
  129. package/dist/skill/yaml-parser.d.ts.map +0 -1
  130. package/dist/skill/yaml-parser.js +0 -61
  131. package/dist/skill/yaml-parser.js.map +0 -1
  132. package/dist/skill/yaml-schema.d.ts +0 -20
  133. package/dist/skill/yaml-schema.d.ts.map +0 -1
  134. package/dist/skill/yaml-schema.js +0 -80
  135. package/dist/skill/yaml-schema.js.map +0 -1
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAEA,YAAY,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,OAAO,EAAE,oBAAoB,EAAE,gBAAgB,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AACpJ,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,YAAY,EAAE,sBAAsB,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC3F,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAChE,YAAY,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAEA,YAAY,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,OAAO,EAAE,oBAAoB,EAAE,gBAAgB,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AACpJ,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,sBAAsB,EAAE,MAAM,uCAAuC,CAAC;AAC/E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,yBAAyB,EAAE,MAAM,2CAA2C,CAAC;AACtF,OAAO,EAAE,0BAA0B,EAAE,MAAM,4CAA4C,CAAC;AACxF,OAAO,EAAE,wBAAwB,EAAE,MAAM,yCAAyC,CAAC;AACnF,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,YAAY,EAAE,sBAAsB,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC3F,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAChE,YAAY,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC;AACrF,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,YAAY,EAAE,uBAAuB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,yBAAyB,CAAC"}
@@ -5,6 +5,8 @@ export { JsonValidityScorer } from "./scorers/json-validity-scorer.js";
5
5
  export { LengthScorer } from "./scorers/length-scorer.js";
6
6
  export { LatencyScorer } from "./scorers/latency-scorer.js";
7
7
  export { CostScorer } from "./scorers/cost-scorer.js";
8
+ export { EffortScorer } from "./scorers/effort-scorer.js";
9
+ export { ResolutionScorer } from "./scorers/resolution-scorer.js";
8
10
  export { CompositeScorer } from "./scorers/composite-scorer.js";
9
11
  export { FaithfulnessScorer } from "./scorers/faithfulness-scorer.js";
10
12
  export { RelevanceScorer } from "./scorers/relevance-scorer.js";
@@ -12,8 +14,18 @@ export { CoherenceScorer } from "./scorers/coherence-scorer.js";
12
14
  export { HallucinationScorer } from "./scorers/hallucination-scorer.js";
13
15
  export { ToxicityScorer } from "./scorers/toxicity-scorer.js";
14
16
  export { CustomPromptScorer } from "./scorers/custom-prompt-scorer.js";
17
+ export { PolicyAdherenceScorer } from "./scorers/policy-adherence-scorer.js";
18
+ export { ContextRelevanceScorer } from "./scorers/context-relevance-scorer.js";
19
+ export { ToolTrajectoryScorer } from "./scorers/tool-trajectory-scorer.js";
20
+ export { ToolCallingAccuracyScorer } from "./scorers/tool-calling-accuracy-scorer.js";
21
+ export { MultiTurnConsistencyScorer } from "./scorers/multi-turn-consistency-scorer.js";
22
+ export { SafetyPreservationScorer } from "./scorers/safety-preservation-scorer.js";
23
+ export { RoutingAccuracyScorer } from "./scorers/routing-accuracy-scorer.js";
24
+ export { HandoffQualityScorer } from "./scorers/handoff-quality-scorer.js";
25
+ export { MilestoneScorer } from "./scorers/milestone-scorer.js";
15
26
  export { parseDatasetJsonl } from "./dataset-loader.js";
16
27
  export { createScorer } from "./scorer-factory.js";
17
28
  export { ExperimentRunner } from "./experiment-runner.js";
18
29
  export { compareExperiments } from "./experiment-comparator.js";
30
+ export { ConsistencyRunner } from "./consistency-runner.js";
19
31
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAG/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAE1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAG/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,sBAAsB,EAAE,MAAM,uCAAuC,CAAC;AAC/E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,yBAAyB,EAAE,MAAM,2CAA2C,CAAC;AACtF,OAAO,EAAE,0BAA0B,EAAE,MAAM,4CAA4C,CAAC;AACxF,OAAO,EAAE,wBAAwB,EAAE,MAAM,yCAAyC,CAAC;AACnF,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAE1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAEhE,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"scorer-factory.d.ts","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACpD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AAgBxE,wBAAgB,YAAY,CAAC,MAAM,EAAE,gBAAgB,EAAE,GAAG,CAAC,EAAE,SAAS,GAAG,MAAM,CA8B9E"}
1
+ {"version":3,"file":"scorer-factory.d.ts","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACpD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AA2BxE,wBAAgB,YAAY,CAAC,MAAM,EAAE,gBAAgB,EAAE,GAAG,CAAC,EAAE,SAAS,GAAG,MAAM,CA8C9E"}
@@ -13,6 +13,17 @@ import { CoherenceScorer } from "./scorers/coherence-scorer.js";
13
13
  import { HallucinationScorer } from "./scorers/hallucination-scorer.js";
14
14
  import { ToxicityScorer } from "./scorers/toxicity-scorer.js";
15
15
  import { CustomPromptScorer } from "./scorers/custom-prompt-scorer.js";
16
+ import { PolicyAdherenceScorer } from "./scorers/policy-adherence-scorer.js";
17
+ import { ContextRelevanceScorer } from "./scorers/context-relevance-scorer.js";
18
+ import { ToolTrajectoryScorer } from "./scorers/tool-trajectory-scorer.js";
19
+ import { EffortScorer } from "./scorers/effort-scorer.js";
20
+ import { ResolutionScorer } from "./scorers/resolution-scorer.js";
21
+ import { ToolCallingAccuracyScorer } from "./scorers/tool-calling-accuracy-scorer.js";
22
+ import { MultiTurnConsistencyScorer } from "./scorers/multi-turn-consistency-scorer.js";
23
+ import { SafetyPreservationScorer } from "./scorers/safety-preservation-scorer.js";
24
+ import { RoutingAccuracyScorer } from "./scorers/routing-accuracy-scorer.js";
25
+ import { HandoffQualityScorer } from "./scorers/handoff-quality-scorer.js";
26
+ import { MilestoneScorer } from "./scorers/milestone-scorer.js";
16
27
  export function createScorer(config, llm) {
17
28
  switch (config.type) {
18
29
  case "exact-match":
@@ -27,6 +38,16 @@ export function createScorer(config, llm) {
27
38
  return new LatencyScorer(config.maxLatencyMs ?? 5000);
28
39
  case "cost":
29
40
  return new CostScorer(config.maxCostUsd ?? 1.0);
41
+ case "effort":
42
+ return new EffortScorer();
43
+ case "resolution":
44
+ return new ResolutionScorer();
45
+ case "tool-calling-accuracy":
46
+ return new ToolCallingAccuracyScorer();
47
+ case "routing-accuracy":
48
+ return new RoutingAccuracyScorer();
49
+ case "milestone":
50
+ return new MilestoneScorer();
30
51
  case "composite": {
31
52
  const subScorers = (config.scorers ?? []).map((s) => createScorer(s, llm));
32
53
  return new CompositeScorer(config.name, subScorers);
@@ -37,6 +58,12 @@ export function createScorer(config, llm) {
37
58
  case "hallucination":
38
59
  case "toxicity":
39
60
  case "custom-prompt":
61
+ case "policy-adherence":
62
+ case "context-relevance":
63
+ case "tool-trajectory":
64
+ case "multi-turn-consistency":
65
+ case "safety-preservation":
66
+ case "handoff-quality":
40
67
  return createLLMScorer(config, llm);
41
68
  default:
42
69
  throw new KilnError("EVAL_SCORER_FAILED", `Unknown scorer type: ${config.type}`, {
@@ -58,6 +85,12 @@ function createLLMScorer(config, llm) {
58
85
  case "hallucination": return new HallucinationScorer(llm);
59
86
  case "toxicity": return new ToxicityScorer(llm);
60
87
  case "custom-prompt": return new CustomPromptScorer(config.name, config.prompt ?? "", llm);
88
+ case "policy-adherence": return new PolicyAdherenceScorer(llm, config.policies ?? []);
89
+ case "context-relevance": return new ContextRelevanceScorer(llm);
90
+ case "tool-trajectory": return new ToolTrajectoryScorer(llm);
91
+ case "multi-turn-consistency": return new MultiTurnConsistencyScorer(llm);
92
+ case "safety-preservation": return new SafetyPreservationScorer(llm);
93
+ case "handoff-quality": return new HandoffQualityScorer(llm);
61
94
  default:
62
95
  throw new KilnError("EVAL_SCORER_FAILED", `Unknown LLM scorer type: ${config.type}`, {
63
96
  context: { type: config.type, name: config.name },
@@ -1 +1 @@
1
- {"version":3,"file":"scorer-factory.js","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAAA,2DAA2D;AAI3D,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAChD,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AAEvE,MAAM,UAAU,YAAY,CAAC,MAAwB,EAAE,GAAe;IACpE,QAAQ,MAAM,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,aAAa;YAChB,OAAO,IAAI,gBAAgB,EAAE,CAAC;QAChC,KAAK,UAAU;YACb,OAAO,IAAI,cAAc,CAAC,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC;QACrD,KAAK,eAAe;YAClB,OAAO,IAAI,kBAAkB,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAC/C,KAAK,QAAQ;YACX,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC;QAC9D,KAAK,SAAS;YACZ,OAAO,IAAI,aAAa,CAAC,MAAM,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QACxD,KAAK,MAAM;YACT,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC,UAAU,IAAI,GAAG,CAAC,CAAC;QAClD,KAAK,WAAW,CAAC,CAAC,CAAC;YACjB,MAAM,UAAU,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;YAC3E,OAAO,IAAI,eAAe,CAAC,MAAM,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;QACtD,CAAC;QACD,KAAK,cAAc,CAAC;QACpB,KAAK,WAAW,CAAC;QACjB,KAAK,WAAW,CAAC;QACjB,KAAK,eAAe,CAAC;QACrB,KAAK,UAAU,CAAC;QAChB,KAAK,eAAe;YAClB,OAAO,eAAe,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QACtC;YACE,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,wBAAwB,MAAM,CAAC,IAAI,EAAE,EAAE;gBAC/E,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;aAClD,CAAC,CAAC;IACP,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,MAAwB,EAAE,GAAe;IAChE,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,eAAe,MAAM,CAAC,IAAI,iCAAiC,EAAE;YACr
G,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;YACjD,UAAU,EAAE,2DAA2D;SACxE,CAAC,CAAC;IACL,CAAC;IACD,QAAQ,MAAM,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,cAAc,CAAC,CAAC,OAAO,IAAI,kBAAkB,CAAC,GAAG,CAAC,CAAC;QACxD,KAAK,WAAW,CAAC,CAAC,OAAO,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;QAClD,KAAK,WAAW,CAAC,CAAC,OAAO,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;QAClD,KAAK,eAAe,CAAC,CAAC,OAAO,IAAI,mBAAmB,CAAC,GAAG,CAAC,CAAC;QAC1D,KAAK,UAAU,CAAC,CAAC,OAAO,IAAI,cAAc,CAAC,GAAG,CAAC,CAAC;QAChD,KAAK,eAAe,CAAC,CAAC,OAAO,IAAI,kBAAkB,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,IAAI,EAAE,EAAE,GAAG,CAAC,CAAC;QAC3F;YACE,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,4BAA4B,MAAM,CAAC,IAAI,EAAE,EAAE;gBACnF,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;aAClD,CAAC,CAAC;IACP,CAAC;AACH,CAAC"}
1
+ {"version":3,"file":"scorer-factory.js","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAAA,2DAA2D;AAI3D,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAChD,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,sBAAsB,EAAE,MAAM,uCAAuC,CAAC;AAC/E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,yBAAyB,EAAE,MAAM,2CAA2C,CAAC;AACtF,OAAO,EAAE,0BAA0B,EAAE,MAAM,4CAA4C,CAAC;AACxF,OAAO,EAAE,wBAAwB,EAAE,MAAM,yCAAyC,CAAC;AACnF,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAEhE,MAAM,UAAU,YAAY,CAAC,MAAwB,EAAE,GAAe;IACpE,QAAQ,MAAM,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,aAAa;YAChB,OAAO,IAAI,gBAAgB,EAAE,CAAC;QAChC,KAAK,UAAU;YACb,OAAO,IAAI,cAAc,CAAC,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC;QACrD,KAAK,eAAe;YAClB,OAAO,IAAI,kBAAkB,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAC/C,KAAK,QAAQ;YACX,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC;QAC9D,KAAK,SAAS;YACZ,OAAO,IAAI,aAAa,CAAC,MAAM,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QACxD,KAAK,MAAM;YACT,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC,UAAU,IAAI,GAAG,CAAC,CAAC;QAClD,KAAK,QAAQ;YACX,OAAO,IAAI,YAAY,EAAE,CAAC;QAC5B,KAAK,YAAY;YACf,OAAO,IAAI,gBAAgB,EAAE,CAAC;QAChC,KAAK,uBAAuB;YAC1B,OAAO,IAAI,yBAAyB,EAAE,CAAC;QACzC,KAAK,kBAAkB;YACrB,OAAO,IAAI,qBAAqB,EAAE,CAAC;QACrC,KAAK,WAAW;YACd,OAAO,IAAI,eAAe,EAAE,CAAC;QAC/B
,KAAK,WAAW,CAAC,CAAC,CAAC;YACjB,MAAM,UAAU,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;YAC3E,OAAO,IAAI,eAAe,CAAC,MAAM,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;QACtD,CAAC;QACD,KAAK,cAAc,CAAC;QACpB,KAAK,WAAW,CAAC;QACjB,KAAK,WAAW,CAAC;QACjB,KAAK,eAAe,CAAC;QACrB,KAAK,UAAU,CAAC;QAChB,KAAK,eAAe,CAAC;QACrB,KAAK,kBAAkB,CAAC;QACxB,KAAK,mBAAmB,CAAC;QACzB,KAAK,iBAAiB,CAAC;QACvB,KAAK,wBAAwB,CAAC;QAC9B,KAAK,qBAAqB,CAAC;QAC3B,KAAK,iBAAiB;YACpB,OAAO,eAAe,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QACtC;YACE,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,wBAAwB,MAAM,CAAC,IAAI,EAAE,EAAE;gBAC/E,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;aAClD,CAAC,CAAC;IACP,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,MAAwB,EAAE,GAAe;IAChE,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,eAAe,MAAM,CAAC,IAAI,iCAAiC,EAAE;YACrG,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;YACjD,UAAU,EAAE,2DAA2D;SACxE,CAAC,CAAC;IACL,CAAC;IACD,QAAQ,MAAM,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,cAAc,CAAC,CAAC,OAAO,IAAI,kBAAkB,CAAC,GAAG,CAAC,CAAC;QACxD,KAAK,WAAW,CAAC,CAAC,OAAO,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;QAClD,KAAK,WAAW,CAAC,CAAC,OAAO,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;QAClD,KAAK,eAAe,CAAC,CAAC,OAAO,IAAI,mBAAmB,CAAC,GAAG,CAAC,CAAC;QAC1D,KAAK,UAAU,CAAC,CAAC,OAAO,IAAI,cAAc,CAAC,GAAG,CAAC,CAAC;QAChD,KAAK,eAAe,CAAC,CAAC,OAAO,IAAI,kBAAkB,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,IAAI,EAAE,EAAE,GAAG,CAAC,CAAC;QAC3F,KAAK,kBAAkB,CAAC,CAAC,OAAO,IAAI,qBAAqB,CAAC,GAAG,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;QACtF,KAAK,mBAAmB,CAAC,CAAC,OAAO,IAAI,sBAAsB,CAAC,GAAG,CAAC,CAAC;QACjE,KAAK,iBAAiB,CAAC,CAAC,OAAO,IAAI,oBAAoB,CAAC,GAAG,CAAC,CAAC;QAC7D,KAAK,wBAAwB,CAAC,CAAC,OAAO,IAAI,0BAA0B,CAAC,GAAG,CAAC,CAAC;QAC1E,KAAK,qBAAqB,CAAC,CAAC,OAAO,IAAI,wBAAwB,CAAC,GAAG,CAAC,CAAC;QACrE,KAAK,iBAAiB,CAAC,CAAC,OAAO,IAAI,oBAAoB,CAAC,GAAG,CAAC,CAAC;QAC7D;YACE,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,4BAA4B,MAAM,CAAC,IAAI,EAAE,EAAE;gBACnF,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE
,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;aAClD,CAAC,CAAC;IACP,CAAC;AACH,CAAC"}
@@ -0,0 +1,8 @@
1
+ import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
2
+ export declare class ContextRelevanceScorer implements Scorer {
3
+ private readonly llm;
4
+ readonly name = "context-relevance";
5
+ constructor(llm: ScorerLLM);
6
+ score(input: EvalInput): Promise<EvalScore>;
7
+ }
8
+ //# sourceMappingURL=context-relevance-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-relevance-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/context-relevance-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAG3E,qBAAa,sBAAuB,YAAW,MAAM;IAGvC,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,uBAAuB;gBAEP,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAwBlD"}
@@ -0,0 +1,32 @@
1
+ // ContextRelevanceScorer: evaluates whether retrieved context chunks are relevant to the input query
2
+ import { parseLLMResponse } from "./parse-llm-response.js";
3
+ export class ContextRelevanceScorer {
4
+ llm;
5
+ name = "context-relevance";
6
+ constructor(llm) {
7
+ this.llm = llm;
8
+ }
9
+ async score(input) {
10
+ const chunks = input.context ?? [];
11
+ if (chunks.length === 0) {
12
+ return { name: this.name, score: 0, reasoning: "No context provided" };
13
+ }
14
+ const contextList = chunks.map((c, i) => `[${i + 1}] ${c}`).join("\n\n");
15
+ const prompt = `Evaluate context relevance. Are the retrieved context chunks relevant to the user's query? This measures retrieval quality, not answer quality.
16
+
17
+ Query: ${input.input}
18
+
19
+ Retrieved context:
20
+ ${contextList}
21
+
22
+ Score 1.0 if all chunks are highly relevant to the query. Score 0.0 if none are relevant. Use intermediate scores based on the proportion and degree of relevance.
23
+
24
+ Respond EXACTLY in this format:
25
+ SCORE: <number from 0.0 to 1.0>
26
+ REASONING: <one sentence explanation>`;
27
+ const response = await this.llm.evaluate(prompt);
28
+ const { score, reasoning } = parseLLMResponse(response, this.name);
29
+ return { name: this.name, score, reasoning };
30
+ }
31
+ }
32
+ //# sourceMappingURL=context-relevance-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"context-relevance-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/context-relevance-scorer.ts"],"names":[],"mappings":"AAAA,qGAAqG;AAGrG,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAE3D,MAAM,OAAO,sBAAsB;IAGJ;IAFpB,IAAI,GAAG,mBAAmB,CAAC;IAEpC,YAA6B,GAAc;QAAd,QAAG,GAAH,GAAG,CAAW;IAAG,CAAC;IAE/C,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,IAAI,EAAE,CAAC;QACnC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACxB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,qBAAqB,EAAE,CAAC;QACzE,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACzE,MAAM,MAAM,GAAG;;SAEV,KAAK,CAAC,KAAK;;;EAGlB,WAAW;;;;;;sCAMyB,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
@@ -0,0 +1,6 @@
1
+ import type { EvalInput, EvalScore, Scorer } from "../types.js";
2
+ export declare class EffortScorer implements Scorer {
3
+ readonly name = "effort";
4
+ score(input: EvalInput): Promise<EvalScore>;
5
+ }
6
+ //# sourceMappingURL=effort-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"effort-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/effort-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAIhE,qBAAa,YAAa,YAAW,MAAM;IACzC,QAAQ,CAAC,IAAI,YAAY;IAEnB,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CASlD"}
@@ -0,0 +1,15 @@
1
+ // EffortScorer: rule-based scorer bridging enrichment effort score into eval framework
2
+ import { computeEffortScore } from "../../enrichment/effort-score.js";
3
+ export class EffortScorer {
4
+ name = "effort";
5
+ async score(input) {
6
+ const components = input.metadata?.effortComponents;
7
+ if (!components) {
8
+ return { name: this.name, score: 0, reasoning: "No effort components in metadata" };
9
+ }
10
+ const rawScore = computeEffortScore(components);
11
+ const normalized = rawScore / 10; // 0-10 -> 0-1
12
+ return { name: this.name, score: normalized, reasoning: `Effort score: ${rawScore}/10` };
13
+ }
14
+ }
15
+ //# sourceMappingURL=effort-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"effort-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/effort-scorer.ts"],"names":[],"mappings":"AAAA,uFAAuF;AAGvF,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AAGtE,MAAM,OAAO,YAAY;IACd,IAAI,GAAG,QAAQ,CAAC;IAEzB,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,UAAU,GAAG,KAAK,CAAC,QAAQ,EAAE,gBAAgD,CAAC;QACpF,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,kCAAkC,EAAE,CAAC;QACtF,CAAC;QACD,MAAM,QAAQ,GAAG,kBAAkB,CAAC,UAAU,CAAC,CAAC;QAChD,MAAM,UAAU,GAAG,QAAQ,GAAG,EAAE,CAAC,CAAC,cAAc;QAChD,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,UAAU,EAAE,SAAS,EAAE,iBAAiB,QAAQ,KAAK,EAAE,CAAC;IAC3F,CAAC;CACF"}
@@ -0,0 +1,8 @@
1
+ import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
2
+ export declare class HandoffQualityScorer implements Scorer {
3
+ private readonly llm;
4
+ readonly name = "handoff-quality";
5
+ constructor(llm: ScorerLLM);
6
+ score(input: EvalInput): Promise<EvalScore>;
7
+ }
8
+ //# sourceMappingURL=handoff-quality-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"handoff-quality-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/handoff-quality-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AA4B3E,qBAAa,oBAAqB,YAAW,MAAM;IAGrC,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,qBAAqB;gBAEL,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAuClD"}
@@ -0,0 +1,65 @@
1
+ // HandoffQualityScorer: LLM-as-judge for context preservation across agent handoffs
2
+ import { parseLLMResponse } from "./parse-llm-response.js";
3
/**
 * Pulls the usable handoff events out of eval metadata.
 *
 * An entry is kept when it is a non-null object whose `fromAgent` and
 * `toAgent` are both strings; everything else is dropped.
 *
 * @param metadata Eval metadata, possibly undefined.
 * @returns The kept entries, or `undefined` when metadata is missing,
 *   `handoffHistory` is not an array, or no entry qualifies.
 */
function extractHandoffs(metadata) {
    const candidates = metadata ? metadata["handoffHistory"] : undefined;
    if (!Array.isArray(candidates)) {
        return undefined;
    }
    const handoffs = candidates.filter((item) => typeof item === "object" &&
        item !== null &&
        typeof item["fromAgent"] === "string" &&
        typeof item["toAgent"] === "string");
    return handoffs.length === 0 ? undefined : handoffs;
}
20
/**
 * LLM-as-judge scorer for context preservation across agent handoffs.
 *
 * Renders the handoff events found in `input.metadata` (via
 * `extractHandoffs`) into a prompt together with the user query and the
 * final output, sends that prompt to the injected judge and parses the
 * reply with `parseLLMResponse`.
 */
export class HandoffQualityScorer {
    llm;
    name = "handoff-quality";

    /**
     * @param llm Judge used for evaluation; its `evaluate(prompt)` is awaited.
     */
    constructor(llm) {
        this.llm = llm;
    }

    /**
     * @param input Eval input; `input`, `output` and `metadata` are read.
     * @returns `{ name, score, reasoning }`. The score is 0 (and the judge is
     *   not called) when the metadata has no usable handoff history.
     */
    async score(input) {
        const handoffs = extractHandoffs(input.metadata);
        if (!handoffs) {
            return { name: this.name, score: 0, reasoning: "No handoff history in metadata" };
        }
        const handoffLog = handoffs
            .map((h, i) => {
                const parts = [`Handoff ${i + 1}: ${h.fromAgent} -> ${h.toAgent}`];
                // Only fromAgent/toAgent are type-checked upstream. Interpolating
                // an object-valued reason/summary would put "[object Object]"
                // into the judge prompt, so require non-empty strings here.
                if (typeof h.reason === "string" && h.reason !== "")
                    parts.push(` Reason: ${h.reason}`);
                if (typeof h.summary === "string" && h.summary !== "")
                    parts.push(` Summary: ${h.summary}`);
                return parts.join("\n");
            })
            .join("\n\n");
        const prompt = `Evaluate the quality of agent handoffs in this conversation. Was context preserved across each agent switch?

User query: ${input.input}
Final output: ${input.output}

Handoff history:
${handoffLog}

Evaluate:
1. Was the handoff reason appropriate (correct agent for the task)?
2. Was the context summary accurate and complete?
3. Did the receiving agent pick up seamlessly without re-asking for information?
4. Was any critical context lost during the handoff?

Score 1.0 for seamless handoffs with full context preservation. Score 0.0 for complete context loss.

Respond EXACTLY in this format:
SCORE: <number from 0.0 to 1.0>
REASONING: <one sentence explanation>`;
        const response = await this.llm.evaluate(prompt);
        const { score, reasoning } = parseLLMResponse(response, this.name);
        return { name: this.name, score, reasoning };
    }
}
65
+ //# sourceMappingURL=handoff-quality-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"handoff-quality-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/handoff-quality-scorer.ts"],"names":[],"mappings":"AAAA,oFAAoF;AAGpF,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAS3D,SAAS,eAAe,CAAC,QAA6C;IACpE,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,gBAAgB,CAAC,CAAC;IACvC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,MAAM,MAAM,GAAmB,EAAE,CAAC;IAClC,KAAK,MAAM,KAAK,IAAI,GAAG,EAAE,CAAC;QACxB,IACE,OAAO,KAAK,KAAK,QAAQ;YACzB,KAAK,KAAK,IAAI;YACd,OAAQ,KAAiC,CAAC,WAAW,CAAC,KAAK,QAAQ;YACnE,OAAQ,KAAiC,CAAC,SAAS,CAAC,KAAK,QAAQ,EACjE,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,KAAqB,CAAC,CAAC;QACrC,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,MAAM,OAAO,oBAAoB;IAGF;IAFpB,IAAI,GAAG,iBAAiB,CAAC;IAElC,YAA6B,GAAc;QAAd,QAAG,GAAH,GAAG,CAAW;IAAG,CAAC;IAE/C,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,QAAQ,GAAG,eAAe,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACjD,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,gCAAgC,EAAE,CAAC;QACpF,CAAC;QAED,MAAM,UAAU,GAAG,QAAQ;aACxB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YACZ,MAAM,KAAK,GAAG,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,SAAS,OAAO,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;YACnE,IAAI,CAAC,CAAC,MAAM;gBAAE,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;YAClD,IAAI,CAAC,CAAC,OAAO;gBAAE,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;YACrD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC;aACD,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,MAAM,GAAG;;cAEL,KAAK,CAAC,KAAK;gBACT,KAAK,CAAC,MAAM;;;EAG1B,UAAU;;;;;;;;;;;;sCAY0B,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
@@ -0,0 +1,6 @@
1
+ import type { EvalInput, EvalScore, Scorer } from "../types.js";
2
+ export declare class MilestoneScorer implements Scorer { // rule-based scorer tracking intermediate checkpoint achievement
3
+ readonly name = "milestone"; // identifier placed in every EvalScore this scorer returns
4
+ score(input: EvalInput): Promise<EvalScore>; // completed / total over metadata.milestones, rounded to two decimals; score 0 when no usable milestones
5
+ }
6
+ //# sourceMappingURL=milestone-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"milestone-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/milestone-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAyBhE,qBAAa,eAAgB,YAAW,MAAM;IAC5C,QAAQ,CAAC,IAAI,eAAe;IAEtB,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAelD"}
@@ -0,0 +1,35 @@
1
+ // MilestoneScorer: rule-based scorer tracking intermediate checkpoint achievement
2
/**
 * Pulls the usable milestone records out of eval metadata.
 *
 * A record is kept when it is a non-null object with a string `name` and a
 * boolean `completed`; everything else is dropped.
 *
 * @param metadata Eval metadata, possibly undefined.
 * @returns The kept records, or `undefined` when metadata is missing,
 *   `milestones` is not an array, or no record qualifies.
 */
function extractMilestones(metadata) {
    const looksLikeMilestone = (item) => typeof item === "object" &&
        item !== null &&
        typeof item["name"] === "string" &&
        typeof item["completed"] === "boolean";
    const candidates = metadata ? metadata["milestones"] : undefined;
    if (!Array.isArray(candidates)) {
        return undefined;
    }
    const kept = candidates.filter(looksLikeMilestone);
    return kept.length === 0 ? undefined : kept;
}
19
/**
 * Rule-based scorer tracking how many intermediate checkpoints were reached.
 *
 * The score is completed / total over the milestones found in
 * `input.metadata` (via `extractMilestones`), rounded to two decimals.
 */
export class MilestoneScorer {
    name = "milestone";

    /**
     * @param input Eval input; only `metadata` is read.
     * @returns `{ name, score, reasoning }`. The reasoning lists the names of
     *   missed milestones; the score is 0 when no milestones are available.
     */
    async score(input) {
        const milestones = extractMilestones(input.metadata);
        if (!milestones) {
            return { name: this.name, score: 0, reasoning: "No milestones in metadata" };
        }
        // Single pass: count the completed ones and remember the rest by name.
        let done = 0;
        const missedNames = [];
        for (const milestone of milestones) {
            if (milestone.completed) {
                done += 1;
            }
            else {
                missedNames.push(milestone.name);
            }
        }
        const total = milestones.length;
        let reasoning = `${done}/${total} milestones completed`;
        if (missedNames.length > 0) {
            reasoning += `; missed: ${missedNames.join(", ")}`;
        }
        const ratio = done / total;
        return { name: this.name, score: Math.round(ratio * 100) / 100, reasoning };
    }
}
35
+ //# sourceMappingURL=milestone-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"milestone-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/milestone-scorer.ts"],"names":[],"mappings":"AAAA,kFAAkF;AASlF,SAAS,iBAAiB,CAAC,QAA6C;IACtE,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC;IACnC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,MAAM,UAAU,GAAgB,EAAE,CAAC;IACnC,KAAK,MAAM,KAAK,IAAI,GAAG,EAAE,CAAC;QACxB,IACE,OAAO,KAAK,KAAK,QAAQ;YACzB,KAAK,KAAK,IAAI;YACd,OAAQ,KAAiC,CAAC,MAAM,CAAC,KAAK,QAAQ;YAC9D,OAAQ,KAAiC,CAAC,WAAW,CAAC,KAAK,SAAS,EACpE,CAAC;YACD,UAAU,CAAC,IAAI,CAAC,KAAkB,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IACD,OAAO,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CAAC;AACxD,CAAC;AAED,MAAM,OAAO,eAAe;IACjB,IAAI,GAAG,WAAW,CAAC;IAE5B,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,UAAU,GAAG,iBAAiB,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACrD,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,2BAA2B,EAAE,CAAC;QAC/E,CAAC;QAED,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACxD,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC;QACnD,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAEzE,MAAM,KAAK,GAAG,CAAC,GAAG,SAAS,CAAC,MAAM,IAAI,UAAU,CAAC,MAAM,uBAAuB,CAAC,CAAC;QAChF,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;YAAE,KAAK,CAAC,IAAI,CAAC,WAAW,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAElE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,GAAG,EAAE,SAAS,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;IAChG,CAAC;CACF"}
@@ -0,0 +1,8 @@
1
+ import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
2
+ export declare class MultiTurnConsistencyScorer implements Scorer { // LLM-as-judge scorer for context retention across conversation turns
3
+ private readonly llm; // judge LLM supplied through the constructor
4
+ readonly name = "multi-turn-consistency"; // identifier placed in every EvalScore this scorer returns
5
+ constructor(llm: ScorerLLM); // llm: judge whose evaluate(prompt) reply is parsed into score + reasoning
6
+ score(input: EvalInput): Promise<EvalScore>; // needs >= 2 valid turns in metadata.conversationHistory; otherwise resolves to score 0
7
+ }
8
+ //# sourceMappingURL=multi-turn-consistency-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"multi-turn-consistency-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/multi-turn-consistency-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AA0B3E,qBAAa,0BAA2B,YAAW,MAAM;IAG3C,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,4BAA4B;gBAEZ,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CA+BlD"}
@@ -0,0 +1,55 @@
1
+ // MultiTurnConsistencyScorer: LLM-as-judge for context retention across conversation turns
2
+ import { parseLLMResponse } from "./parse-llm-response.js";
3
/**
 * Pulls the usable conversation turns out of eval metadata.
 *
 * A turn is kept when it is a non-null object whose `role` and `content` are
 * both strings. At least two kept turns are required.
 *
 * @param metadata Eval metadata, possibly undefined.
 * @returns The kept turns, or `undefined` when metadata is missing,
 *   `conversationHistory` is not an array, or fewer than two turns qualify.
 */
function extractConversationHistory(metadata) {
    const candidates = metadata ? metadata["conversationHistory"] : undefined;
    if (!Array.isArray(candidates)) {
        return undefined;
    }
    const validTurns = candidates.filter((item) => typeof item === "object" &&
        item !== null &&
        typeof item["role"] === "string" &&
        typeof item["content"] === "string");
    if (validTurns.length < 2) {
        return undefined;
    }
    return validTurns;
}
20
/**
 * LLM-as-judge scorer for context retention across conversation turns.
 *
 * Renders the turns found in `input.metadata` (via
 * `extractConversationHistory`) as a transcript, asks the injected judge to
 * grade it together with the final output, and parses the reply with
 * `parseLLMResponse`.
 */
export class MultiTurnConsistencyScorer {
    llm;
    name = "multi-turn-consistency";

    /**
     * @param llm Judge used for evaluation; its `evaluate(prompt)` is awaited.
     */
    constructor(llm) {
        this.llm = llm;
    }

    /**
     * @param input Eval input; `output` and `metadata` are read.
     * @returns `{ name, score, reasoning }`. The score is 0 (and the judge is
     *   not called) when no usable conversation history is available.
     */
    async score(input) {
        const turns = extractConversationHistory(input.metadata);
        if (!turns) {
            return { name: this.name, score: 0, reasoning: "No conversation history in metadata (need >= 2 turns)" };
        }
        const transcriptLines = [];
        for (const turn of turns) {
            transcriptLines.push(`[${turn.role}]: ${turn.content}`);
        }
        const prompt = [
            "Evaluate context retention across this multi-turn conversation. Did the assistant maintain awareness of previously stated facts, requests, and context throughout the conversation?",
            "",
            "Conversation:",
            transcriptLines.join("\n"),
            "",
            `Final output: ${input.output}`,
            "",
            "Evaluate:",
            "1. Does the assistant contradict earlier statements?",
            "2. Does it forget previously provided information?",
            "3. Does it ask questions already answered?",
            "4. Does it maintain a coherent understanding of the user's evolving needs?",
            "",
            "Score 1.0 for perfect context retention. Score 0.0 for complete context loss.",
            "",
            "Respond EXACTLY in this format:",
            "SCORE: <number from 0.0 to 1.0>",
            "REASONING: <one sentence explanation>",
        ].join("\n");
        const reply = await this.llm.evaluate(prompt);
        const verdict = parseLLMResponse(reply, this.name);
        return { name: this.name, score: verdict.score, reasoning: verdict.reasoning };
    }
}
55
+ //# sourceMappingURL=multi-turn-consistency-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"multi-turn-consistency-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/multi-turn-consistency-scorer.ts"],"names":[],"mappings":"AAAA,2FAA2F;AAG3F,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAO3D,SAAS,0BAA0B,CAAC,QAA6C;IAC/E,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,qBAAqB,CAAC,CAAC;IAC5C,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,MAAM,KAAK,GAAuB,EAAE,CAAC;IACrC,KAAK,MAAM,KAAK,IAAI,GAAG,EAAE,CAAC;QACxB,IACE,OAAO,KAAK,KAAK,QAAQ;YACzB,KAAK,KAAK,IAAI;YACd,OAAQ,KAAiC,CAAC,MAAM,CAAC,KAAK,QAAQ;YAC9D,OAAQ,KAAiC,CAAC,SAAS,CAAC,KAAK,QAAQ,EACjE,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,KAAyB,CAAC,CAAC;QACxC,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC;AAC/C,CAAC;AAED,MAAM,OAAO,0BAA0B;IAGR;IAFpB,IAAI,GAAG,wBAAwB,CAAC;IAEzC,YAA6B,GAAc;QAAd,QAAG,GAAH,GAAG,CAAW;IAAG,CAAC;IAE/C,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,OAAO,GAAG,0BAA0B,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QAC3D,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,uDAAuD,EAAE,CAAC;QAC3G,CAAC;QAED,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE9E,MAAM,MAAM,GAAG;;;EAGjB,UAAU;;gBAEI,KAAK,CAAC,MAAM;;;;;;;;;;;;sCAYU,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
@@ -0,0 +1,9 @@
1
+ import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
2
+ export declare class PolicyAdherenceScorer implements Scorer { // evaluates whether an output follows business policy rules
3
+ private readonly llm; // judge LLM supplied through the constructor
4
+ private readonly policies; // policy rules; rendered as a numbered list in the judge prompt
5
+ readonly name = "policy-adherence"; // identifier placed in every EvalScore this scorer returns
6
+ constructor(llm: ScorerLLM, policies: readonly string[]); // llm: judge to call; policies: rules the output must comply with
7
+ score(input: EvalInput): Promise<EvalScore>; // resolves to score 0 without calling the judge when the policy list is empty
8
+ }
9
+ //# sourceMappingURL=policy-adherence-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"policy-adherence-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/policy-adherence-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAG3E,qBAAa,qBAAsB,YAAW,MAAM;IAIhD,OAAO,CAAC,QAAQ,CAAC,GAAG;IACpB,OAAO,CAAC,QAAQ,CAAC,QAAQ;IAJ3B,QAAQ,CAAC,IAAI,sBAAsB;gBAGhB,GAAG,EAAE,SAAS,EACd,QAAQ,EAAE,SAAS,MAAM,EAAE;IAGxC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAwBlD"}
@@ -0,0 +1,34 @@
1
+ // PolicyAdherenceScorer: evaluates whether output follows business policy rules
2
+ import { parseLLMResponse } from "./parse-llm-response.js";
3
/**
 * LLM-as-judge scorer that checks an output against business policy rules.
 *
 * The configured policies are rendered as a numbered list, combined with the
 * input and output into a prompt, sent to the injected judge, and the reply
 * is parsed with `parseLLMResponse`.
 */
export class PolicyAdherenceScorer {
    llm;
    policies;
    name = "policy-adherence";

    /**
     * @param llm Judge used for evaluation; its `evaluate(prompt)` is awaited.
     * @param policies Policy rules the output has to comply with.
     */
    constructor(llm, policies) {
        this.llm = llm;
        this.policies = policies;
    }

    /**
     * @param input Eval input; `input` and `output` are read.
     * @returns `{ name, score, reasoning }`. The score is 0 (and the judge is
     *   not called) when no policies are configured.
     */
    async score(input) {
        if (this.policies.length === 0) {
            return { name: this.name, score: 0, reasoning: "No policies configured" };
        }
        const numberedPolicies = this.policies.map((policy, idx) => `${idx + 1}. ${policy}`);
        const prompt = [
            "Evaluate policy adherence. Does the output comply with ALL of the following business policies?",
            "",
            "Policies:",
            numberedPolicies.join("\n"),
            "",
            `Input: ${input.input}`,
            `Output: ${input.output}`,
            "",
            "Score 1.0 if the output fully adheres to all policies. Score 0.0 if it violates any policy. Use intermediate scores for partial adherence.",
            "",
            "Respond EXACTLY in this format:",
            "SCORE: <number from 0.0 to 1.0>",
            "REASONING: <one sentence explanation>",
        ].join("\n");
        const reply = await this.llm.evaluate(prompt);
        const verdict = parseLLMResponse(reply, this.name);
        return { name: this.name, score: verdict.score, reasoning: verdict.reasoning };
    }
}
34
+ //# sourceMappingURL=policy-adherence-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"policy-adherence-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/policy-adherence-scorer.ts"],"names":[],"mappings":"AAAA,gFAAgF;AAGhF,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAE3D,MAAM,OAAO,qBAAqB;IAIb;IACA;IAJV,IAAI,GAAG,kBAAkB,CAAC;IAEnC,YACmB,GAAc,EACd,QAA2B;QAD3B,QAAG,GAAH,GAAG,CAAW;QACd,aAAQ,GAAR,QAAQ,CAAmB;IAC3C,CAAC;IAEJ,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,wBAAwB,EAAE,CAAC;QAC5E,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC5E,MAAM,MAAM,GAAG;;;EAGjB,UAAU;;SAEH,KAAK,CAAC,KAAK;UACV,KAAK,CAAC,MAAM;;;;;;sCAMgB,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
@@ -0,0 +1,6 @@
1
+ import type { EvalInput, EvalScore, Scorer } from "../types.js";
2
+ export declare class ResolutionScorer implements Scorer { // rule-based scorer mapping resolution status to an eval score
3
+ readonly name = "resolution"; // identifier placed in every EvalScore this scorer returns
4
+ score(input: EvalInput): Promise<EvalScore>; // base score for metadata.resolution.status weighted by metadata.resolution.confidence; score 0 when no status is present
5
+ }
6
+ //# sourceMappingURL=resolution-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"resolution-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/resolution-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAShE,qBAAa,gBAAiB,YAAW,MAAM;IAC7C,QAAQ,CAAC,IAAI,gBAAgB;IAEvB,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAclD"}
@@ -0,0 +1,25 @@
1
+ // ResolutionScorer: rule-based scorer mapping resolution status to eval score
2
// Base score for each known resolution status. Kept in a Map rather than a
// plain object so a status such as "constructor" or "toString" cannot resolve
// to an inherited Object.prototype member (which used to produce a NaN score).
const RESOLUTION_SCORES = new Map([
    ["resolved", 1.0],
    ["partial", 0.5],
    ["ambiguous", 0.25],
    ["unresolved", 0.0],
]);

/**
 * Rule-based scorer mapping a resolution status to an eval score.
 *
 * The base score of `metadata.resolution.status` is multiplied by
 * `metadata.resolution.confidence`.
 */
export class ResolutionScorer {
    name = "resolution";

    /**
     * @param input Eval input; only `metadata.resolution` is read.
     * @returns `{ name, score, reasoning }`.
     *   - The score is 0 when there is no resolution status, or when the
     *     status is not one of the known ones.
     *   - A confidence that is missing or not a finite number is treated as
     *     1.0; a numeric confidence is clamped into [0, 1] so the weighted
     *     score always stays within 0-1.
     */
    async score(input) {
        const resolution = input.metadata?.resolution;
        if (!resolution?.status) {
            return { name: this.name, score: 0, reasoning: "No resolution data in metadata" };
        }
        const baseScore = RESOLUTION_SCORES.get(resolution.status) ?? 0;
        const rawConfidence = resolution.confidence;
        const confidence = typeof rawConfidence === "number" && Number.isFinite(rawConfidence)
            ? Math.min(1, Math.max(0, rawConfidence))
            : 1.0;
        return {
            name: this.name,
            score: baseScore * confidence,
            reasoning: `Resolution: ${resolution.status} (confidence: ${confidence})`,
        };
    }
}
25
+ //# sourceMappingURL=resolution-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"resolution-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/resolution-scorer.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAI9E,MAAM,iBAAiB,GAA2B;IAChD,QAAQ,EAAE,GAAG;IACb,OAAO,EAAE,GAAG;IACZ,SAAS,EAAE,IAAI;IACf,UAAU,EAAE,GAAG;CAChB,CAAC;AAEF,MAAM,OAAO,gBAAgB;IAClB,IAAI,GAAG,YAAY,CAAC;IAE7B,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,UAAU,GAAG,KAAK,CAAC,QAAQ,EAAE,UAAkE,CAAC;QACtG,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,CAAC;YACxB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,gCAAgC,EAAE,CAAC;QACpF,CAAC;QACD,MAAM,KAAK,GAAG,iBAAiB,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QACxD,MAAM,UAAU,GAAG,UAAU,CAAC,UAAU,IAAI,GAAG,CAAC;QAChD,MAAM,QAAQ,GAAG,KAAK,GAAG,UAAU,CAAC;QACpC,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,KAAK,EAAE,QAAQ;YACf,SAAS,EAAE,eAAe,UAAU,CAAC,MAAM,iBAAiB,UAAU,GAAG;SAC1E,CAAC;IACJ,CAAC;CACF"}
@@ -0,0 +1,6 @@
1
+ import type { EvalInput, EvalScore, Scorer } from "../types.js";
2
+ export declare class RoutingAccuracyScorer implements Scorer { // rule-based scorer checking whether the expected agent handled the message
3
+ readonly name = "routing-accuracy"; // identifier placed in every EvalScore this scorer returns
4
+ score(input: EvalInput): Promise<EvalScore>; // score 1 when metadata.activeAgentId === metadata.expectedAgentId; score 0 otherwise or when either id is missing
5
+ }
6
+ //# sourceMappingURL=routing-accuracy-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"routing-accuracy-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/routing-accuracy-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAEhE,qBAAa,qBAAsB,YAAW,MAAM;IAClD,QAAQ,CAAC,IAAI,sBAAsB;IAE7B,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAiBlD"}
@@ -0,0 +1,19 @@
1
+ // RoutingAccuracyScorer: rule-based scorer checking if the correct agent handled the message
2
/**
 * Rule-based scorer checking whether the expected agent handled the message.
 *
 * Compares `metadata.activeAgentId` with `metadata.expectedAgentId` using
 * strict equality.
 */
export class RoutingAccuracyScorer {
    name = "routing-accuracy";

    /**
     * @param input Eval input; only `metadata.expectedAgentId` and
     *   `metadata.activeAgentId` are read.
     * @returns `{ name, score, reasoning }` with score 1 on a match and 0 on a
     *   mismatch or when either id is missing.
     */
    async score(input) {
        const verdict = (score, reasoning) => ({ name: this.name, score, reasoning });
        const expected = input.metadata?.expectedAgentId;
        if (!expected) {
            return verdict(0, "No expectedAgentId in metadata");
        }
        const actual = input.metadata?.activeAgentId;
        if (!actual) {
            return verdict(0, "No activeAgentId in metadata");
        }
        return actual === expected
            ? verdict(1, `Correct: routed to "${actual}"`)
            : verdict(0, `Expected "${expected}", got "${actual}"`);
    }
}
19
+ //# sourceMappingURL=routing-accuracy-scorer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"routing-accuracy-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/routing-accuracy-scorer.ts"],"names":[],"mappings":"AAAA,6FAA6F;AAI7F,MAAM,OAAO,qBAAqB;IACvB,IAAI,GAAG,kBAAkB,CAAC;IAEnC,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,QAAQ,GAAG,KAAK,CAAC,QAAQ,EAAE,eAAqC,CAAC;QACvE,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,gCAAgC,EAAE,CAAC;QACpF,CAAC;QAED,MAAM,MAAM,GAAG,KAAK,CAAC,QAAQ,EAAE,aAAmC,CAAC;QACnE,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,8BAA8B,EAAE,CAAC;QAClF,CAAC;QAED,IAAI,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,uBAAuB,MAAM,GAAG,EAAE,CAAC;QACpF,CAAC;QAED,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,aAAa,QAAQ,WAAW,MAAM,GAAG,EAAE,CAAC;IAC7F,CAAC;CACF"}
@@ -0,0 +1,9 @@
1
+ import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
2
+ export declare class SafetyPreservationScorer implements Scorer { // NOTE(review): implementation is outside this chunk; presumably LLM-judged like the sibling scorers that take a ScorerLLM — confirm
3
+ private readonly llm; // ScorerLLM supplied through the constructor
4
+ readonly name = "safety-preservation"; // scorer identifier
5
+ constructor(llm: ScorerLLM); // llm: the ScorerLLM this scorer holds on to
6
+ score(input: EvalInput): Promise<EvalScore>; // Scorer contract: resolves to an EvalScore for the given input
7
+ private parseResponse; // private helper; signature is erased in the declaration output — TODO confirm what it parses
8
+ }
9
+ //# sourceMappingURL=safety-preservation-scorer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"safety-preservation-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/safety-preservation-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAE3E,qBAAa,wBAAyB,YAAW,MAAM;IAGzC,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,yBAAyB;gBAET,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;IA+BjD,OAAO,CAAC,aAAa;CAoBtB"}