@kilnai/core 0.10.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/engine/domain/eval-config.d.ts +2 -1
- package/dist/engine/domain/eval-config.d.ts.map +1 -1
- package/dist/engine/domain/eval-config.js +9 -0
- package/dist/engine/domain/eval-config.js.map +1 -1
- package/dist/engine/domain/integration.d.ts +50 -0
- package/dist/engine/domain/integration.d.ts.map +1 -0
- package/dist/engine/domain/integration.js +4 -0
- package/dist/engine/domain/integration.js.map +1 -0
- package/dist/engine/domain/knowledge-source.d.ts +5 -1
- package/dist/engine/domain/knowledge-source.d.ts.map +1 -1
- package/dist/engine/error-catalog.d.ts.map +1 -1
- package/dist/engine/error-catalog.js +23 -2
- package/dist/engine/error-catalog.js.map +1 -1
- package/dist/engine/errors.d.ts +1 -1
- package/dist/engine/errors.d.ts.map +1 -1
- package/dist/engine/errors.js.map +1 -1
- package/dist/engine/gateway/conversation-event.d.ts +9 -1
- package/dist/engine/gateway/conversation-event.d.ts.map +1 -1
- package/dist/engine/gateway/tenant-config.d.ts +22 -0
- package/dist/engine/gateway/tenant-config.d.ts.map +1 -1
- package/dist/engine/gateway/tenant-config.js +58 -0
- package/dist/engine/gateway/tenant-config.js.map +1 -1
- package/dist/engine/index.d.ts +3 -2
- package/dist/engine/index.d.ts.map +1 -1
- package/dist/engine/index.js.map +1 -1
- package/dist/eval/consistency-runner.d.ts +28 -0
- package/dist/eval/consistency-runner.d.ts.map +1 -0
- package/dist/eval/consistency-runner.js +43 -0
- package/dist/eval/consistency-runner.js.map +1 -0
- package/dist/eval/experiment-runner.d.ts.map +1 -1
- package/dist/eval/experiment-runner.js +1 -0
- package/dist/eval/experiment-runner.js.map +1 -1
- package/dist/eval/index.d.ts +13 -0
- package/dist/eval/index.d.ts.map +1 -1
- package/dist/eval/index.js +12 -0
- package/dist/eval/index.js.map +1 -1
- package/dist/eval/scorer-factory.d.ts.map +1 -1
- package/dist/eval/scorer-factory.js +33 -0
- package/dist/eval/scorer-factory.js.map +1 -1
- package/dist/eval/scorers/context-relevance-scorer.d.ts +8 -0
- package/dist/eval/scorers/context-relevance-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/context-relevance-scorer.js +32 -0
- package/dist/eval/scorers/context-relevance-scorer.js.map +1 -0
- package/dist/eval/scorers/effort-scorer.d.ts +6 -0
- package/dist/eval/scorers/effort-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/effort-scorer.js +15 -0
- package/dist/eval/scorers/effort-scorer.js.map +1 -0
- package/dist/eval/scorers/handoff-quality-scorer.d.ts +8 -0
- package/dist/eval/scorers/handoff-quality-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/handoff-quality-scorer.js +65 -0
- package/dist/eval/scorers/handoff-quality-scorer.js.map +1 -0
- package/dist/eval/scorers/milestone-scorer.d.ts +6 -0
- package/dist/eval/scorers/milestone-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/milestone-scorer.js +35 -0
- package/dist/eval/scorers/milestone-scorer.js.map +1 -0
- package/dist/eval/scorers/multi-turn-consistency-scorer.d.ts +8 -0
- package/dist/eval/scorers/multi-turn-consistency-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/multi-turn-consistency-scorer.js +55 -0
- package/dist/eval/scorers/multi-turn-consistency-scorer.js.map +1 -0
- package/dist/eval/scorers/policy-adherence-scorer.d.ts +9 -0
- package/dist/eval/scorers/policy-adherence-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/policy-adherence-scorer.js +34 -0
- package/dist/eval/scorers/policy-adherence-scorer.js.map +1 -0
- package/dist/eval/scorers/resolution-scorer.d.ts +6 -0
- package/dist/eval/scorers/resolution-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/resolution-scorer.js +25 -0
- package/dist/eval/scorers/resolution-scorer.js.map +1 -0
- package/dist/eval/scorers/routing-accuracy-scorer.d.ts +6 -0
- package/dist/eval/scorers/routing-accuracy-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/routing-accuracy-scorer.js +19 -0
- package/dist/eval/scorers/routing-accuracy-scorer.js.map +1 -0
- package/dist/eval/scorers/safety-preservation-scorer.d.ts +9 -0
- package/dist/eval/scorers/safety-preservation-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/safety-preservation-scorer.js +54 -0
- package/dist/eval/scorers/safety-preservation-scorer.js.map +1 -0
- package/dist/eval/scorers/tool-calling-accuracy-scorer.d.ts +6 -0
- package/dist/eval/scorers/tool-calling-accuracy-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/tool-calling-accuracy-scorer.js +81 -0
- package/dist/eval/scorers/tool-calling-accuracy-scorer.js.map +1 -0
- package/dist/eval/scorers/tool-trajectory-scorer.d.ts +8 -0
- package/dist/eval/scorers/tool-trajectory-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/tool-trajectory-scorer.js +51 -0
- package/dist/eval/scorers/tool-trajectory-scorer.js.map +1 -0
- package/dist/eval/types.d.ts +1 -0
- package/dist/eval/types.d.ts.map +1 -1
- package/dist/events/index.d.ts +11 -1
- package/dist/events/index.d.ts.map +1 -1
- package/dist/events/index.js +1 -0
- package/dist/events/index.js.map +1 -1
- package/dist/index.d.ts +3 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js.map +1 -1
- package/dist/knowledge/infrastructure/composite-extractor.d.ts +2 -2
- package/dist/knowledge/infrastructure/composite-extractor.d.ts.map +1 -1
- package/dist/knowledge/infrastructure/composite-extractor.js +2 -2
- package/dist/knowledge/infrastructure/composite-extractor.js.map +1 -1
- package/dist/knowledge/infrastructure/file-extractor.d.ts +2 -2
- package/dist/knowledge/infrastructure/file-extractor.d.ts.map +1 -1
- package/dist/knowledge/infrastructure/file-extractor.js +1 -1
- package/dist/knowledge/infrastructure/file-extractor.js.map +1 -1
- package/dist/knowledge/infrastructure/pdf-extractor.d.ts +2 -2
- package/dist/knowledge/infrastructure/pdf-extractor.d.ts.map +1 -1
- package/dist/knowledge/infrastructure/pdf-extractor.js +4 -2
- package/dist/knowledge/infrastructure/pdf-extractor.js.map +1 -1
- package/dist/knowledge/infrastructure/url-extractor.d.ts +2 -2
- package/dist/knowledge/infrastructure/url-extractor.d.ts.map +1 -1
- package/dist/knowledge/infrastructure/url-extractor.js +5 -3
- package/dist/knowledge/infrastructure/url-extractor.js.map +1 -1
- package/dist/knowledge/source-manager.d.ts +2 -0
- package/dist/knowledge/source-manager.d.ts.map +1 -1
- package/dist/knowledge/source-manager.js +54 -1
- package/dist/knowledge/source-manager.js.map +1 -1
- package/dist/observability/span-mapper.d.ts.map +1 -1
- package/dist/observability/span-mapper.js +15 -0
- package/dist/observability/span-mapper.js.map +1 -1
- package/dist/package/yaml-parser.d.ts.map +1 -1
- package/dist/package/yaml-parser.js +1 -0
- package/dist/package/yaml-parser.js.map +1 -1
- package/dist/skill/index.d.ts +2 -4
- package/dist/skill/index.d.ts.map +1 -1
- package/dist/skill/index.js +1 -2
- package/dist/skill/index.js.map +1 -1
- package/dist/skill/md-parser.d.ts +21 -0
- package/dist/skill/md-parser.d.ts.map +1 -0
- package/dist/skill/md-parser.js +168 -0
- package/dist/skill/md-parser.js.map +1 -0
- package/dist/skill/skill-registry.d.ts +16 -8
- package/dist/skill/skill-registry.d.ts.map +1 -1
- package/dist/skill/skill-registry.js +77 -30
- package/dist/skill/skill-registry.js.map +1 -1
- package/dist/skill/types.d.ts +7 -3
- package/dist/skill/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/dist/skill/yaml-parser.d.ts +0 -18
- package/dist/skill/yaml-parser.d.ts.map +0 -1
- package/dist/skill/yaml-parser.js +0 -61
- package/dist/skill/yaml-parser.js.map +0 -1
- package/dist/skill/yaml-schema.d.ts +0 -20
- package/dist/skill/yaml-schema.d.ts.map +0 -1
- package/dist/skill/yaml-schema.js +0 -80
- package/dist/skill/yaml-schema.js.map +0 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/engine/index.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,YAAY,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AACxD,YAAY,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AAE1D,YAAY,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAC1D,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,EAAE,QAAQ,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAC9H,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,EAAE,WAAW,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAC/H,YAAY,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAC5E,YAAY,EAAE,UAAU,EAAE,SAAS,EAAE,UAAU,EAAE,UAAU,EAAE,SAAS,EAAE,WAAW,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AAC7J,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,8BAA8B,CAAC;AACnE,YAAY,EAAE,aAAa,EAAE,MAAM,8BAA8B,CAAC;AAClE,YAAY,EAAE,UAAU,EAAE,qBAAqB,EAAE,MAAM,wBAAwB,CAAC;AAChF,YAAY,EACV,aAAa,EACb,WAAW,EACX,kBAAkB,EAClB,uBAAuB,EACvB,aAAa,EACb,cAAc,EACd,mBAAmB,GACpB,MAAM,4BAA4B,CAAC;AACpC,YAAY,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,0BAA0B,CAAC;AAC9F,YAAY,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,sBAAsB,CAAC;AAC3D,YAAY,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAC3E,YAAY,EAAE,IAAI,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AACrE,YAAY,EACV,OAAO,EACP,aAAa,EACb,eAAe,EACf,eAAe,EACf,WAAW,GACZ,MAAM,qBAAqB,CAAC;AAG7B,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AAC5G,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpD,YAAY,EAAE,MAAM,EAAE,WAAW,EAAE,qBAAqB,EAAE,MAAM,wBAAwB,CAAC;AACzF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,YAAY,EAAE,GAAG,EAAE,YAAY,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAC;AACjF,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAGlD,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAGxF,YAAY,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAG1E,OAAO,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAGhF,YAAY,EACV,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,sBAAsB,GACvB,MAAM,6BAA6B,CAAC;AACrC,OAAO,EAAE,qBAAqB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,6BAA6B,CAAC;AAGnF,YAAY,EACV,WAAW,EACX,cAAc,EACd,WAAW,EACX,aAAa,EACb,cAAc,EACd,WAAW,EACX,WAAW,EACX,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EAAE,mBAAmB,EAAE,MAAM,4BAA4B,CAAC;AACjE,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC;AAGhF,YAAY,EACV,mBAAmB,EACnB,aAAa,EACb,oBAAoB,EACpB,mBAAmB,EACnB,eAAe,EACf,yBAAyB,GAC1B,MAAM,gCAAgC,CAAC;AACxC,OAAO,EAAE,sBAAsB,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AAG5F,YAAY,EAAE,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,YAAY,EAAE,eAAe,EAAE,sBAAsB,EAAE,MAAM,qBAAqB,CAAC;AACvI,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,YAAY,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AACvD,OAAO,EAAE,mBAAmB,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAG7F,YAAY,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAC9D,YAAY,EAAE,WAAW,EAAE,YAAY,EAAE,kBAAkB,EAAE,WAAW,EAAE,MAAM,0BAA0B,CAAC;AAC3G,YAAY,EAAE,QAAQ,EAAE,KAAK,EAAE,WAAW,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AACjF,YAAY,EACV,eAAe,EACf,wBAAwB,EACxB,oBAAoB,EACpB,uBAAuB,EACvB,qBAAqB,EACrB,wBAAwB,EACxB,uBAAuB,EACvB,mBAAmB,GACpB,MAAM,8BAA8B,CAAC;AACtC,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAC;AAGvE,YAAY,EAAE,WAAW,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,oBAAoB,EAAE,MAAM,4BAA4B,CAAC;AAG7H,YAAY,EACV,eAAe,EACf,mBAAmB,EACnB,qBAAqB,EACrB,gBAAgB,EAChB,gBAAgB,EAChB,WAAW,GACZ,MAAM,8BAA8B,CAAC;AAGtC,YAAY,EACV,YAAY,EACZ,aAAa,EACb,aAAa,EACb,WAAW,EACX,cAAc,EACd,UAAU,EACV,aAAa,EACb,iBAAiB,EACjB,gBAAgB,EAChB,iBAAiB,EACjB,iBAAiB,EACjB,mBAAmB,EACnB,iBAAiB,EACjB,oBAAoB,EACpB,qBAAqB,GACtB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EAAE,oBAAoB,EAAE,MAAM,4BAA4B,CAAC;AAGlE,YAAY,EACV,qBAAqB,EACrB,iBAAiB,EACjB,sBAAsB,GACvB,MAAM,iCAAiC,CAAC;AACzC,YAAY,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC/D,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAG/D,YAAY,EACV,YAAY,EACZ,qBAAqB,EACrB,SAAS,EACT,OAAO,EACP,SAAS,EACT,aAAa,EACb,eAAe,EACf,aAAa,EACb,qBAAqB,EACrB,UAAU,EACV,QAAQ,EACR,eAAe,EACf,oBAAoB,EACpB,oBAAoB,EACpB,oBAAoB,GACrB,MAAM,2BAA2B,CAAC;AACnC,OAAO,EAAE,oBAAoB,EAAE,MAAM,2BAA2B,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/engine/index.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,YAAY,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AACxD,YAAY,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AAE1D,YAAY,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAC1D,YAAY,EAAE,WAAW,EAAE,QAAQ,EAAE,SAAS,EAAE,SAAS,EAAE,QAAQ,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAC9H,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,EAAE,WAAW,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAC/H,YAAY,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAC5E,YAAY,EAAE,UAAU,EAAE,SAAS,EAAE,UAAU,EAAE,UAAU,EAAE,SAAS,EAAE,WAAW,EAAE,iBAAiB,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AAC7J,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,8BAA8B,CAAC;AACnE,YAAY,EAAE,aAAa,EAAE,MAAM,8BAA8B,CAAC;AAClE,YAAY,EAAE,UAAU,EAAE,qBAAqB,EAAE,MAAM,wBAAwB,CAAC;AAChF,YAAY,EACV,aAAa,EACb,WAAW,EACX,kBAAkB,EAClB,uBAAuB,EACvB,aAAa,EACb,cAAc,EACd,mBAAmB,GACpB,MAAM,4BAA4B,CAAC;AACpC,YAAY,EACV,kBAAkB,EAClB,oBAAoB,EACpB,iBAAiB,EACjB,yBAAyB,EACzB,gBAAgB,EAChB,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,yBAAyB,CAAC;AACjC,YAAY,EAAE,eAAe,EAAE,eAAe,EAAE,WAAW,EAAE,MAAM,0BAA0B,CAAC;AAC9F,YAAY,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,sBAAsB,CAAC;AAC3D,YAAY,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC;AAC3E,YAAY,EAAE,IAAI,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AACrE,YAAY,EACV,OAAO,EACP,aAAa,EACb,eAAe,EACf,eAAe,EACf,WAAW,GACZ,MAAM,qBAAqB,CAAC;AAG7B,YAAY,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,aAAa,EAAE,mBAAmB,EAAE,MAAM,sBAAsB,CAAC;AAC5G,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AACpD,YAAY,EAAE,MAAM,EAAE,WAAW,EAAE,qBAAqB,EAAE,MAAM,wBAAwB,CAAC;AACzF,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AACxD,YAAY,EAAE,GAAG,EAAE,YAAY,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAC;AACjF,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAGlD,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAGxF,YAAY,EAAE,kBAAkB,EAAE,MAAM,iCAAiC,CAAC;AAG1E,OAAO,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAGhF,YAAY,EACV,aAAa,EACb,iBAAiB,EACjB,qBAAqB,EACrB,sBAAsB,GACvB,MAAM,6BAA6B,CAAC;AACrC,OAAO,EAAE,qBAAqB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,6BAA6B,CAAC;AAGnF,YAAY,EACV,WAAW,EACX,cAAc,EACd,WAAW,EACX,aAAa,EACb,cAAc,EACd,WAAW,EACX,WAAW,EACX,oBAAoB,GACrB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EAAE,mBAAmB,EAAE,MAAM,4BAA4B,CAAC;AACjE,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC;AAGhF,YAAY,EACV,mBAAmB,EACnB,aAAa,EACb,oBAAoB,EACpB,mBAAmB,EACnB,eAAe,EACf,yBAAyB,GAC1B,MAAM,gCAAgC,CAAC;AACxC,OAAO,EAAE,sBAAsB,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AAG5F,YAAY,EAAE,OAAO,EAAE,WAAW,EAAE,cAAc,EAAE,YAAY,EAAE,eAAe,EAAE,sBAAsB,EAAE,MAAM,qBAAqB,CAAC;AACvI,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AACtD,YAAY,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AACvD,OAAO,EAAE,mBAAmB,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAG7F,YAAY,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAC9D,YAAY,EAAE,WAAW,EAAE,YAAY,EAAE,kBAAkB,EAAE,WAAW,EAAE,MAAM,0BAA0B,CAAC;AAC3G,YAAY,EAAE,QAAQ,EAAE,KAAK,EAAE,WAAW,EAAE,OAAO,EAAE,MAAM,qBAAqB,CAAC;AACjF,YAAY,EACV,eAAe,EACf,wBAAwB,EACxB,oBAAoB,EACpB,uBAAuB,EACvB,qBAAqB,EACrB,wBAAwB,EACxB,uBAAuB,EACvB,mBAAmB,GACpB,MAAM,8BAA8B,CAAC;AACtC,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAC;AAGvE,YAAY,EAAE,WAAW,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,oBAAoB,EAAE,MAAM,4BAA4B,CAAC;AAG7H,YAAY,EACV,eAAe,EACf,mBAAmB,EACnB,qBAAqB,EACrB,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,WAAW,GACZ,MAAM,8BAA8B,CAAC;AAGtC,YAAY,EACV,YAAY,EACZ,aAAa,EACb,aAAa,EACb,WAAW,EACX,cAAc,EACd,UAAU,EACV,aAAa,EACb,iBAAiB,EACjB,iBAAiB,EACjB,gBAAgB,EAChB,iBAAiB,EACjB,iBAAiB,EACjB,mBAAmB,EACnB,iBAAiB,EACjB,oBAAoB,EACpB,qBAAqB,GACtB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EAAE,oBAAoB,EAAE,MAAM,4BAA4B,CAAC;AAGlE,YAAY,EACV,qBAAqB,EACrB,iBAAiB,EACjB,sBAAsB,GACvB,MAAM,iCAAiC,CAAC;AACzC,YAAY,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC/D,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAG/D,YAAY,EACV,YAAY,EACZ,qBAAqB,EACrB,SAAS,EACT,OAAO,EACP,SAAS,EACT,aAAa,EACb,eAAe,EACf,aAAa,EACb,qBAAqB,EACrB,UAAU,EACV,QAAQ,EACR,eAAe,EACf,oBAAoB,EACpB,oBAAoB,EACpB,oBAAoB,GACrB,MAAM,2BAA2B,CAAC;AACnC,OAAO,EAAE,oBAAoB,EAAE,MAAM,2BAA2B,CAAC"}
|
package/dist/engine/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/engine/index.ts"],"names":[],"mappings":"AAAA,yDAAyD;AACzD,yDAAyD;AAEzD,0EAA0E;AAC1E,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAExC,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AAKxD,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,EAAE,WAAW,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAE/H,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAE5E,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,8BAA8B,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/engine/index.ts"],"names":[],"mappings":"AAAA,yDAAyD;AACzD,yDAAyD;AAEzD,0EAA0E;AAC1E,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAExC,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAC;AAKxD,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,EAAE,WAAW,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,MAAM,qBAAqB,CAAC;AAE/H,OAAO,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,MAAM,sBAAsB,CAAC;AAE5E,OAAO,EAAE,mBAAmB,EAAE,MAAM,2BAA2B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,8BAA8B,CAAC;AAmCnE,OAAO,EAAE,YAAY,EAAE,MAAM,sBAAsB,CAAC;AAEpD,OAAO,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAExD,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAElD,uDAAuD;AACvD,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAKxF,wDAAwD;AACxD,OAAO,EAAE,iBAAiB,EAAE,gBAAgB,EAAE,MAAM,2BAA2B,CAAC;AAShF,OAAO,EAAE,qBAAqB,EAAE,MAAM,6BAA6B,CAAC;AACpE,OAAO,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,MAAM,6BAA6B,CAAC;AAanF,OAAO,EAAE,mBAAmB,EAAE,MAAM,4BAA4B,CAAC;AACjE,OAAO,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC;AAWhF,OAAO,EAAE,sBAAsB,EAAE,kBAAkB,EAAE,MAAM,gCAAgC,CAAC;AAI5F,OAAO,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AAEtD,OAAO,EAAE,mBAAmB,EAAE,sBAAsB,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAgB7F,OAAO,EAAE,uBAAuB,EAAE,MAAM,8BAA8B,CAAC;AAmCvE,OAAO,EAAE,oBAAoB,EAAE,MAAM,4BAA4B,CAAC;AASlE,OAAO,EAAE,iBAAiB,EAAE,MAAM,4BAA4B,CAAC;AAoB/D,OAAO,EAAE,oBAAoB,EAAE,MAAM,2BAA2B,CAAC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { Experiment } from "./types.js";
|
|
2
|
+
import type { ExperimentRunner } from "./experiment-runner.js";
|
|
3
|
+
export interface ConsistencyItemResult {
|
|
4
|
+
readonly itemId: string;
|
|
5
|
+
readonly passCount: number;
|
|
6
|
+
readonly totalRuns: number;
|
|
7
|
+
readonly allPassed: boolean;
|
|
8
|
+
}
|
|
9
|
+
export interface ConsistencyResult {
|
|
10
|
+
readonly experimentName: string;
|
|
11
|
+
readonly datasetName: string;
|
|
12
|
+
readonly k: number;
|
|
13
|
+
readonly passThreshold: number;
|
|
14
|
+
readonly runs: readonly Experiment[];
|
|
15
|
+
readonly itemResults: readonly ConsistencyItemResult[];
|
|
16
|
+
readonly passAtK: number;
|
|
17
|
+
}
|
|
18
|
+
export interface ConsistencyRunnerConfig {
|
|
19
|
+
readonly runner: ExperimentRunner;
|
|
20
|
+
readonly k: number;
|
|
21
|
+
readonly passThreshold?: number;
|
|
22
|
+
}
|
|
23
|
+
export declare class ConsistencyRunner {
|
|
24
|
+
private readonly config;
|
|
25
|
+
constructor(config: ConsistencyRunnerConfig);
|
|
26
|
+
run(): Promise<ConsistencyResult>;
|
|
27
|
+
}
|
|
28
|
+
//# sourceMappingURL=consistency-runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"consistency-runner.d.ts","sourceRoot":"","sources":["../../src/eval/consistency-runner.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAC7C,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAG/D,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,SAAS,EAAE,OAAO,CAAC;CAC7B;AAED,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,IAAI,EAAE,SAAS,UAAU,EAAE,CAAC;IACrC,QAAQ,CAAC,WAAW,EAAE,SAAS,qBAAqB,EAAE,CAAC;IACvD,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,uBAAuB;IACtC,QAAQ,CAAC,MAAM,EAAE,gBAAgB,CAAC;IAClC,QAAQ,CAAC,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;CACjC;AAED,qBAAa,iBAAiB;IAChB,OAAO,CAAC,QAAQ,CAAC,MAAM;gBAAN,MAAM,EAAE,uBAAuB;IAQtD,GAAG,IAAI,OAAO,CAAC,iBAAiB,CAAC;CAkCxC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
// ConsistencyRunner: implements tau-bench pass^k metric for production readiness evaluation
|
|
2
|
+
import { KilnError } from "../engine/errors.js";
|
|
3
|
+
export class ConsistencyRunner {
|
|
4
|
+
config;
|
|
5
|
+
constructor(config) {
|
|
6
|
+
this.config = config;
|
|
7
|
+
if (config.k < 1) {
|
|
8
|
+
throw new KilnError("EVAL_SCORER_FAILED", "ConsistencyRunner k must be >= 1", {
|
|
9
|
+
context: { k: config.k },
|
|
10
|
+
});
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
async run() {
|
|
14
|
+
const { runner, k, passThreshold = 1.0 } = this.config;
|
|
15
|
+
const runs = [];
|
|
16
|
+
for (let i = 0; i < k; i++) {
|
|
17
|
+
runs.push(await runner.run());
|
|
18
|
+
}
|
|
19
|
+
const firstRun = runs[0];
|
|
20
|
+
const itemIds = firstRun.results.map((r) => r.itemId);
|
|
21
|
+
const itemResults = itemIds.map((itemId) => {
|
|
22
|
+
let passCount = 0;
|
|
23
|
+
for (const run of runs) {
|
|
24
|
+
const result = run.results.find((r) => r.itemId === itemId);
|
|
25
|
+
if (result && result.scores.every((s) => s.score >= passThreshold)) {
|
|
26
|
+
passCount++;
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
return { itemId, passCount, totalRuns: k, allPassed: passCount === k };
|
|
30
|
+
});
|
|
31
|
+
const passAtK = itemIds.length === 0 ? 1.0 : itemResults.filter((r) => r.allPassed).length / itemIds.length;
|
|
32
|
+
return {
|
|
33
|
+
experimentName: firstRun.name,
|
|
34
|
+
datasetName: firstRun.datasetName,
|
|
35
|
+
k,
|
|
36
|
+
passThreshold,
|
|
37
|
+
runs,
|
|
38
|
+
itemResults,
|
|
39
|
+
passAtK,
|
|
40
|
+
};
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
//# sourceMappingURL=consistency-runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"consistency-runner.js","sourceRoot":"","sources":["../../src/eval/consistency-runner.ts"],"names":[],"mappings":"AAAA,4FAA4F;AAI5F,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAyBhD,MAAM,OAAO,iBAAiB;IACC;IAA7B,YAA6B,MAA+B;QAA/B,WAAM,GAAN,MAAM,CAAyB;QAC1D,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YACjB,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,kCAAkC,EAAE;gBAC5E,OAAO,EAAE,EAAE,CAAC,EAAE,MAAM,CAAC,CAAC,EAAE;aACzB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,KAAK,CAAC,GAAG;QACP,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,aAAa,GAAG,GAAG,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC;QACvD,MAAM,IAAI,GAAiB,EAAE,CAAC;QAE9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3B,IAAI,CAAC,IAAI,CAAC,MAAM,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC;QAChC,CAAC;QAED,MAAM,QAAQ,GAAG,IAAI,CAAC,CAAC,CAAE,CAAC;QAC1B,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAEtD,MAAM,WAAW,GAA4B,OAAO,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,EAAE;YAClE,IAAI,SAAS,GAAG,CAAC,CAAC;YAClB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;gBACvB,MAAM,MAAM,GAAG,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC;gBAC5D,IAAI,MAAM,IAAI,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,IAAI,aAAa,CAAC,EAAE,CAAC;oBACnE,SAAS,EAAE,CAAC;gBACd,CAAC;YACH,CAAC;YACD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,EAAE,SAAS,EAAE,SAAS,KAAK,CAAC,EAAE,CAAC;QACzE,CAAC,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,OAAO,CAAC,MAAM,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QAE5G,OAAO;YACL,cAAc,EAAE,QAAQ,CAAC,IAAI;YAC7B,WAAW,EAAE,QAAQ,CAAC,WAAW;YACjC,CAAC;YACD,aAAa;YACb,IAAI;YACJ,WAAW;YACX,OAAO;SACR,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"experiment-runner.d.ts","sourceRoot":"","sources":["../../src/eval/experiment-runner.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAA+B,MAAM,YAAY,CAAC;AAG3F,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;CAC/B;AAED,MAAM,WAAW,sBAAsB;IACrC,QAAQ,CAAC,OAAO,EAAE,SAAS,MAAM,EAAE,CAAC;IACpC,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAC;IAC1B,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,cAAc,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,OAAO,CAAC,oBAAoB,CAAC,CAAC;CAC3E;AAED,qBAAa,gBAAgB;IACf,OAAO,CAAC,QAAQ,CAAC,MAAM;gBAAN,MAAM,EAAE,sBAAsB;IAErD,GAAG,IAAI,OAAO,CAAC,UAAU,CAAC;
|
|
1
|
+
{"version":3,"file":"experiment-runner.d.ts","sourceRoot":"","sources":["../../src/eval/experiment-runner.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAA+B,MAAM,YAAY,CAAC;AAG3F,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,WAAW,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;CAC/B;AAED,MAAM,WAAW,sBAAsB;IACrC,QAAQ,CAAC,OAAO,EAAE,SAAS,MAAM,EAAE,CAAC;IACpC,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAC;IAC1B,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,QAAQ,CAAC,cAAc,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,OAAO,CAAC,oBAAoB,CAAC,CAAC;CAC3E;AAED,qBAAa,gBAAgB;IACf,OAAO,CAAC,QAAQ,CAAC,MAAM;gBAAN,MAAM,EAAE,sBAAsB;IAErD,GAAG,IAAI,OAAO,CAAC,UAAU,CAAC;CAoDjC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"experiment-runner.js","sourceRoot":"","sources":["../../src/eval/experiment-runner.ts"],"names":[],"mappings":"AAAA,kFAAkF;AAGlF,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAiBhD,MAAM,OAAO,gBAAgB;IACE;IAA7B,YAA6B,MAA8B;QAA9B,WAAM,GAAN,MAAM,CAAwB;IAAG,CAAC;IAE/D,KAAK,CAAC,GAAG;QACP,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC3C,MAAM,OAAO,GAAuB,EAAE,CAAC;QAEvC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC7C,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAE/D,MAAM,SAAS,GAAc;gBAC3B,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,SAAS,CAAC,MAAM;gBACxB,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,UAAU,EAAE,SAAS,CAAC,UAAU;gBAChC,OAAO,EAAE,SAAS,CAAC,OAAO;
|
|
1
|
+
{"version":3,"file":"experiment-runner.js","sourceRoot":"","sources":["../../src/eval/experiment-runner.ts"],"names":[],"mappings":"AAAA,kFAAkF;AAGlF,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAiBhD,MAAM,OAAO,gBAAgB;IACE;IAA7B,YAA6B,MAA8B;QAA9B,WAAM,GAAN,MAAM,CAAwB;IAAG,CAAC;IAE/D,KAAK,CAAC,GAAG;QACP,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC3C,MAAM,OAAO,GAAuB,EAAE,CAAC;QAEvC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC7C,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAE/D,MAAM,SAAS,GAAc;gBAC3B,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,SAAS,CAAC,MAAM;gBACxB,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,UAAU,EAAE,SAAS,CAAC,UAAU;gBAChC,OAAO,EAAE,SAAS,CAAC,OAAO;gBAC1B,QAAQ,EAAE,IAAI,CAAC,QAAQ;aACxB,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAC9B,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE;gBAClC,IAAI,CAAC;oBACH,OAAO,MAAM,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBAClC,CAAC;gBAAC,OAAO,GAAG,EAAE,CAAC;oBACb,OAAO;wBACL,IAAI,EAAE,CAAC,CAAC,IAAI;wBACZ,KAAK,EAAE,CAAC;wBACR,SAAS,EAAE,GAAG,YAAY,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;qBAChE,CAAC;gBACJ,CAAC;YACH,CAAC,CAAC,CACH,CAAC;YAEF,OAAO,CAAC,IAAI,CAAC;gBACX,MAAM,EAAE,IAAI,CAAC,EAAE;gBACf,MAAM,EAAE,SAAS,CAAC,MAAM;gBACxB,MAAM;gBACN,UAAU,EAAE,SAAS,CAAC,UAAU;gBAChC,UAAU,EAAE;oBACV,WAAW,EAAE,SAAS,CAAC,WAAW;oBAClC,YAAY,EAAE,SAAS,CAAC,YAAY;iBACrC;aACF,CAAC,CAAC;QACL,CAAC;QAED,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,cAAc;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI;YACrC,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;YAC/C,OAAO;YACP,SAAS;YACT,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACtC,CAAC;IACJ,CAAC;CACF"}
|
package/dist/eval/index.d.ts
CHANGED
|
@@ -5,6 +5,8 @@ export { JsonValidityScorer } from "./scorers/json-validity-scorer.js";
|
|
|
5
5
|
export { LengthScorer } from "./scorers/length-scorer.js";
|
|
6
6
|
export { LatencyScorer } from "./scorers/latency-scorer.js";
|
|
7
7
|
export { CostScorer } from "./scorers/cost-scorer.js";
|
|
8
|
+
export { EffortScorer } from "./scorers/effort-scorer.js";
|
|
9
|
+
export { ResolutionScorer } from "./scorers/resolution-scorer.js";
|
|
8
10
|
export { CompositeScorer } from "./scorers/composite-scorer.js";
|
|
9
11
|
export { FaithfulnessScorer } from "./scorers/faithfulness-scorer.js";
|
|
10
12
|
export { RelevanceScorer } from "./scorers/relevance-scorer.js";
|
|
@@ -12,10 +14,21 @@ export { CoherenceScorer } from "./scorers/coherence-scorer.js";
|
|
|
12
14
|
export { HallucinationScorer } from "./scorers/hallucination-scorer.js";
|
|
13
15
|
export { ToxicityScorer } from "./scorers/toxicity-scorer.js";
|
|
14
16
|
export { CustomPromptScorer } from "./scorers/custom-prompt-scorer.js";
|
|
17
|
+
export { PolicyAdherenceScorer } from "./scorers/policy-adherence-scorer.js";
|
|
18
|
+
export { ContextRelevanceScorer } from "./scorers/context-relevance-scorer.js";
|
|
19
|
+
export { ToolTrajectoryScorer } from "./scorers/tool-trajectory-scorer.js";
|
|
20
|
+
export { ToolCallingAccuracyScorer } from "./scorers/tool-calling-accuracy-scorer.js";
|
|
21
|
+
export { MultiTurnConsistencyScorer } from "./scorers/multi-turn-consistency-scorer.js";
|
|
22
|
+
export { SafetyPreservationScorer } from "./scorers/safety-preservation-scorer.js";
|
|
23
|
+
export { RoutingAccuracyScorer } from "./scorers/routing-accuracy-scorer.js";
|
|
24
|
+
export { HandoffQualityScorer } from "./scorers/handoff-quality-scorer.js";
|
|
25
|
+
export { MilestoneScorer } from "./scorers/milestone-scorer.js";
|
|
15
26
|
export { parseDatasetJsonl } from "./dataset-loader.js";
|
|
16
27
|
export { createScorer } from "./scorer-factory.js";
|
|
17
28
|
export { ExperimentRunner } from "./experiment-runner.js";
|
|
18
29
|
export type { ExperimentRunnerConfig, GenerateOutputResult } from "./experiment-runner.js";
|
|
19
30
|
export { compareExperiments } from "./experiment-comparator.js";
|
|
20
31
|
export type { ComparisonResult, ScorerComparison } from "./experiment-comparator.js";
|
|
32
|
+
export { ConsistencyRunner } from "./consistency-runner.js";
|
|
33
|
+
export type { ConsistencyRunnerConfig, ConsistencyResult, ConsistencyItemResult } from "./consistency-runner.js";
|
|
21
34
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/eval/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAEA,YAAY,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,OAAO,EAAE,oBAAoB,EAAE,gBAAgB,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AACpJ,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,YAAY,EAAE,sBAAsB,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC3F,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAChE,YAAY,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAEA,YAAY,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,OAAO,EAAE,oBAAoB,EAAE,gBAAgB,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AACpJ,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,sBAAsB,EAAE,MAAM,uCAAuC,CAAC;AAC/E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,yBAAyB,EAAE,MAAM,2CAA2C,CAAC;AACtF,OAAO,EAAE,0BAA0B,EAAE,MAAM,4CAA4C,CAAC;AACxF,OAAO,EAAE,wBAAwB,EAAE,MAAM,yCAAyC,CAAC;AACnF,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,YAAY,EAAE,sBAAsB,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC3F,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAChE,YAAY,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC;AACrF,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,YAAY,EAAE,uBAAuB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,yBAAyB,CAAC"}
|
package/dist/eval/index.js
CHANGED
|
@@ -5,6 +5,8 @@ export { JsonValidityScorer } from "./scorers/json-validity-scorer.js";
|
|
|
5
5
|
export { LengthScorer } from "./scorers/length-scorer.js";
|
|
6
6
|
export { LatencyScorer } from "./scorers/latency-scorer.js";
|
|
7
7
|
export { CostScorer } from "./scorers/cost-scorer.js";
|
|
8
|
+
export { EffortScorer } from "./scorers/effort-scorer.js";
|
|
9
|
+
export { ResolutionScorer } from "./scorers/resolution-scorer.js";
|
|
8
10
|
export { CompositeScorer } from "./scorers/composite-scorer.js";
|
|
9
11
|
export { FaithfulnessScorer } from "./scorers/faithfulness-scorer.js";
|
|
10
12
|
export { RelevanceScorer } from "./scorers/relevance-scorer.js";
|
|
@@ -12,8 +14,18 @@ export { CoherenceScorer } from "./scorers/coherence-scorer.js";
|
|
|
12
14
|
export { HallucinationScorer } from "./scorers/hallucination-scorer.js";
|
|
13
15
|
export { ToxicityScorer } from "./scorers/toxicity-scorer.js";
|
|
14
16
|
export { CustomPromptScorer } from "./scorers/custom-prompt-scorer.js";
|
|
17
|
+
export { PolicyAdherenceScorer } from "./scorers/policy-adherence-scorer.js";
|
|
18
|
+
export { ContextRelevanceScorer } from "./scorers/context-relevance-scorer.js";
|
|
19
|
+
export { ToolTrajectoryScorer } from "./scorers/tool-trajectory-scorer.js";
|
|
20
|
+
export { ToolCallingAccuracyScorer } from "./scorers/tool-calling-accuracy-scorer.js";
|
|
21
|
+
export { MultiTurnConsistencyScorer } from "./scorers/multi-turn-consistency-scorer.js";
|
|
22
|
+
export { SafetyPreservationScorer } from "./scorers/safety-preservation-scorer.js";
|
|
23
|
+
export { RoutingAccuracyScorer } from "./scorers/routing-accuracy-scorer.js";
|
|
24
|
+
export { HandoffQualityScorer } from "./scorers/handoff-quality-scorer.js";
|
|
25
|
+
export { MilestoneScorer } from "./scorers/milestone-scorer.js";
|
|
15
26
|
export { parseDatasetJsonl } from "./dataset-loader.js";
|
|
16
27
|
export { createScorer } from "./scorer-factory.js";
|
|
17
28
|
export { ExperimentRunner } from "./experiment-runner.js";
|
|
18
29
|
export { compareExperiments } from "./experiment-comparator.js";
|
|
30
|
+
export { ConsistencyRunner } from "./consistency-runner.js";
|
|
19
31
|
//# sourceMappingURL=index.js.map
|
package/dist/eval/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAG/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAE1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAG/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,sBAAsB,EAAE,MAAM,uCAAuC,CAAC;AAC/E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,yBAAyB,EAAE,MAAM,2CAA2C,CAAC;AACtF,OAAO,EAAE,0BAA0B,EAAE,MAAM,4CAA4C,CAAC;AACxF,OAAO,EAAE,wBAAwB,EAAE,MAAM,yCAAyC,CAAC;AACnF,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAE1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAEhE,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"scorer-factory.d.ts","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACpD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;
|
|
1
|
+
{"version":3,"file":"scorer-factory.d.ts","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACpD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AA2BxE,wBAAgB,YAAY,CAAC,MAAM,EAAE,gBAAgB,EAAE,GAAG,CAAC,EAAE,SAAS,GAAG,MAAM,CA8C9E"}
|
|
@@ -13,6 +13,17 @@ import { CoherenceScorer } from "./scorers/coherence-scorer.js";
|
|
|
13
13
|
import { HallucinationScorer } from "./scorers/hallucination-scorer.js";
|
|
14
14
|
import { ToxicityScorer } from "./scorers/toxicity-scorer.js";
|
|
15
15
|
import { CustomPromptScorer } from "./scorers/custom-prompt-scorer.js";
|
|
16
|
+
import { PolicyAdherenceScorer } from "./scorers/policy-adherence-scorer.js";
|
|
17
|
+
import { ContextRelevanceScorer } from "./scorers/context-relevance-scorer.js";
|
|
18
|
+
import { ToolTrajectoryScorer } from "./scorers/tool-trajectory-scorer.js";
|
|
19
|
+
import { EffortScorer } from "./scorers/effort-scorer.js";
|
|
20
|
+
import { ResolutionScorer } from "./scorers/resolution-scorer.js";
|
|
21
|
+
import { ToolCallingAccuracyScorer } from "./scorers/tool-calling-accuracy-scorer.js";
|
|
22
|
+
import { MultiTurnConsistencyScorer } from "./scorers/multi-turn-consistency-scorer.js";
|
|
23
|
+
import { SafetyPreservationScorer } from "./scorers/safety-preservation-scorer.js";
|
|
24
|
+
import { RoutingAccuracyScorer } from "./scorers/routing-accuracy-scorer.js";
|
|
25
|
+
import { HandoffQualityScorer } from "./scorers/handoff-quality-scorer.js";
|
|
26
|
+
import { MilestoneScorer } from "./scorers/milestone-scorer.js";
|
|
16
27
|
export function createScorer(config, llm) {
|
|
17
28
|
switch (config.type) {
|
|
18
29
|
case "exact-match":
|
|
@@ -27,6 +38,16 @@ export function createScorer(config, llm) {
|
|
|
27
38
|
return new LatencyScorer(config.maxLatencyMs ?? 5000);
|
|
28
39
|
case "cost":
|
|
29
40
|
return new CostScorer(config.maxCostUsd ?? 1.0);
|
|
41
|
+
case "effort":
|
|
42
|
+
return new EffortScorer();
|
|
43
|
+
case "resolution":
|
|
44
|
+
return new ResolutionScorer();
|
|
45
|
+
case "tool-calling-accuracy":
|
|
46
|
+
return new ToolCallingAccuracyScorer();
|
|
47
|
+
case "routing-accuracy":
|
|
48
|
+
return new RoutingAccuracyScorer();
|
|
49
|
+
case "milestone":
|
|
50
|
+
return new MilestoneScorer();
|
|
30
51
|
case "composite": {
|
|
31
52
|
const subScorers = (config.scorers ?? []).map((s) => createScorer(s, llm));
|
|
32
53
|
return new CompositeScorer(config.name, subScorers);
|
|
@@ -37,6 +58,12 @@ export function createScorer(config, llm) {
|
|
|
37
58
|
case "hallucination":
|
|
38
59
|
case "toxicity":
|
|
39
60
|
case "custom-prompt":
|
|
61
|
+
case "policy-adherence":
|
|
62
|
+
case "context-relevance":
|
|
63
|
+
case "tool-trajectory":
|
|
64
|
+
case "multi-turn-consistency":
|
|
65
|
+
case "safety-preservation":
|
|
66
|
+
case "handoff-quality":
|
|
40
67
|
return createLLMScorer(config, llm);
|
|
41
68
|
default:
|
|
42
69
|
throw new KilnError("EVAL_SCORER_FAILED", `Unknown scorer type: ${config.type}`, {
|
|
@@ -58,6 +85,12 @@ function createLLMScorer(config, llm) {
|
|
|
58
85
|
case "hallucination": return new HallucinationScorer(llm);
|
|
59
86
|
case "toxicity": return new ToxicityScorer(llm);
|
|
60
87
|
case "custom-prompt": return new CustomPromptScorer(config.name, config.prompt ?? "", llm);
|
|
88
|
+
case "policy-adherence": return new PolicyAdherenceScorer(llm, config.policies ?? []);
|
|
89
|
+
case "context-relevance": return new ContextRelevanceScorer(llm);
|
|
90
|
+
case "tool-trajectory": return new ToolTrajectoryScorer(llm);
|
|
91
|
+
case "multi-turn-consistency": return new MultiTurnConsistencyScorer(llm);
|
|
92
|
+
case "safety-preservation": return new SafetyPreservationScorer(llm);
|
|
93
|
+
case "handoff-quality": return new HandoffQualityScorer(llm);
|
|
61
94
|
default:
|
|
62
95
|
throw new KilnError("EVAL_SCORER_FAILED", `Unknown LLM scorer type: ${config.type}`, {
|
|
63
96
|
context: { type: config.type, name: config.name },
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"scorer-factory.js","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAAA,2DAA2D;AAI3D,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAChD,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;
|
|
1
|
+
{"version":3,"file":"scorer-factory.js","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAAA,2DAA2D;AAI3D,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAChD,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,sBAAsB,EAAE,MAAM,uCAAuC,CAAC;AAC/E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,yBAAyB,EAAE,MAAM,2CAA2C,CAAC;AACtF,OAAO,EAAE,0BAA0B,EAAE,MAAM,4CAA4C,CAAC;AACxF,OAAO,EAAE,wBAAwB,EAAE,MAAM,yCAAyC,CAAC;AACnF,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAEhE,MAAM,UAAU,YAAY,CAAC,MAAwB,EAAE,GAAe;IACpE,QAAQ,MAAM,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,aAAa;YAChB,OAAO,IAAI,gBAAgB,EAAE,CAAC;QAChC,KAAK,UAAU;YACb,OAAO,IAAI,cAAc,CAAC,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC;QACrD,KAAK,eAAe;YAClB,OAAO,IAAI,kBAAkB,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAC/C,KAAK,QAAQ;YACX,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC;QAC9D,KAAK,SAAS;YACZ,OAAO,IAAI,aAAa,CAAC,MAAM,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QACxD,KAAK,MAAM;YACT,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC,UAAU,IAAI,GAAG,CAAC,CAAC;QAClD,KAAK,QAAQ;YACX,OAAO,IAAI,YAAY,EAAE,CAAC;QAC5B,KAAK,YAAY;YACf,OAAO,IAAI,gBAAgB,EAAE,CAAC;QAChC,KAAK,uBAAuB;YAC1B,OAAO,IAAI,yBAAyB,EAAE,CAAC;QACzC,KAAK,kBAAkB;YACrB,OAAO,IAAI,qBAAqB,EAAE,CAAC;QACrC,KAAK,WAAW;YACd,OAAO,IAAI,eAAe,EAAE,CAAC;QAC/B,KAAK,WAAW,CAAC,CAAC,CAAC;YACjB,MAAM,UAAU,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;YAC3E,OAAO,IAAI,eAAe,CAAC,MAAM,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;QACtD,CAAC;QACD,KAAK,cAAc,CAAC;QACpB,KAAK,WAAW,CAAC;QACjB,KAAK,WAAW,CAAC;QACjB,KAAK,eAAe,CAAC;QACrB,KAAK,UAAU,CAAC;QAChB,KAAK,eAAe,CAAC;QACrB,KAAK,kBAAkB,CAAC;QACxB,KAAK,mBAAmB,CAAC;QACzB,KAAK,iBAAiB,CAAC;QACvB,KAAK,wBAAwB,CAAC;QAC9B,KAAK,qBAAqB,CAAC;QAC3B,KAAK,iBAAiB;YACpB,OAAO,eAAe,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QACtC;YACE,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,wBAAwB,MAAM,CAAC,IAAI,EAAE,EAAE;gBAC/E,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;aAClD,CAAC,CAAC;IACP,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,MAAwB,EAAE,GAAe;IAChE,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,eAAe,MAAM,CAAC,IAAI,iCAAiC,EAAE;YACrG,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;YACjD,UAAU,EAAE,2DAA2D;SACxE,CAAC,CAAC;IACL,CAAC;IACD,QAAQ,MAAM,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,cAAc,CAAC,CAAC,OAAO,IAAI,kBAAkB,CAAC,GAAG,CAAC,CAAC;QACxD,KAAK,WAAW,CAAC,CAAC,OAAO,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;QAClD,KAAK,WAAW,CAAC,CAAC,OAAO,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;QAClD,KAAK,eAAe,CAAC,CAAC,OAAO,IAAI,mBAAmB,CAAC,GAAG,CAAC,CAAC;QAC1D,KAAK,UAAU,CAAC,CAAC,OAAO,IAAI,cAAc,CAAC,GAAG,CAAC,CAAC;QAChD,KAAK,eAAe,CAAC,CAAC,OAAO,IAAI,kBAAkB,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,IAAI,EAAE,EAAE,GAAG,CAAC,CAAC;QAC3F,KAAK,kBAAkB,CAAC,CAAC,OAAO,IAAI,qBAAqB,CAAC,GAAG,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;QACtF,KAAK,mBAAmB,CAAC,CAAC,OAAO,IAAI,sBAAsB,CAAC,GAAG,CAAC,CAAC;QACjE,KAAK,iBAAiB,CAAC,CAAC,OAAO,IAAI,oBAAoB,CAAC,GAAG,CAAC,CAAC;QAC7D,KAAK,wBAAwB,CAAC,CAAC,OAAO,IAAI,0BAA0B,CAAC,GAAG,CAAC,CAAC;QAC1E,KAAK,qBAAqB,CAAC,CAAC,OAAO,IAAI,wBAAwB,CAAC,GAAG,CAAC,CAAC;QACrE,KAAK,iBAAiB,CAAC,CAAC,OAAO,IAAI,oBAAoB,CAAC,GAAG,CAAC,CAAC;QAC7D;YACE,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,4BAA4B,MAAM,CAAC,IAAI,EAAE,EAAE;gBACnF,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;aAClD,CAAC,CAAC;IACP,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
|
|
2
|
+
export declare class ContextRelevanceScorer implements Scorer {
|
|
3
|
+
private readonly llm;
|
|
4
|
+
readonly name = "context-relevance";
|
|
5
|
+
constructor(llm: ScorerLLM);
|
|
6
|
+
score(input: EvalInput): Promise<EvalScore>;
|
|
7
|
+
}
|
|
8
|
+
//# sourceMappingURL=context-relevance-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context-relevance-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/context-relevance-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAG3E,qBAAa,sBAAuB,YAAW,MAAM;IAGvC,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,uBAAuB;gBAEP,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAwBlD"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
// ContextRelevanceScorer: evaluates whether retrieved context chunks are relevant to the input query
|
|
2
|
+
import { parseLLMResponse } from "./parse-llm-response.js";
|
|
3
|
+
export class ContextRelevanceScorer {
|
|
4
|
+
llm;
|
|
5
|
+
name = "context-relevance";
|
|
6
|
+
constructor(llm) {
|
|
7
|
+
this.llm = llm;
|
|
8
|
+
}
|
|
9
|
+
async score(input) {
|
|
10
|
+
const chunks = input.context ?? [];
|
|
11
|
+
if (chunks.length === 0) {
|
|
12
|
+
return { name: this.name, score: 0, reasoning: "No context provided" };
|
|
13
|
+
}
|
|
14
|
+
const contextList = chunks.map((c, i) => `[${i + 1}] ${c}`).join("\n\n");
|
|
15
|
+
const prompt = `Evaluate context relevance. Are the retrieved context chunks relevant to the user's query? This measures retrieval quality, not answer quality.
|
|
16
|
+
|
|
17
|
+
Query: ${input.input}
|
|
18
|
+
|
|
19
|
+
Retrieved context:
|
|
20
|
+
${contextList}
|
|
21
|
+
|
|
22
|
+
Score 1.0 if all chunks are highly relevant to the query. Score 0.0 if none are relevant. Use intermediate scores based on the proportion and degree of relevance.
|
|
23
|
+
|
|
24
|
+
Respond EXACTLY in this format:
|
|
25
|
+
SCORE: <number from 0.0 to 1.0>
|
|
26
|
+
REASONING: <one sentence explanation>`;
|
|
27
|
+
const response = await this.llm.evaluate(prompt);
|
|
28
|
+
const { score, reasoning } = parseLLMResponse(response, this.name);
|
|
29
|
+
return { name: this.name, score, reasoning };
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
//# sourceMappingURL=context-relevance-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context-relevance-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/context-relevance-scorer.ts"],"names":[],"mappings":"AAAA,qGAAqG;AAGrG,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAE3D,MAAM,OAAO,sBAAsB;IAGJ;IAFpB,IAAI,GAAG,mBAAmB,CAAC;IAEpC,YAA6B,GAAc;QAAd,QAAG,GAAH,GAAG,CAAW;IAAG,CAAC;IAE/C,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,IAAI,EAAE,CAAC;QACnC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACxB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,qBAAqB,EAAE,CAAC;QACzE,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACzE,MAAM,MAAM,GAAG;;SAEV,KAAK,CAAC,KAAK;;;EAGlB,WAAW;;;;;;sCAMyB,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"effort-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/effort-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAIhE,qBAAa,YAAa,YAAW,MAAM;IACzC,QAAQ,CAAC,IAAI,YAAY;IAEnB,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CASlD"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
// EffortScorer: rule-based scorer bridging enrichment effort score into eval framework
|
|
2
|
+
import { computeEffortScore } from "../../enrichment/effort-score.js";
|
|
3
|
+
export class EffortScorer {
|
|
4
|
+
name = "effort";
|
|
5
|
+
async score(input) {
|
|
6
|
+
const components = input.metadata?.effortComponents;
|
|
7
|
+
if (!components) {
|
|
8
|
+
return { name: this.name, score: 0, reasoning: "No effort components in metadata" };
|
|
9
|
+
}
|
|
10
|
+
const rawScore = computeEffortScore(components);
|
|
11
|
+
const normalized = rawScore / 10; // 0-10 -> 0-1
|
|
12
|
+
return { name: this.name, score: normalized, reasoning: `Effort score: ${rawScore}/10` };
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
//# sourceMappingURL=effort-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"effort-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/effort-scorer.ts"],"names":[],"mappings":"AAAA,uFAAuF;AAGvF,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AAGtE,MAAM,OAAO,YAAY;IACd,IAAI,GAAG,QAAQ,CAAC;IAEzB,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,UAAU,GAAG,KAAK,CAAC,QAAQ,EAAE,gBAAgD,CAAC;QACpF,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,kCAAkC,EAAE,CAAC;QACtF,CAAC;QACD,MAAM,QAAQ,GAAG,kBAAkB,CAAC,UAAU,CAAC,CAAC;QAChD,MAAM,UAAU,GAAG,QAAQ,GAAG,EAAE,CAAC,CAAC,cAAc;QAChD,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,UAAU,EAAE,SAAS,EAAE,iBAAiB,QAAQ,KAAK,EAAE,CAAC;IAC3F,CAAC;CACF"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
|
|
2
|
+
export declare class HandoffQualityScorer implements Scorer {
|
|
3
|
+
private readonly llm;
|
|
4
|
+
readonly name = "handoff-quality";
|
|
5
|
+
constructor(llm: ScorerLLM);
|
|
6
|
+
score(input: EvalInput): Promise<EvalScore>;
|
|
7
|
+
}
|
|
8
|
+
//# sourceMappingURL=handoff-quality-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"handoff-quality-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/handoff-quality-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AA4B3E,qBAAa,oBAAqB,YAAW,MAAM;IAGrC,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,qBAAqB;gBAEL,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAuClD"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
// HandoffQualityScorer: LLM-as-judge for context preservation across agent handoffs
|
|
2
|
+
import { parseLLMResponse } from "./parse-llm-response.js";
|
|
3
|
+
function extractHandoffs(metadata) {
|
|
4
|
+
if (!metadata)
|
|
5
|
+
return undefined;
|
|
6
|
+
const raw = metadata["handoffHistory"];
|
|
7
|
+
if (!Array.isArray(raw))
|
|
8
|
+
return undefined;
|
|
9
|
+
const events = [];
|
|
10
|
+
for (const entry of raw) {
|
|
11
|
+
if (typeof entry === "object" &&
|
|
12
|
+
entry !== null &&
|
|
13
|
+
typeof entry["fromAgent"] === "string" &&
|
|
14
|
+
typeof entry["toAgent"] === "string") {
|
|
15
|
+
events.push(entry);
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
return events.length > 0 ? events : undefined;
|
|
19
|
+
}
|
|
20
|
+
export class HandoffQualityScorer {
|
|
21
|
+
llm;
|
|
22
|
+
name = "handoff-quality";
|
|
23
|
+
constructor(llm) {
|
|
24
|
+
this.llm = llm;
|
|
25
|
+
}
|
|
26
|
+
async score(input) {
|
|
27
|
+
const handoffs = extractHandoffs(input.metadata);
|
|
28
|
+
if (!handoffs) {
|
|
29
|
+
return { name: this.name, score: 0, reasoning: "No handoff history in metadata" };
|
|
30
|
+
}
|
|
31
|
+
const handoffLog = handoffs
|
|
32
|
+
.map((h, i) => {
|
|
33
|
+
const parts = [`Handoff ${i + 1}: ${h.fromAgent} -> ${h.toAgent}`];
|
|
34
|
+
if (h.reason)
|
|
35
|
+
parts.push(` Reason: ${h.reason}`);
|
|
36
|
+
if (h.summary)
|
|
37
|
+
parts.push(` Summary: ${h.summary}`);
|
|
38
|
+
return parts.join("\n");
|
|
39
|
+
})
|
|
40
|
+
.join("\n\n");
|
|
41
|
+
const prompt = `Evaluate the quality of agent handoffs in this conversation. Was context preserved across each agent switch?
|
|
42
|
+
|
|
43
|
+
User query: ${input.input}
|
|
44
|
+
Final output: ${input.output}
|
|
45
|
+
|
|
46
|
+
Handoff history:
|
|
47
|
+
${handoffLog}
|
|
48
|
+
|
|
49
|
+
Evaluate:
|
|
50
|
+
1. Was the handoff reason appropriate (correct agent for the task)?
|
|
51
|
+
2. Was the context summary accurate and complete?
|
|
52
|
+
3. Did the receiving agent pick up seamlessly without re-asking for information?
|
|
53
|
+
4. Was any critical context lost during the handoff?
|
|
54
|
+
|
|
55
|
+
Score 1.0 for seamless handoffs with full context preservation. Score 0.0 for complete context loss.
|
|
56
|
+
|
|
57
|
+
Respond EXACTLY in this format:
|
|
58
|
+
SCORE: <number from 0.0 to 1.0>
|
|
59
|
+
REASONING: <one sentence explanation>`;
|
|
60
|
+
const response = await this.llm.evaluate(prompt);
|
|
61
|
+
const { score, reasoning } = parseLLMResponse(response, this.name);
|
|
62
|
+
return { name: this.name, score, reasoning };
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
//# sourceMappingURL=handoff-quality-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"handoff-quality-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/handoff-quality-scorer.ts"],"names":[],"mappings":"AAAA,oFAAoF;AAGpF,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAS3D,SAAS,eAAe,CAAC,QAA6C;IACpE,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,gBAAgB,CAAC,CAAC;IACvC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,MAAM,MAAM,GAAmB,EAAE,CAAC;IAClC,KAAK,MAAM,KAAK,IAAI,GAAG,EAAE,CAAC;QACxB,IACE,OAAO,KAAK,KAAK,QAAQ;YACzB,KAAK,KAAK,IAAI;YACd,OAAQ,KAAiC,CAAC,WAAW,CAAC,KAAK,QAAQ;YACnE,OAAQ,KAAiC,CAAC,SAAS,CAAC,KAAK,QAAQ,EACjE,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,KAAqB,CAAC,CAAC;QACrC,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,MAAM,OAAO,oBAAoB;IAGF;IAFpB,IAAI,GAAG,iBAAiB,CAAC;IAElC,YAA6B,GAAc;QAAd,QAAG,GAAH,GAAG,CAAW;IAAG,CAAC;IAE/C,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,QAAQ,GAAG,eAAe,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACjD,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,gCAAgC,EAAE,CAAC;QACpF,CAAC;QAED,MAAM,UAAU,GAAG,QAAQ;aACxB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YACZ,MAAM,KAAK,GAAG,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,SAAS,OAAO,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;YACnE,IAAI,CAAC,CAAC,MAAM;gBAAE,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;YAClD,IAAI,CAAC,CAAC,OAAO;gBAAE,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;YACrD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC;aACD,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,MAAM,GAAG;;cAEL,KAAK,CAAC,KAAK;gBACT,KAAK,CAAC,MAAM;;;EAG1B,UAAU;;;;;;;;;;;;sCAY0B,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"milestone-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/milestone-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAyBhE,qBAAa,eAAgB,YAAW,MAAM;IAC5C,QAAQ,CAAC,IAAI,eAAe;IAEtB,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAelD"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
// MilestoneScorer: rule-based scorer tracking intermediate checkpoint achievement
|
|
2
|
+
function extractMilestones(metadata) {
|
|
3
|
+
if (!metadata)
|
|
4
|
+
return undefined;
|
|
5
|
+
const raw = metadata["milestones"];
|
|
6
|
+
if (!Array.isArray(raw))
|
|
7
|
+
return undefined;
|
|
8
|
+
const milestones = [];
|
|
9
|
+
for (const entry of raw) {
|
|
10
|
+
if (typeof entry === "object" &&
|
|
11
|
+
entry !== null &&
|
|
12
|
+
typeof entry["name"] === "string" &&
|
|
13
|
+
typeof entry["completed"] === "boolean") {
|
|
14
|
+
milestones.push(entry);
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
return milestones.length > 0 ? milestones : undefined;
|
|
18
|
+
}
|
|
19
|
+
export class MilestoneScorer {
|
|
20
|
+
name = "milestone";
|
|
21
|
+
async score(input) {
|
|
22
|
+
const milestones = extractMilestones(input.metadata);
|
|
23
|
+
if (!milestones) {
|
|
24
|
+
return { name: this.name, score: 0, reasoning: "No milestones in metadata" };
|
|
25
|
+
}
|
|
26
|
+
const completed = milestones.filter((m) => m.completed);
|
|
27
|
+
const score = completed.length / milestones.length;
|
|
28
|
+
const missed = milestones.filter((m) => !m.completed).map((m) => m.name);
|
|
29
|
+
const parts = [`${completed.length}/${milestones.length} milestones completed`];
|
|
30
|
+
if (missed.length > 0)
|
|
31
|
+
parts.push(`missed: ${missed.join(", ")}`);
|
|
32
|
+
return { name: this.name, score: Math.round(score * 100) / 100, reasoning: parts.join("; ") };
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
//# sourceMappingURL=milestone-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"milestone-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/milestone-scorer.ts"],"names":[],"mappings":"AAAA,kFAAkF;AASlF,SAAS,iBAAiB,CAAC,QAA6C;IACtE,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC;IACnC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,MAAM,UAAU,GAAgB,EAAE,CAAC;IACnC,KAAK,MAAM,KAAK,IAAI,GAAG,EAAE,CAAC;QACxB,IACE,OAAO,KAAK,KAAK,QAAQ;YACzB,KAAK,KAAK,IAAI;YACd,OAAQ,KAAiC,CAAC,MAAM,CAAC,KAAK,QAAQ;YAC9D,OAAQ,KAAiC,CAAC,WAAW,CAAC,KAAK,SAAS,EACpE,CAAC;YACD,UAAU,CAAC,IAAI,CAAC,KAAkB,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IACD,OAAO,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CAAC;AACxD,CAAC;AAED,MAAM,OAAO,eAAe;IACjB,IAAI,GAAG,WAAW,CAAC;IAE5B,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,UAAU,GAAG,iBAAiB,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACrD,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,2BAA2B,EAAE,CAAC;QAC/E,CAAC;QAED,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACxD,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC;QACnD,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAEzE,MAAM,KAAK,GAAG,CAAC,GAAG,SAAS,CAAC,MAAM,IAAI,UAAU,CAAC,MAAM,uBAAuB,CAAC,CAAC;QAChF,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;YAAE,KAAK,CAAC,IAAI,CAAC,WAAW,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAElE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,GAAG,EAAE,SAAS,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;IAChG,CAAC;CACF"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
|
|
2
|
+
export declare class MultiTurnConsistencyScorer implements Scorer {
|
|
3
|
+
private readonly llm;
|
|
4
|
+
readonly name = "multi-turn-consistency";
|
|
5
|
+
constructor(llm: ScorerLLM);
|
|
6
|
+
score(input: EvalInput): Promise<EvalScore>;
|
|
7
|
+
}
|
|
8
|
+
//# sourceMappingURL=multi-turn-consistency-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"multi-turn-consistency-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/multi-turn-consistency-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AA0B3E,qBAAa,0BAA2B,YAAW,MAAM;IAG3C,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,4BAA4B;gBAEZ,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CA+BlD"}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
// MultiTurnConsistencyScorer: LLM-as-judge for context retention across conversation turns
|
|
2
|
+
import { parseLLMResponse } from "./parse-llm-response.js";
|
|
3
|
+
function extractConversationHistory(metadata) {
|
|
4
|
+
if (!metadata)
|
|
5
|
+
return undefined;
|
|
6
|
+
const raw = metadata["conversationHistory"];
|
|
7
|
+
if (!Array.isArray(raw))
|
|
8
|
+
return undefined;
|
|
9
|
+
const turns = [];
|
|
10
|
+
for (const entry of raw) {
|
|
11
|
+
if (typeof entry === "object" &&
|
|
12
|
+
entry !== null &&
|
|
13
|
+
typeof entry["role"] === "string" &&
|
|
14
|
+
typeof entry["content"] === "string") {
|
|
15
|
+
turns.push(entry);
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
return turns.length >= 2 ? turns : undefined;
|
|
19
|
+
}
|
|
20
|
+
export class MultiTurnConsistencyScorer {
|
|
21
|
+
llm;
|
|
22
|
+
name = "multi-turn-consistency";
|
|
23
|
+
constructor(llm) {
|
|
24
|
+
this.llm = llm;
|
|
25
|
+
}
|
|
26
|
+
async score(input) {
|
|
27
|
+
const history = extractConversationHistory(input.metadata);
|
|
28
|
+
if (!history) {
|
|
29
|
+
return { name: this.name, score: 0, reasoning: "No conversation history in metadata (need >= 2 turns)" };
|
|
30
|
+
}
|
|
31
|
+
const transcript = history.map((t) => `[${t.role}]: ${t.content}`).join("\n");
|
|
32
|
+
const prompt = `Evaluate context retention across this multi-turn conversation. Did the assistant maintain awareness of previously stated facts, requests, and context throughout the conversation?
|
|
33
|
+
|
|
34
|
+
Conversation:
|
|
35
|
+
${transcript}
|
|
36
|
+
|
|
37
|
+
Final output: ${input.output}
|
|
38
|
+
|
|
39
|
+
Evaluate:
|
|
40
|
+
1. Does the assistant contradict earlier statements?
|
|
41
|
+
2. Does it forget previously provided information?
|
|
42
|
+
3. Does it ask questions already answered?
|
|
43
|
+
4. Does it maintain a coherent understanding of the user's evolving needs?
|
|
44
|
+
|
|
45
|
+
Score 1.0 for perfect context retention. Score 0.0 for complete context loss.
|
|
46
|
+
|
|
47
|
+
Respond EXACTLY in this format:
|
|
48
|
+
SCORE: <number from 0.0 to 1.0>
|
|
49
|
+
REASONING: <one sentence explanation>`;
|
|
50
|
+
const response = await this.llm.evaluate(prompt);
|
|
51
|
+
const { score, reasoning } = parseLLMResponse(response, this.name);
|
|
52
|
+
return { name: this.name, score, reasoning };
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
//# sourceMappingURL=multi-turn-consistency-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"multi-turn-consistency-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/multi-turn-consistency-scorer.ts"],"names":[],"mappings":"AAAA,2FAA2F;AAG3F,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAO3D,SAAS,0BAA0B,CAAC,QAA6C;IAC/E,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,qBAAqB,CAAC,CAAC;IAC5C,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,MAAM,KAAK,GAAuB,EAAE,CAAC;IACrC,KAAK,MAAM,KAAK,IAAI,GAAG,EAAE,CAAC;QACxB,IACE,OAAO,KAAK,KAAK,QAAQ;YACzB,KAAK,KAAK,IAAI;YACd,OAAQ,KAAiC,CAAC,MAAM,CAAC,KAAK,QAAQ;YAC9D,OAAQ,KAAiC,CAAC,SAAS,CAAC,KAAK,QAAQ,EACjE,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,KAAyB,CAAC,CAAC;QACxC,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC;AAC/C,CAAC;AAED,MAAM,OAAO,0BAA0B;IAGR;IAFpB,IAAI,GAAG,wBAAwB,CAAC;IAEzC,YAA6B,GAAc;QAAd,QAAG,GAAH,GAAG,CAAW;IAAG,CAAC;IAE/C,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,OAAO,GAAG,0BAA0B,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QAC3D,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,uDAAuD,EAAE,CAAC;QAC3G,CAAC;QAED,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE9E,MAAM,MAAM,GAAG;;;EAGjB,UAAU;;gBAEI,KAAK,CAAC,MAAM;;;;;;;;;;;;sCAYU,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
|