@kilnai/core 0.9.2 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/engine/domain/eval-config.d.ts +2 -1
- package/dist/engine/domain/eval-config.d.ts.map +1 -1
- package/dist/engine/domain/eval-config.js +9 -0
- package/dist/engine/domain/eval-config.js.map +1 -1
- package/dist/engine/domain/knowledge-source.d.ts +5 -1
- package/dist/engine/domain/knowledge-source.d.ts.map +1 -1
- package/dist/engine/error-catalog.js +2 -2
- package/dist/engine/error-catalog.js.map +1 -1
- package/dist/engine/errors.d.ts +1 -1
- package/dist/engine/errors.d.ts.map +1 -1
- package/dist/engine/gateway/conversation-event.d.ts +9 -1
- package/dist/engine/gateway/conversation-event.d.ts.map +1 -1
- package/dist/engine/gateway/tenant-config.d.ts +30 -0
- package/dist/engine/gateway/tenant-config.d.ts.map +1 -1
- package/dist/engine/gateway/tenant-config.js +62 -0
- package/dist/engine/gateway/tenant-config.js.map +1 -1
- package/dist/engine/index.d.ts +1 -1
- package/dist/engine/index.d.ts.map +1 -1
- package/dist/engine/index.js.map +1 -1
- package/dist/eval/consistency-runner.d.ts +28 -0
- package/dist/eval/consistency-runner.d.ts.map +1 -0
- package/dist/eval/consistency-runner.js +43 -0
- package/dist/eval/consistency-runner.js.map +1 -0
- package/dist/eval/experiment-runner.d.ts.map +1 -1
- package/dist/eval/experiment-runner.js +1 -0
- package/dist/eval/experiment-runner.js.map +1 -1
- package/dist/eval/index.d.ts +13 -0
- package/dist/eval/index.d.ts.map +1 -1
- package/dist/eval/index.js +12 -0
- package/dist/eval/index.js.map +1 -1
- package/dist/eval/scorer-factory.d.ts.map +1 -1
- package/dist/eval/scorer-factory.js +33 -0
- package/dist/eval/scorer-factory.js.map +1 -1
- package/dist/eval/scorers/context-relevance-scorer.d.ts +8 -0
- package/dist/eval/scorers/context-relevance-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/context-relevance-scorer.js +32 -0
- package/dist/eval/scorers/context-relevance-scorer.js.map +1 -0
- package/dist/eval/scorers/effort-scorer.d.ts +6 -0
- package/dist/eval/scorers/effort-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/effort-scorer.js +15 -0
- package/dist/eval/scorers/effort-scorer.js.map +1 -0
- package/dist/eval/scorers/handoff-quality-scorer.d.ts +8 -0
- package/dist/eval/scorers/handoff-quality-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/handoff-quality-scorer.js +65 -0
- package/dist/eval/scorers/handoff-quality-scorer.js.map +1 -0
- package/dist/eval/scorers/milestone-scorer.d.ts +6 -0
- package/dist/eval/scorers/milestone-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/milestone-scorer.js +35 -0
- package/dist/eval/scorers/milestone-scorer.js.map +1 -0
- package/dist/eval/scorers/multi-turn-consistency-scorer.d.ts +8 -0
- package/dist/eval/scorers/multi-turn-consistency-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/multi-turn-consistency-scorer.js +55 -0
- package/dist/eval/scorers/multi-turn-consistency-scorer.js.map +1 -0
- package/dist/eval/scorers/policy-adherence-scorer.d.ts +9 -0
- package/dist/eval/scorers/policy-adherence-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/policy-adherence-scorer.js +34 -0
- package/dist/eval/scorers/policy-adherence-scorer.js.map +1 -0
- package/dist/eval/scorers/resolution-scorer.d.ts +6 -0
- package/dist/eval/scorers/resolution-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/resolution-scorer.js +25 -0
- package/dist/eval/scorers/resolution-scorer.js.map +1 -0
- package/dist/eval/scorers/routing-accuracy-scorer.d.ts +6 -0
- package/dist/eval/scorers/routing-accuracy-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/routing-accuracy-scorer.js +19 -0
- package/dist/eval/scorers/routing-accuracy-scorer.js.map +1 -0
- package/dist/eval/scorers/safety-preservation-scorer.d.ts +9 -0
- package/dist/eval/scorers/safety-preservation-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/safety-preservation-scorer.js +54 -0
- package/dist/eval/scorers/safety-preservation-scorer.js.map +1 -0
- package/dist/eval/scorers/tool-calling-accuracy-scorer.d.ts +6 -0
- package/dist/eval/scorers/tool-calling-accuracy-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/tool-calling-accuracy-scorer.js +81 -0
- package/dist/eval/scorers/tool-calling-accuracy-scorer.js.map +1 -0
- package/dist/eval/scorers/tool-trajectory-scorer.d.ts +8 -0
- package/dist/eval/scorers/tool-trajectory-scorer.d.ts.map +1 -0
- package/dist/eval/scorers/tool-trajectory-scorer.js +51 -0
- package/dist/eval/scorers/tool-trajectory-scorer.js.map +1 -0
- package/dist/eval/types.d.ts +1 -0
- package/dist/eval/types.d.ts.map +1 -1
- package/dist/events/index.d.ts +11 -1
- package/dist/events/index.d.ts.map +1 -1
- package/dist/events/index.js +1 -0
- package/dist/events/index.js.map +1 -1
- package/dist/index.d.ts +2 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js.map +1 -1
- package/dist/knowledge/infrastructure/composite-extractor.d.ts +2 -2
- package/dist/knowledge/infrastructure/composite-extractor.d.ts.map +1 -1
- package/dist/knowledge/infrastructure/composite-extractor.js +2 -2
- package/dist/knowledge/infrastructure/composite-extractor.js.map +1 -1
- package/dist/knowledge/infrastructure/file-extractor.d.ts +2 -2
- package/dist/knowledge/infrastructure/file-extractor.d.ts.map +1 -1
- package/dist/knowledge/infrastructure/file-extractor.js +1 -1
- package/dist/knowledge/infrastructure/file-extractor.js.map +1 -1
- package/dist/knowledge/infrastructure/pdf-extractor.d.ts +2 -2
- package/dist/knowledge/infrastructure/pdf-extractor.d.ts.map +1 -1
- package/dist/knowledge/infrastructure/pdf-extractor.js +4 -2
- package/dist/knowledge/infrastructure/pdf-extractor.js.map +1 -1
- package/dist/knowledge/infrastructure/url-extractor.d.ts +2 -2
- package/dist/knowledge/infrastructure/url-extractor.d.ts.map +1 -1
- package/dist/knowledge/infrastructure/url-extractor.js +5 -3
- package/dist/knowledge/infrastructure/url-extractor.js.map +1 -1
- package/dist/knowledge/source-manager.d.ts +2 -0
- package/dist/knowledge/source-manager.d.ts.map +1 -1
- package/dist/knowledge/source-manager.js +54 -1
- package/dist/knowledge/source-manager.js.map +1 -1
- package/dist/observability/span-mapper.d.ts.map +1 -1
- package/dist/observability/span-mapper.js +15 -0
- package/dist/observability/span-mapper.js.map +1 -1
- package/dist/package/yaml-parser.d.ts.map +1 -1
- package/dist/package/yaml-parser.js +1 -0
- package/dist/package/yaml-parser.js.map +1 -1
- package/dist/skill/index.d.ts +2 -4
- package/dist/skill/index.d.ts.map +1 -1
- package/dist/skill/index.js +1 -2
- package/dist/skill/index.js.map +1 -1
- package/dist/skill/md-parser.d.ts +21 -0
- package/dist/skill/md-parser.d.ts.map +1 -0
- package/dist/skill/md-parser.js +168 -0
- package/dist/skill/md-parser.js.map +1 -0
- package/dist/skill/skill-registry.d.ts +16 -8
- package/dist/skill/skill-registry.d.ts.map +1 -1
- package/dist/skill/skill-registry.js +77 -30
- package/dist/skill/skill-registry.js.map +1 -1
- package/dist/skill/types.d.ts +7 -3
- package/dist/skill/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/dist/skill/yaml-parser.d.ts +0 -18
- package/dist/skill/yaml-parser.d.ts.map +0 -1
- package/dist/skill/yaml-parser.js +0 -61
- package/dist/skill/yaml-parser.js.map +0 -1
- package/dist/skill/yaml-schema.d.ts +0 -20
- package/dist/skill/yaml-schema.d.ts.map +0 -1
- package/dist/skill/yaml-schema.js +0 -80
- package/dist/skill/yaml-schema.js.map +0 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"experiment-runner.js","sourceRoot":"","sources":["../../src/eval/experiment-runner.ts"],"names":[],"mappings":"AAAA,kFAAkF;AAGlF,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAiBhD,MAAM,OAAO,gBAAgB;IACE;IAA7B,YAA6B,MAA8B;QAA9B,WAAM,GAAN,MAAM,CAAwB;IAAG,CAAC;IAE/D,KAAK,CAAC,GAAG;QACP,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC3C,MAAM,OAAO,GAAuB,EAAE,CAAC;QAEvC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC7C,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAE/D,MAAM,SAAS,GAAc;gBAC3B,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,SAAS,CAAC,MAAM;gBACxB,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,UAAU,EAAE,SAAS,CAAC,UAAU;gBAChC,OAAO,EAAE,SAAS,CAAC,OAAO;
|
|
1
|
+
{"version":3,"file":"experiment-runner.js","sourceRoot":"","sources":["../../src/eval/experiment-runner.ts"],"names":[],"mappings":"AAAA,kFAAkF;AAGlF,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAiBhD,MAAM,OAAO,gBAAgB;IACE;IAA7B,YAA6B,MAA8B;QAA9B,WAAM,GAAN,MAAM,CAAwB;IAAG,CAAC;IAE/D,KAAK,CAAC,GAAG;QACP,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAC3C,MAAM,OAAO,GAAuB,EAAE,CAAC;QAEvC,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC7C,MAAM,SAAS,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAE/D,MAAM,SAAS,GAAc;gBAC3B,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,MAAM,EAAE,SAAS,CAAC,MAAM;gBACxB,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,OAAO,EAAE,IAAI,CAAC,OAAO;gBACrB,UAAU,EAAE,SAAS,CAAC,UAAU;gBAChC,OAAO,EAAE,SAAS,CAAC,OAAO;gBAC1B,QAAQ,EAAE,IAAI,CAAC,QAAQ;aACxB,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAC9B,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE;gBAClC,IAAI,CAAC;oBACH,OAAO,MAAM,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;gBAClC,CAAC;gBAAC,OAAO,GAAG,EAAE,CAAC;oBACb,OAAO;wBACL,IAAI,EAAE,CAAC,CAAC,IAAI;wBACZ,KAAK,EAAE,CAAC;wBACR,SAAS,EAAE,GAAG,YAAY,SAAS,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;qBAChE,CAAC;gBACJ,CAAC;YACH,CAAC,CAAC,CACH,CAAC;YAEF,OAAO,CAAC,IAAI,CAAC;gBACX,MAAM,EAAE,IAAI,CAAC,EAAE;gBACf,MAAM,EAAE,SAAS,CAAC,MAAM;gBACxB,MAAM;gBACN,UAAU,EAAE,SAAS,CAAC,UAAU;gBAChC,UAAU,EAAE;oBACV,WAAW,EAAE,SAAS,CAAC,WAAW;oBAClC,YAAY,EAAE,SAAS,CAAC,YAAY;iBACrC;aACF,CAAC,CAAC;QACL,CAAC;QAED,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,cAAc;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,IAAI;YACrC,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;YAC/C,OAAO;YACP,SAAS;YACT,WAAW,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;SACtC,CAAC;IACJ,CAAC;CACF"}
|
package/dist/eval/index.d.ts
CHANGED
|
@@ -5,6 +5,8 @@ export { JsonValidityScorer } from "./scorers/json-validity-scorer.js";
|
|
|
5
5
|
export { LengthScorer } from "./scorers/length-scorer.js";
|
|
6
6
|
export { LatencyScorer } from "./scorers/latency-scorer.js";
|
|
7
7
|
export { CostScorer } from "./scorers/cost-scorer.js";
|
|
8
|
+
export { EffortScorer } from "./scorers/effort-scorer.js";
|
|
9
|
+
export { ResolutionScorer } from "./scorers/resolution-scorer.js";
|
|
8
10
|
export { CompositeScorer } from "./scorers/composite-scorer.js";
|
|
9
11
|
export { FaithfulnessScorer } from "./scorers/faithfulness-scorer.js";
|
|
10
12
|
export { RelevanceScorer } from "./scorers/relevance-scorer.js";
|
|
@@ -12,10 +14,21 @@ export { CoherenceScorer } from "./scorers/coherence-scorer.js";
|
|
|
12
14
|
export { HallucinationScorer } from "./scorers/hallucination-scorer.js";
|
|
13
15
|
export { ToxicityScorer } from "./scorers/toxicity-scorer.js";
|
|
14
16
|
export { CustomPromptScorer } from "./scorers/custom-prompt-scorer.js";
|
|
17
|
+
export { PolicyAdherenceScorer } from "./scorers/policy-adherence-scorer.js";
|
|
18
|
+
export { ContextRelevanceScorer } from "./scorers/context-relevance-scorer.js";
|
|
19
|
+
export { ToolTrajectoryScorer } from "./scorers/tool-trajectory-scorer.js";
|
|
20
|
+
export { ToolCallingAccuracyScorer } from "./scorers/tool-calling-accuracy-scorer.js";
|
|
21
|
+
export { MultiTurnConsistencyScorer } from "./scorers/multi-turn-consistency-scorer.js";
|
|
22
|
+
export { SafetyPreservationScorer } from "./scorers/safety-preservation-scorer.js";
|
|
23
|
+
export { RoutingAccuracyScorer } from "./scorers/routing-accuracy-scorer.js";
|
|
24
|
+
export { HandoffQualityScorer } from "./scorers/handoff-quality-scorer.js";
|
|
25
|
+
export { MilestoneScorer } from "./scorers/milestone-scorer.js";
|
|
15
26
|
export { parseDatasetJsonl } from "./dataset-loader.js";
|
|
16
27
|
export { createScorer } from "./scorer-factory.js";
|
|
17
28
|
export { ExperimentRunner } from "./experiment-runner.js";
|
|
18
29
|
export type { ExperimentRunnerConfig, GenerateOutputResult } from "./experiment-runner.js";
|
|
19
30
|
export { compareExperiments } from "./experiment-comparator.js";
|
|
20
31
|
export type { ComparisonResult, ScorerComparison } from "./experiment-comparator.js";
|
|
32
|
+
export { ConsistencyRunner } from "./consistency-runner.js";
|
|
33
|
+
export type { ConsistencyRunnerConfig, ConsistencyResult, ConsistencyItemResult } from "./consistency-runner.js";
|
|
21
34
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/eval/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAEA,YAAY,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,OAAO,EAAE,oBAAoB,EAAE,gBAAgB,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AACpJ,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,YAAY,EAAE,sBAAsB,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC3F,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAChE,YAAY,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAEA,YAAY,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,OAAO,EAAE,oBAAoB,EAAE,gBAAgB,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AACpJ,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,sBAAsB,EAAE,MAAM,uCAAuC,CAAC;AAC/E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,yBAAyB,EAAE,MAAM,2CAA2C,CAAC;AACtF,OAAO,EAAE,0BAA0B,EAAE,MAAM,4CAA4C,CAAC;AACxF,OAAO,EAAE,wBAAwB,EAAE,MAAM,yCAAyC,CAAC;AACnF,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAC1D,YAAY,EAAE,sBAAsB,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AAC3F,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAChE,YAAY,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC;AACrF,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC;AAC5D,YAAY,EAAE,uBAAuB,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,MAAM,yBAAyB,CAAC"}
|
package/dist/eval/index.js
CHANGED
|
@@ -5,6 +5,8 @@ export { JsonValidityScorer } from "./scorers/json-validity-scorer.js";
|
|
|
5
5
|
export { LengthScorer } from "./scorers/length-scorer.js";
|
|
6
6
|
export { LatencyScorer } from "./scorers/latency-scorer.js";
|
|
7
7
|
export { CostScorer } from "./scorers/cost-scorer.js";
|
|
8
|
+
export { EffortScorer } from "./scorers/effort-scorer.js";
|
|
9
|
+
export { ResolutionScorer } from "./scorers/resolution-scorer.js";
|
|
8
10
|
export { CompositeScorer } from "./scorers/composite-scorer.js";
|
|
9
11
|
export { FaithfulnessScorer } from "./scorers/faithfulness-scorer.js";
|
|
10
12
|
export { RelevanceScorer } from "./scorers/relevance-scorer.js";
|
|
@@ -12,8 +14,18 @@ export { CoherenceScorer } from "./scorers/coherence-scorer.js";
|
|
|
12
14
|
export { HallucinationScorer } from "./scorers/hallucination-scorer.js";
|
|
13
15
|
export { ToxicityScorer } from "./scorers/toxicity-scorer.js";
|
|
14
16
|
export { CustomPromptScorer } from "./scorers/custom-prompt-scorer.js";
|
|
17
|
+
export { PolicyAdherenceScorer } from "./scorers/policy-adherence-scorer.js";
|
|
18
|
+
export { ContextRelevanceScorer } from "./scorers/context-relevance-scorer.js";
|
|
19
|
+
export { ToolTrajectoryScorer } from "./scorers/tool-trajectory-scorer.js";
|
|
20
|
+
export { ToolCallingAccuracyScorer } from "./scorers/tool-calling-accuracy-scorer.js";
|
|
21
|
+
export { MultiTurnConsistencyScorer } from "./scorers/multi-turn-consistency-scorer.js";
|
|
22
|
+
export { SafetyPreservationScorer } from "./scorers/safety-preservation-scorer.js";
|
|
23
|
+
export { RoutingAccuracyScorer } from "./scorers/routing-accuracy-scorer.js";
|
|
24
|
+
export { HandoffQualityScorer } from "./scorers/handoff-quality-scorer.js";
|
|
25
|
+
export { MilestoneScorer } from "./scorers/milestone-scorer.js";
|
|
15
26
|
export { parseDatasetJsonl } from "./dataset-loader.js";
|
|
16
27
|
export { createScorer } from "./scorer-factory.js";
|
|
17
28
|
export { ExperimentRunner } from "./experiment-runner.js";
|
|
18
29
|
export { compareExperiments } from "./experiment-comparator.js";
|
|
30
|
+
export { ConsistencyRunner } from "./consistency-runner.js";
|
|
19
31
|
//# sourceMappingURL=index.js.map
|
package/dist/eval/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAG/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAE1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/eval/index.ts"],"names":[],"mappings":"AAAA,+CAA+C;AAG/C,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,sBAAsB,EAAE,MAAM,uCAAuC,CAAC;AAC/E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,yBAAyB,EAAE,MAAM,2CAA2C,CAAC;AACtF,OAAO,EAAE,0BAA0B,EAAE,MAAM,4CAA4C,CAAC;AACxF,OAAO,EAAE,wBAAwB,EAAE,MAAM,yCAAyC,CAAC;AACnF,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AACxD,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AAE1D,OAAO,EAAE,kBAAkB,EAAE,MAAM,4BAA4B,CAAC;AAEhE,OAAO,EAAE,iBAAiB,EAAE,MAAM,yBAAyB,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"scorer-factory.d.ts","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACpD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;
|
|
1
|
+
{"version":3,"file":"scorer-factory.d.ts","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AACpD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AA2BxE,wBAAgB,YAAY,CAAC,MAAM,EAAE,gBAAgB,EAAE,GAAG,CAAC,EAAE,SAAS,GAAG,MAAM,CA8C9E"}
|
|
@@ -13,6 +13,17 @@ import { CoherenceScorer } from "./scorers/coherence-scorer.js";
|
|
|
13
13
|
import { HallucinationScorer } from "./scorers/hallucination-scorer.js";
|
|
14
14
|
import { ToxicityScorer } from "./scorers/toxicity-scorer.js";
|
|
15
15
|
import { CustomPromptScorer } from "./scorers/custom-prompt-scorer.js";
|
|
16
|
+
import { PolicyAdherenceScorer } from "./scorers/policy-adherence-scorer.js";
|
|
17
|
+
import { ContextRelevanceScorer } from "./scorers/context-relevance-scorer.js";
|
|
18
|
+
import { ToolTrajectoryScorer } from "./scorers/tool-trajectory-scorer.js";
|
|
19
|
+
import { EffortScorer } from "./scorers/effort-scorer.js";
|
|
20
|
+
import { ResolutionScorer } from "./scorers/resolution-scorer.js";
|
|
21
|
+
import { ToolCallingAccuracyScorer } from "./scorers/tool-calling-accuracy-scorer.js";
|
|
22
|
+
import { MultiTurnConsistencyScorer } from "./scorers/multi-turn-consistency-scorer.js";
|
|
23
|
+
import { SafetyPreservationScorer } from "./scorers/safety-preservation-scorer.js";
|
|
24
|
+
import { RoutingAccuracyScorer } from "./scorers/routing-accuracy-scorer.js";
|
|
25
|
+
import { HandoffQualityScorer } from "./scorers/handoff-quality-scorer.js";
|
|
26
|
+
import { MilestoneScorer } from "./scorers/milestone-scorer.js";
|
|
16
27
|
export function createScorer(config, llm) {
|
|
17
28
|
switch (config.type) {
|
|
18
29
|
case "exact-match":
|
|
@@ -27,6 +38,16 @@ export function createScorer(config, llm) {
|
|
|
27
38
|
return new LatencyScorer(config.maxLatencyMs ?? 5000);
|
|
28
39
|
case "cost":
|
|
29
40
|
return new CostScorer(config.maxCostUsd ?? 1.0);
|
|
41
|
+
case "effort":
|
|
42
|
+
return new EffortScorer();
|
|
43
|
+
case "resolution":
|
|
44
|
+
return new ResolutionScorer();
|
|
45
|
+
case "tool-calling-accuracy":
|
|
46
|
+
return new ToolCallingAccuracyScorer();
|
|
47
|
+
case "routing-accuracy":
|
|
48
|
+
return new RoutingAccuracyScorer();
|
|
49
|
+
case "milestone":
|
|
50
|
+
return new MilestoneScorer();
|
|
30
51
|
case "composite": {
|
|
31
52
|
const subScorers = (config.scorers ?? []).map((s) => createScorer(s, llm));
|
|
32
53
|
return new CompositeScorer(config.name, subScorers);
|
|
@@ -37,6 +58,12 @@ export function createScorer(config, llm) {
|
|
|
37
58
|
case "hallucination":
|
|
38
59
|
case "toxicity":
|
|
39
60
|
case "custom-prompt":
|
|
61
|
+
case "policy-adherence":
|
|
62
|
+
case "context-relevance":
|
|
63
|
+
case "tool-trajectory":
|
|
64
|
+
case "multi-turn-consistency":
|
|
65
|
+
case "safety-preservation":
|
|
66
|
+
case "handoff-quality":
|
|
40
67
|
return createLLMScorer(config, llm);
|
|
41
68
|
default:
|
|
42
69
|
throw new KilnError("EVAL_SCORER_FAILED", `Unknown scorer type: ${config.type}`, {
|
|
@@ -58,6 +85,12 @@ function createLLMScorer(config, llm) {
|
|
|
58
85
|
case "hallucination": return new HallucinationScorer(llm);
|
|
59
86
|
case "toxicity": return new ToxicityScorer(llm);
|
|
60
87
|
case "custom-prompt": return new CustomPromptScorer(config.name, config.prompt ?? "", llm);
|
|
88
|
+
case "policy-adherence": return new PolicyAdherenceScorer(llm, config.policies ?? []);
|
|
89
|
+
case "context-relevance": return new ContextRelevanceScorer(llm);
|
|
90
|
+
case "tool-trajectory": return new ToolTrajectoryScorer(llm);
|
|
91
|
+
case "multi-turn-consistency": return new MultiTurnConsistencyScorer(llm);
|
|
92
|
+
case "safety-preservation": return new SafetyPreservationScorer(llm);
|
|
93
|
+
case "handoff-quality": return new HandoffQualityScorer(llm);
|
|
61
94
|
default:
|
|
62
95
|
throw new KilnError("EVAL_SCORER_FAILED", `Unknown LLM scorer type: ${config.type}`, {
|
|
63
96
|
context: { type: config.type, name: config.name },
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"scorer-factory.js","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAAA,2DAA2D;AAI3D,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAChD,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;
|
|
1
|
+
{"version":3,"file":"scorer-factory.js","sourceRoot":"","sources":["../../src/eval/scorer-factory.ts"],"names":[],"mappings":"AAAA,2DAA2D;AAI3D,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAChD,OAAO,EAAE,gBAAgB,EAAE,MAAM,iCAAiC,CAAC;AACnE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,0BAA0B,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AACtE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAChE,OAAO,EAAE,mBAAmB,EAAE,MAAM,mCAAmC,CAAC;AACxE,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,kBAAkB,EAAE,MAAM,mCAAmC,CAAC;AACvE,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,sBAAsB,EAAE,MAAM,uCAAuC,CAAC;AAC/E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,gCAAgC,CAAC;AAClE,OAAO,EAAE,yBAAyB,EAAE,MAAM,2CAA2C,CAAC;AACtF,OAAO,EAAE,0BAA0B,EAAE,MAAM,4CAA4C,CAAC;AACxF,OAAO,EAAE,wBAAwB,EAAE,MAAM,yCAAyC,CAAC;AACnF,OAAO,EAAE,qBAAqB,EAAE,MAAM,sCAAsC,CAAC;AAC7E,OAAO,EAAE,oBAAoB,EAAE,MAAM,qCAAqC,CAAC;AAC3E,OAAO,EAAE,eAAe,EAAE,MAAM,+BAA+B,CAAC;AAEhE,MAAM,UAAU,YAAY,CAAC,MAAwB,EAAE,GAAe;IACpE,QAAQ,MAAM,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,aAAa;YAChB,OAAO,IAAI,gBAAgB,EAAE,CAAC;QAChC,KAAK,UAAU;YACb,OAAO,IAAI,cAAc,CAAC,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC;QACrD,KAAK,eAAe;YAClB,OAAO,IAAI,kBAAkB,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;QAC/C,KAAK,QAAQ;YACX,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,SAAS,CAAC,CAAC;QAC9D,KAAK,SAAS;YACZ,OAAO,IAAI,aAAa,CAAC,MAAM,CAAC,YAAY,IAAI,IAAI,CAAC,CAAC;QACxD,KAAK,MAAM;YACT,OAAO,IAAI,UAAU,CAAC,MAAM,CAAC,UAAU,IAAI,GAAG,CAAC,CAAC;QAClD,KAAK,QAAQ;YACX,OAAO,IAAI,YAAY,EAAE,CAAC;QAC5B,KAAK,YAAY;YACf,OAAO,IAAI,gBAAgB,EAAE,CAAC;QAChC,KAAK,uBAAuB;YAC1B,OAAO,IAAI,yBAAyB,EAAE,CAAC;QACzC,KAAK,kBAAkB;YACrB,OAAO,IAAI,qBAAqB,EAAE,CAAC;QACrC,KAAK,WAAW;YACd,OAAO,IAAI,eAAe,EAAE,CAAC;QAC/B,K
AAK,WAAW,CAAC,CAAC,CAAC;YACjB,MAAM,UAAU,GAAG,CAAC,MAAM,CAAC,OAAO,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;YAC3E,OAAO,IAAI,eAAe,CAAC,MAAM,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;QACtD,CAAC;QACD,KAAK,cAAc,CAAC;QACpB,KAAK,WAAW,CAAC;QACjB,KAAK,WAAW,CAAC;QACjB,KAAK,eAAe,CAAC;QACrB,KAAK,UAAU,CAAC;QAChB,KAAK,eAAe,CAAC;QACrB,KAAK,kBAAkB,CAAC;QACxB,KAAK,mBAAmB,CAAC;QACzB,KAAK,iBAAiB,CAAC;QACvB,KAAK,wBAAwB,CAAC;QAC9B,KAAK,qBAAqB,CAAC;QAC3B,KAAK,iBAAiB;YACpB,OAAO,eAAe,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QACtC;YACE,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,wBAAwB,MAAM,CAAC,IAAI,EAAE,EAAE;gBAC/E,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;aAClD,CAAC,CAAC;IACP,CAAC;AACH,CAAC;AAED,SAAS,eAAe,CAAC,MAAwB,EAAE,GAAe;IAChE,IAAI,CAAC,GAAG,EAAE,CAAC;QACT,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,eAAe,MAAM,CAAC,IAAI,iCAAiC,EAAE;YACrG,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE;YACjD,UAAU,EAAE,2DAA2D;SACxE,CAAC,CAAC;IACL,CAAC;IACD,QAAQ,MAAM,CAAC,IAAI,EAAE,CAAC;QACpB,KAAK,cAAc,CAAC,CAAC,OAAO,IAAI,kBAAkB,CAAC,GAAG,CAAC,CAAC;QACxD,KAAK,WAAW,CAAC,CAAC,OAAO,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;QAClD,KAAK,WAAW,CAAC,CAAC,OAAO,IAAI,eAAe,CAAC,GAAG,CAAC,CAAC;QAClD,KAAK,eAAe,CAAC,CAAC,OAAO,IAAI,mBAAmB,CAAC,GAAG,CAAC,CAAC;QAC1D,KAAK,UAAU,CAAC,CAAC,OAAO,IAAI,cAAc,CAAC,GAAG,CAAC,CAAC;QAChD,KAAK,eAAe,CAAC,CAAC,OAAO,IAAI,kBAAkB,CAAC,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,IAAI,EAAE,EAAE,GAAG,CAAC,CAAC;QAC3F,KAAK,kBAAkB,CAAC,CAAC,OAAO,IAAI,qBAAqB,CAAC,GAAG,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC;QACtF,KAAK,mBAAmB,CAAC,CAAC,OAAO,IAAI,sBAAsB,CAAC,GAAG,CAAC,CAAC;QACjE,KAAK,iBAAiB,CAAC,CAAC,OAAO,IAAI,oBAAoB,CAAC,GAAG,CAAC,CAAC;QAC7D,KAAK,wBAAwB,CAAC,CAAC,OAAO,IAAI,0BAA0B,CAAC,GAAG,CAAC,CAAC;QAC1E,KAAK,qBAAqB,CAAC,CAAC,OAAO,IAAI,wBAAwB,CAAC,GAAG,CAAC,CAAC;QACrE,KAAK,iBAAiB,CAAC,CAAC,OAAO,IAAI,oBAAoB,CAAC,GAAG,CAAC,CAAC;QAC7D;YACE,MAAM,IAAI,SAAS,CAAC,oBAAoB,EAAE,4BAA4B,MAAM,CAAC,IAAI,EAAE,EAAE;gBACnF,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,CAAC,IAAI,EAAE,I
AAI,EAAE,MAAM,CAAC,IAAI,EAAE;aAClD,CAAC,CAAC;IACP,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
|
|
2
|
+
export declare class ContextRelevanceScorer implements Scorer {
|
|
3
|
+
private readonly llm;
|
|
4
|
+
readonly name = "context-relevance";
|
|
5
|
+
constructor(llm: ScorerLLM);
|
|
6
|
+
score(input: EvalInput): Promise<EvalScore>;
|
|
7
|
+
}
|
|
8
|
+
//# sourceMappingURL=context-relevance-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context-relevance-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/context-relevance-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAG3E,qBAAa,sBAAuB,YAAW,MAAM;IAGvC,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,uBAAuB;gBAEP,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAwBlD"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
// ContextRelevanceScorer: evaluates whether retrieved context chunks are relevant to the input query
|
|
2
|
+
import { parseLLMResponse } from "./parse-llm-response.js";
|
|
3
|
+
export class ContextRelevanceScorer {
|
|
4
|
+
llm;
|
|
5
|
+
name = "context-relevance";
|
|
6
|
+
constructor(llm) {
|
|
7
|
+
this.llm = llm;
|
|
8
|
+
}
|
|
9
|
+
async score(input) {
|
|
10
|
+
const chunks = input.context ?? [];
|
|
11
|
+
if (chunks.length === 0) {
|
|
12
|
+
return { name: this.name, score: 0, reasoning: "No context provided" };
|
|
13
|
+
}
|
|
14
|
+
const contextList = chunks.map((c, i) => `[${i + 1}] ${c}`).join("\n\n");
|
|
15
|
+
const prompt = `Evaluate context relevance. Are the retrieved context chunks relevant to the user's query? This measures retrieval quality, not answer quality.
|
|
16
|
+
|
|
17
|
+
Query: ${input.input}
|
|
18
|
+
|
|
19
|
+
Retrieved context:
|
|
20
|
+
${contextList}
|
|
21
|
+
|
|
22
|
+
Score 1.0 if all chunks are highly relevant to the query. Score 0.0 if none are relevant. Use intermediate scores based on the proportion and degree of relevance.
|
|
23
|
+
|
|
24
|
+
Respond EXACTLY in this format:
|
|
25
|
+
SCORE: <number from 0.0 to 1.0>
|
|
26
|
+
REASONING: <one sentence explanation>`;
|
|
27
|
+
const response = await this.llm.evaluate(prompt);
|
|
28
|
+
const { score, reasoning } = parseLLMResponse(response, this.name);
|
|
29
|
+
return { name: this.name, score, reasoning };
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
//# sourceMappingURL=context-relevance-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"context-relevance-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/context-relevance-scorer.ts"],"names":[],"mappings":"AAAA,qGAAqG;AAGrG,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAE3D,MAAM,OAAO,sBAAsB;IAGJ;IAFpB,IAAI,GAAG,mBAAmB,CAAC;IAEpC,YAA6B,GAAc;QAAd,QAAG,GAAH,GAAG,CAAW;IAAG,CAAC;IAE/C,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,MAAM,GAAG,KAAK,CAAC,OAAO,IAAI,EAAE,CAAC;QACnC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACxB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,qBAAqB,EAAE,CAAC;QACzE,CAAC;QAED,MAAM,WAAW,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACzE,MAAM,MAAM,GAAG;;SAEV,KAAK,CAAC,KAAK;;;EAGlB,WAAW;;;;;;sCAMyB,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"effort-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/effort-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAIhE,qBAAa,YAAa,YAAW,MAAM;IACzC,QAAQ,CAAC,IAAI,YAAY;IAEnB,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CASlD"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
// EffortScorer: rule-based scorer bridging enrichment effort score into eval framework
|
|
2
|
+
import { computeEffortScore } from "../../enrichment/effort-score.js";
|
|
3
|
+
/**
 * Rule-based scorer that surfaces the enrichment effort score through the
 * eval scorer interface (`name` + async `score(input)`).
 */
export class EffortScorer {
    name = "effort";

    /**
     * Scores an eval input from `input.metadata.effortComponents`.
     *
     * @param input Eval input; only `metadata.effortComponents` is read.
     * @returns `{ name, score, reasoning }`. The score is 0 when no effort
     *   components are present; otherwise it is the value returned by
     *   `computeEffortScore` divided by 10.
     */
    async score(input) {
        const effortComponents = input.metadata?.effortComponents;
        if (effortComponents) {
            const raw = computeEffortScore(effortComponents);
            // Upstream note: the raw value is on a 0-10 scale, so /10 maps it to 0-1.
            return { name: this.name, score: raw / 10, reasoning: `Effort score: ${raw}/10` };
        }
        return { name: this.name, score: 0, reasoning: "No effort components in metadata" };
    }
}
|
|
15
|
+
//# sourceMappingURL=effort-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"effort-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/effort-scorer.ts"],"names":[],"mappings":"AAAA,uFAAuF;AAGvF,OAAO,EAAE,kBAAkB,EAAE,MAAM,kCAAkC,CAAC;AAGtE,MAAM,OAAO,YAAY;IACd,IAAI,GAAG,QAAQ,CAAC;IAEzB,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,UAAU,GAAG,KAAK,CAAC,QAAQ,EAAE,gBAAgD,CAAC;QACpF,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,kCAAkC,EAAE,CAAC;QACtF,CAAC;QACD,MAAM,QAAQ,GAAG,kBAAkB,CAAC,UAAU,CAAC,CAAC;QAChD,MAAM,UAAU,GAAG,QAAQ,GAAG,EAAE,CAAC,CAAC,cAAc;QAChD,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,UAAU,EAAE,SAAS,EAAE,iBAAiB,QAAQ,KAAK,EAAE,CAAC;IAC3F,CAAC;CACF"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
|
|
2
|
+
/**
 * Type declaration for the LLM-as-judge scorer that rates context
 * preservation across agent handoffs. Reports under the name "handoff-quality".
 */
export declare class HandoffQualityScorer implements Scorer {
|
|
3
|
+
private readonly llm;
|
|
4
|
+
readonly name = "handoff-quality";
|
|
5
|
+
/** @param llm Judge used to evaluate the handoff prompt. */
constructor(llm: ScorerLLM);
|
|
6
|
+
/** Resolves to the score for one eval input. */
score(input: EvalInput): Promise<EvalScore>;
|
|
7
|
+
}
|
|
8
|
+
//# sourceMappingURL=handoff-quality-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"handoff-quality-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/handoff-quality-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AA4B3E,qBAAa,oBAAqB,YAAW,MAAM;IAGrC,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,qBAAqB;gBAEL,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAuClD"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
// HandoffQualityScorer: LLM-as-judge for context preservation across agent handoffs
|
|
2
|
+
import { parseLLMResponse } from "./parse-llm-response.js";
|
|
3
|
+
/**
 * Collects the well-formed handoff events stored under
 * `metadata["handoffHistory"]`.
 *
 * An entry is kept only when it is a non-null object whose `fromAgent` and
 * `toAgent` properties are both strings.
 *
 * @param metadata Eval metadata bag; may be missing.
 * @returns The kept entries, or `undefined` when metadata is missing, the
 *   history is not an array, or no entry qualifies.
 */
function extractHandoffs(metadata) {
    const history = metadata ? metadata["handoffHistory"] : undefined;
    if (!Array.isArray(history)) {
        return undefined;
    }
    const valid = history.filter((item) => typeof item === "object" &&
        item !== null &&
        typeof item["fromAgent"] === "string" &&
        typeof item["toAgent"] === "string");
    return valid.length === 0 ? undefined : valid;
}
|
|
20
|
+
/**
 * LLM-as-judge scorer for context preservation across agent handoffs.
 * Reads handoff events from the eval input's metadata, renders them into a
 * prompt, and parses the judge's reply into a score.
 */
export class HandoffQualityScorer {
    llm;
    name = "handoff-quality";

    /** @param llm Judge object; its `evaluate(prompt)` is awaited for the reply. */
    constructor(llm) {
        this.llm = llm;
    }

    /**
     * @param input Eval input; reads `metadata`, `input` and `output`.
     * @returns `{ name, score, reasoning }`. Score is 0 without calling the
     *   judge when no usable handoff history is found.
     */
    async score(input) {
        const events = extractHandoffs(input.metadata);
        if (!events) {
            return { name: this.name, score: 0, reasoning: "No handoff history in metadata" };
        }

        // One section per handoff; reason/summary lines are added only when truthy.
        const sections = [];
        for (const [index, event] of events.entries()) {
            let section = `Handoff ${index + 1}: ${event.fromAgent} -> ${event.toAgent}`;
            if (event.reason) {
                section += `\n Reason: ${event.reason}`;
            }
            if (event.summary) {
                section += `\n Summary: ${event.summary}`;
            }
            sections.push(section);
        }
        const handoffLog = sections.join("\n\n");

        const prompt = [
            "Evaluate the quality of agent handoffs in this conversation. Was context preserved across each agent switch?",
            "",
            `User query: ${input.input}`,
            `Final output: ${input.output}`,
            "",
            "Handoff history:",
            handoffLog,
            "",
            "Evaluate:",
            "1. Was the handoff reason appropriate (correct agent for the task)?",
            "2. Was the context summary accurate and complete?",
            "3. Did the receiving agent pick up seamlessly without re-asking for information?",
            "4. Was any critical context lost during the handoff?",
            "",
            "Score 1.0 for seamless handoffs with full context preservation. Score 0.0 for complete context loss.",
            "",
            "Respond EXACTLY in this format:",
            "SCORE: <number from 0.0 to 1.0>",
            "REASONING: <one sentence explanation>",
        ].join("\n");

        const reply = await this.llm.evaluate(prompt);
        const parsed = parseLLMResponse(reply, this.name);
        return { name: this.name, score: parsed.score, reasoning: parsed.reasoning };
    }
}
|
|
65
|
+
//# sourceMappingURL=handoff-quality-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"handoff-quality-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/handoff-quality-scorer.ts"],"names":[],"mappings":"AAAA,oFAAoF;AAGpF,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAS3D,SAAS,eAAe,CAAC,QAA6C;IACpE,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,gBAAgB,CAAC,CAAC;IACvC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,MAAM,MAAM,GAAmB,EAAE,CAAC;IAClC,KAAK,MAAM,KAAK,IAAI,GAAG,EAAE,CAAC;QACxB,IACE,OAAO,KAAK,KAAK,QAAQ;YACzB,KAAK,KAAK,IAAI;YACd,OAAQ,KAAiC,CAAC,WAAW,CAAC,KAAK,QAAQ;YACnE,OAAQ,KAAiC,CAAC,SAAS,CAAC,KAAK,QAAQ,EACjE,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,KAAqB,CAAC,CAAC;QACrC,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,CAAC;AAChD,CAAC;AAED,MAAM,OAAO,oBAAoB;IAGF;IAFpB,IAAI,GAAG,iBAAiB,CAAC;IAElC,YAA6B,GAAc;QAAd,QAAG,GAAH,GAAG,CAAW;IAAG,CAAC;IAE/C,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,QAAQ,GAAG,eAAe,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACjD,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,gCAAgC,EAAE,CAAC;QACpF,CAAC;QAED,MAAM,UAAU,GAAG,QAAQ;aACxB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YACZ,MAAM,KAAK,GAAG,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,SAAS,OAAO,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;YACnE,IAAI,CAAC,CAAC,MAAM;gBAAE,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;YAClD,IAAI,CAAC,CAAC,OAAO;gBAAE,KAAK,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC;YACrD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC,CAAC;aACD,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,MAAM,GAAG;;cAEL,KAAK,CAAC,KAAK;gBACT,KAAK,CAAC,MAAM;;;EAG1B,UAAU;;;;;;;;;;;;sCAY0B,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"milestone-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/milestone-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAyBhE,qBAAa,eAAgB,YAAW,MAAM;IAC5C,QAAQ,CAAC,IAAI,eAAe;IAEtB,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAelD"}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
// MilestoneScorer: rule-based scorer tracking intermediate checkpoint achievement
|
|
2
|
+
/**
 * Collects the well-formed milestone records stored under
 * `metadata["milestones"]`.
 *
 * A record is kept only when it is a non-null object with a string `name`
 * and a boolean `completed`.
 *
 * @param metadata Eval metadata bag; may be missing.
 * @returns The kept records, or `undefined` when metadata is missing, the
 *   value is not an array, or no record qualifies.
 */
function extractMilestones(metadata) {
    if (!metadata) {
        return undefined;
    }
    const candidates = metadata["milestones"];
    if (!Array.isArray(candidates)) {
        return undefined;
    }
    const accepted = [];
    for (const item of candidates) {
        const isRecord = typeof item === "object" && item !== null;
        if (!isRecord) {
            continue;
        }
        if (typeof item["name"] !== "string") {
            continue;
        }
        if (typeof item["completed"] !== "boolean") {
            continue;
        }
        accepted.push(item);
    }
    if (accepted.length > 0) {
        return accepted;
    }
    return undefined;
}
|
|
19
|
+
/**
 * Rule-based scorer that reports the fraction of metadata milestones marked
 * completed.
 */
export class MilestoneScorer {
    name = "milestone";

    /**
     * @param input Eval input; only `metadata` is read.
     * @returns `{ name, score, reasoning }`. Score is completed/total rounded
     *   to two decimals, or 0 when no usable milestones are found. The
     *   reasoning lists the names of milestones that were not completed.
     */
    async score(input) {
        const checkpoints = extractMilestones(input.metadata);
        if (!checkpoints) {
            return { name: this.name, score: 0, reasoning: "No milestones in metadata" };
        }

        // Single pass: count completed checkpoints and remember the missed names in order.
        const total = checkpoints.length;
        const missedNames = [];
        let done = 0;
        for (const checkpoint of checkpoints) {
            if (checkpoint.completed) {
                done += 1;
            }
            else {
                missedNames.push(checkpoint.name);
            }
        }

        let reasoning = `${done}/${total} milestones completed`;
        if (missedNames.length > 0) {
            reasoning += `; missed: ${missedNames.join(", ")}`;
        }
        const ratio = done / total;
        return { name: this.name, score: Math.round(ratio * 100) / 100, reasoning };
    }
}
|
|
35
|
+
//# sourceMappingURL=milestone-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"milestone-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/milestone-scorer.ts"],"names":[],"mappings":"AAAA,kFAAkF;AASlF,SAAS,iBAAiB,CAAC,QAA6C;IACtE,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,YAAY,CAAC,CAAC;IACnC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,MAAM,UAAU,GAAgB,EAAE,CAAC;IACnC,KAAK,MAAM,KAAK,IAAI,GAAG,EAAE,CAAC;QACxB,IACE,OAAO,KAAK,KAAK,QAAQ;YACzB,KAAK,KAAK,IAAI;YACd,OAAQ,KAAiC,CAAC,MAAM,CAAC,KAAK,QAAQ;YAC9D,OAAQ,KAAiC,CAAC,WAAW,CAAC,KAAK,SAAS,EACpE,CAAC;YACD,UAAU,CAAC,IAAI,CAAC,KAAkB,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IACD,OAAO,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CAAC;AACxD,CAAC;AAED,MAAM,OAAO,eAAe;IACjB,IAAI,GAAG,WAAW,CAAC;IAE5B,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,UAAU,GAAG,iBAAiB,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QACrD,IAAI,CAAC,UAAU,EAAE,CAAC;YAChB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,2BAA2B,EAAE,CAAC;QAC/E,CAAC;QAED,MAAM,SAAS,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACxD,MAAM,KAAK,GAAG,SAAS,CAAC,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC;QACnD,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAEzE,MAAM,KAAK,GAAG,CAAC,GAAG,SAAS,CAAC,MAAM,IAAI,UAAU,CAAC,MAAM,uBAAuB,CAAC,CAAC;QAChF,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;YAAE,KAAK,CAAC,IAAI,CAAC,WAAW,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAElE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,GAAG,CAAC,GAAG,GAAG,EAAE,SAAS,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;IAChG,CAAC;CACF"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
|
|
2
|
+
/**
 * Type declaration for the LLM-as-judge scorer that rates context retention
 * across conversation turns. Reports under the name "multi-turn-consistency".
 */
export declare class MultiTurnConsistencyScorer implements Scorer {
|
|
3
|
+
private readonly llm;
|
|
4
|
+
readonly name = "multi-turn-consistency";
|
|
5
|
+
/** @param llm Judge used to evaluate the conversation prompt. */
constructor(llm: ScorerLLM);
|
|
6
|
+
/** Resolves to the score for one eval input. */
score(input: EvalInput): Promise<EvalScore>;
|
|
7
|
+
}
|
|
8
|
+
//# sourceMappingURL=multi-turn-consistency-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"multi-turn-consistency-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/multi-turn-consistency-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AA0B3E,qBAAa,0BAA2B,YAAW,MAAM;IAG3C,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,QAAQ,CAAC,IAAI,4BAA4B;gBAEZ,GAAG,EAAE,SAAS;IAErC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CA+BlD"}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
// MultiTurnConsistencyScorer: LLM-as-judge for context retention across conversation turns
|
|
2
|
+
import { parseLLMResponse } from "./parse-llm-response.js";
|
|
3
|
+
/**
 * Collects the well-formed conversation turns stored under
 * `metadata["conversationHistory"]`.
 *
 * A turn is kept only when it is a non-null object whose `role` and
 * `content` properties are both strings.
 *
 * @param metadata Eval metadata bag; may be missing.
 * @returns The kept turns, or `undefined` when metadata is missing, the
 *   value is not an array, or fewer than two turns qualify.
 */
function extractConversationHistory(metadata) {
    const candidate = metadata ? metadata["conversationHistory"] : undefined;
    if (!Array.isArray(candidate)) {
        return undefined;
    }
    const isTurn = (item) => typeof item === "object" &&
        item !== null &&
        typeof item["role"] === "string" &&
        typeof item["content"] === "string";
    const turns = candidate.filter((item) => isTurn(item));
    // A single turn gives nothing to be consistent with, so at least two are required.
    return turns.length < 2 ? undefined : turns;
}
|
|
20
|
+
/**
 * LLM-as-judge scorer for context retention across conversation turns.
 * Renders the conversation history from metadata into a prompt and parses
 * the judge's reply into a score.
 */
export class MultiTurnConsistencyScorer {
    llm;
    name = "multi-turn-consistency";

    /** @param llm Judge object; its `evaluate(prompt)` is awaited for the reply. */
    constructor(llm) {
        this.llm = llm;
    }

    /**
     * @param input Eval input; reads `metadata` and `output`.
     * @returns `{ name, score, reasoning }`. Score is 0 without calling the
     *   judge when no usable conversation history is found.
     */
    async score(input) {
        const turns = extractConversationHistory(input.metadata);
        if (!turns) {
            return { name: this.name, score: 0, reasoning: "No conversation history in metadata (need >= 2 turns)" };
        }

        const transcriptLines = turns.map((turn) => `[${turn.role}]: ${turn.content}`);
        const transcript = transcriptLines.join("\n");

        const prompt = [
            "Evaluate context retention across this multi-turn conversation. Did the assistant maintain awareness of previously stated facts, requests, and context throughout the conversation?",
            "",
            "Conversation:",
            transcript,
            "",
            `Final output: ${input.output}`,
            "",
            "Evaluate:",
            "1. Does the assistant contradict earlier statements?",
            "2. Does it forget previously provided information?",
            "3. Does it ask questions already answered?",
            "4. Does it maintain a coherent understanding of the user's evolving needs?",
            "",
            "Score 1.0 for perfect context retention. Score 0.0 for complete context loss.",
            "",
            "Respond EXACTLY in this format:",
            "SCORE: <number from 0.0 to 1.0>",
            "REASONING: <one sentence explanation>",
        ].join("\n");

        const reply = await this.llm.evaluate(prompt);
        const parsed = parseLLMResponse(reply, this.name);
        return { name: this.name, score: parsed.score, reasoning: parsed.reasoning };
    }
}
|
|
55
|
+
//# sourceMappingURL=multi-turn-consistency-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"multi-turn-consistency-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/multi-turn-consistency-scorer.ts"],"names":[],"mappings":"AAAA,2FAA2F;AAG3F,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAO3D,SAAS,0BAA0B,CAAC,QAA6C;IAC/E,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,GAAG,GAAG,QAAQ,CAAC,qBAAqB,CAAC,CAAC;IAC5C,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,SAAS,CAAC;IAC1C,MAAM,KAAK,GAAuB,EAAE,CAAC;IACrC,KAAK,MAAM,KAAK,IAAI,GAAG,EAAE,CAAC;QACxB,IACE,OAAO,KAAK,KAAK,QAAQ;YACzB,KAAK,KAAK,IAAI;YACd,OAAQ,KAAiC,CAAC,MAAM,CAAC,KAAK,QAAQ;YAC9D,OAAQ,KAAiC,CAAC,SAAS,CAAC,KAAK,QAAQ,EACjE,CAAC;YACD,KAAK,CAAC,IAAI,CAAC,KAAyB,CAAC,CAAC;QACxC,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC;AAC/C,CAAC;AAED,MAAM,OAAO,0BAA0B;IAGR;IAFpB,IAAI,GAAG,wBAAwB,CAAC;IAEzC,YAA6B,GAAc;QAAd,QAAG,GAAH,GAAG,CAAW;IAAG,CAAC;IAE/C,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,OAAO,GAAG,0BAA0B,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QAC3D,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,uDAAuD,EAAE,CAAC;QAC3G,CAAC;QAED,MAAM,UAAU,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE9E,MAAM,MAAM,GAAG;;;EAGjB,UAAU;;gBAEI,KAAK,CAAC,MAAM;;;;;;;;;;;;sCAYU,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { EvalInput, EvalScore, Scorer, ScorerLLM } from "../types.js";
|
|
2
|
+
/**
 * Type declaration for the scorer that evaluates whether an output follows
 * a list of business policy rules. Reports under the name "policy-adherence".
 */
export declare class PolicyAdherenceScorer implements Scorer {
|
|
3
|
+
private readonly llm;
|
|
4
|
+
private readonly policies;
|
|
5
|
+
readonly name = "policy-adherence";
|
|
6
|
+
/**
 * @param llm Judge used to evaluate the policy prompt.
 * @param policies Policy statements the output is checked against.
 */
constructor(llm: ScorerLLM, policies: readonly string[]);
|
|
7
|
+
/** Resolves to the score for one eval input. */
score(input: EvalInput): Promise<EvalScore>;
|
|
8
|
+
}
|
|
9
|
+
//# sourceMappingURL=policy-adherence-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"policy-adherence-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/policy-adherence-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAG3E,qBAAa,qBAAsB,YAAW,MAAM;IAIhD,OAAO,CAAC,QAAQ,CAAC,GAAG;IACpB,OAAO,CAAC,QAAQ,CAAC,QAAQ;IAJ3B,QAAQ,CAAC,IAAI,sBAAsB;gBAGhB,GAAG,EAAE,SAAS,EACd,QAAQ,EAAE,SAAS,MAAM,EAAE;IAGxC,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAwBlD"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
// PolicyAdherenceScorer: evaluates whether output follows business policy rules
|
|
2
|
+
import { parseLLMResponse } from "./parse-llm-response.js";
|
|
3
|
+
/**
 * Scorer that asks an LLM judge whether an output complies with a configured
 * list of business policies.
 */
export class PolicyAdherenceScorer {
    llm;
    policies;
    name = "policy-adherence";

    /**
     * @param llm Judge object; its `evaluate(prompt)` is awaited for the reply.
     * @param policies Policy statements, rendered as a numbered list in the prompt.
     */
    constructor(llm, policies) {
        this.llm = llm;
        this.policies = policies;
    }

    /**
     * @param input Eval input; reads `input` and `output`.
     * @returns `{ name, score, reasoning }`. Score is 0 without calling the
     *   judge when the policy list is empty.
     */
    async score(input) {
        const { policies } = this;
        if (policies.length === 0) {
            return { name: this.name, score: 0, reasoning: "No policies configured" };
        }

        const numbered = policies.map((policy, index) => `${index + 1}. ${policy}`);
        const policyList = numbered.join("\n");

        const prompt = [
            "Evaluate policy adherence. Does the output comply with ALL of the following business policies?",
            "",
            "Policies:",
            policyList,
            "",
            `Input: ${input.input}`,
            `Output: ${input.output}`,
            "",
            "Score 1.0 if the output fully adheres to all policies. Score 0.0 if it violates any policy. Use intermediate scores for partial adherence.",
            "",
            "Respond EXACTLY in this format:",
            "SCORE: <number from 0.0 to 1.0>",
            "REASONING: <one sentence explanation>",
        ].join("\n");

        const reply = await this.llm.evaluate(prompt);
        const parsed = parseLLMResponse(reply, this.name);
        return { name: this.name, score: parsed.score, reasoning: parsed.reasoning };
    }
}
|
|
34
|
+
//# sourceMappingURL=policy-adherence-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"policy-adherence-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/policy-adherence-scorer.ts"],"names":[],"mappings":"AAAA,gFAAgF;AAGhF,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAE3D,MAAM,OAAO,qBAAqB;IAIb;IACA;IAJV,IAAI,GAAG,kBAAkB,CAAC;IAEnC,YACmB,GAAc,EACd,QAA2B;QAD3B,QAAG,GAAH,GAAG,CAAW;QACd,aAAQ,GAAR,QAAQ,CAAmB;IAC3C,CAAC;IAEJ,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,IAAI,IAAI,CAAC,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,wBAAwB,EAAE,CAAC;QAC5E,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC5E,MAAM,MAAM,GAAG;;;EAGjB,UAAU;;SAEH,KAAK,CAAC,KAAK;UACV,KAAK,CAAC,MAAM;;;;;;sCAMgB,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QACjD,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,GAAG,gBAAgB,CAAC,QAAQ,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACnE,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IAC/C,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"resolution-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/resolution-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAShE,qBAAa,gBAAiB,YAAW,MAAM;IAC7C,QAAQ,CAAC,IAAI,gBAAgB;IAEvB,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAclD"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
// ResolutionScorer: rule-based scorer mapping resolution status to eval score
|
|
2
|
+
// Base score for each recognised resolution status; any other status scores 0.
const RESOLUTION_SCORES = Object.freeze({
    resolved: 1.0,
    partial: 0.5,
    ambiguous: 0.25,
    unresolved: 0.0,
});

/**
 * Rule-based scorer that maps `metadata.resolution.status` to a base score
 * and weights it by `metadata.resolution.confidence`.
 */
export class ResolutionScorer {
    name = "resolution";

    /**
     * @param input Eval input; only `metadata.resolution` is read.
     * @returns `{ name, score, reasoning }`. Score is 0 when there is no
     *   resolution status or the status is not recognised; otherwise it is
     *   the status' base score multiplied by the confidence (default 1.0,
     *   limited to the 0-1 range).
     */
    async score(input) {
        const resolution = input.metadata?.resolution;
        if (!resolution?.status) {
            return { name: this.name, score: 0, reasoning: "No resolution data in metadata" };
        }
        const { status } = resolution;
        // Own-property check: a plain `table[status]` lookup also resolves inherited
        // keys such as "toString" or "constructor" to functions, which `??` does not
        // replace and which turn the weighted score into NaN.
        const base = Object.hasOwn(RESOLUTION_SCORES, status) ? RESOLUTION_SCORES[status] : 0;
        // Keep the weight inside 0-1 so the weighted score cannot leave the 0-1 range.
        const confidence = Math.min(1, Math.max(0, resolution.confidence ?? 1.0));
        return {
            name: this.name,
            score: base * confidence,
            reasoning: `Resolution: ${status} (confidence: ${confidence})`,
        };
    }
}
|
|
25
|
+
//# sourceMappingURL=resolution-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"resolution-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/resolution-scorer.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAI9E,MAAM,iBAAiB,GAA2B;IAChD,QAAQ,EAAE,GAAG;IACb,OAAO,EAAE,GAAG;IACZ,SAAS,EAAE,IAAI;IACf,UAAU,EAAE,GAAG;CAChB,CAAC;AAEF,MAAM,OAAO,gBAAgB;IAClB,IAAI,GAAG,YAAY,CAAC;IAE7B,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,UAAU,GAAG,KAAK,CAAC,QAAQ,EAAE,UAAkE,CAAC;QACtG,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,CAAC;YACxB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,gCAAgC,EAAE,CAAC;QACpF,CAAC;QACD,MAAM,KAAK,GAAG,iBAAiB,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QACxD,MAAM,UAAU,GAAG,UAAU,CAAC,UAAU,IAAI,GAAG,CAAC;QAChD,MAAM,QAAQ,GAAG,KAAK,GAAG,UAAU,CAAC;QACpC,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,KAAK,EAAE,QAAQ;YACf,SAAS,EAAE,eAAe,UAAU,CAAC,MAAM,iBAAiB,UAAU,GAAG;SAC1E,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { EvalInput, EvalScore, Scorer } from "../types.js";
|
|
2
|
+
/**
 * Type declaration for the rule-based scorer that checks whether the
 * expected agent handled the message. Reports under the name "routing-accuracy".
 */
export declare class RoutingAccuracyScorer implements Scorer {
|
|
3
|
+
readonly name = "routing-accuracy";
|
|
4
|
+
/** Resolves to the score for one eval input. */
score(input: EvalInput): Promise<EvalScore>;
|
|
5
|
+
}
|
|
6
|
+
//# sourceMappingURL=routing-accuracy-scorer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"routing-accuracy-scorer.d.ts","sourceRoot":"","sources":["../../../src/eval/scorers/routing-accuracy-scorer.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAEhE,qBAAa,qBAAsB,YAAW,MAAM;IAClD,QAAQ,CAAC,IAAI,sBAAsB;IAE7B,KAAK,CAAC,KAAK,EAAE,SAAS,GAAG,OAAO,CAAC,SAAS,CAAC;CAiBlD"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
// RoutingAccuracyScorer: rule-based scorer checking if the correct agent handled the message
|
|
2
|
+
/**
 * Rule-based scorer that compares `metadata.activeAgentId` with
 * `metadata.expectedAgentId`.
 */
export class RoutingAccuracyScorer {
    name = "routing-accuracy";

    /**
     * @param input Eval input; only `metadata.expectedAgentId` and
     *   `metadata.activeAgentId` are read.
     * @returns `{ name, score, reasoning }`. Score is 1 when the two ids are
     *   strictly equal, and 0 when either id is missing or they differ.
     */
    async score(input) {
        const zero = (reasoning) => ({ name: this.name, score: 0, reasoning });

        const wantedAgent = input.metadata?.expectedAgentId;
        if (!wantedAgent) {
            return zero("No expectedAgentId in metadata");
        }
        const handledBy = input.metadata?.activeAgentId;
        if (!handledBy) {
            return zero("No activeAgentId in metadata");
        }
        return handledBy === wantedAgent
            ? { name: this.name, score: 1, reasoning: `Correct: routed to "${handledBy}"` }
            : zero(`Expected "${wantedAgent}", got "${handledBy}"`);
    }
}
|
|
19
|
+
//# sourceMappingURL=routing-accuracy-scorer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"routing-accuracy-scorer.js","sourceRoot":"","sources":["../../../src/eval/scorers/routing-accuracy-scorer.ts"],"names":[],"mappings":"AAAA,6FAA6F;AAI7F,MAAM,OAAO,qBAAqB;IACvB,IAAI,GAAG,kBAAkB,CAAC;IAEnC,KAAK,CAAC,KAAK,CAAC,KAAgB;QAC1B,MAAM,QAAQ,GAAG,KAAK,CAAC,QAAQ,EAAE,eAAqC,CAAC;QACvE,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,gCAAgC,EAAE,CAAC;QACpF,CAAC;QAED,MAAM,MAAM,GAAG,KAAK,CAAC,QAAQ,EAAE,aAAmC,CAAC;QACnE,IAAI,CAAC,MAAM,EAAE,CAAC;YACZ,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,8BAA8B,EAAE,CAAC;QAClF,CAAC;QAED,IAAI,MAAM,KAAK,QAAQ,EAAE,CAAC;YACxB,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,uBAAuB,MAAM,GAAG,EAAE,CAAC;QACpF,CAAC;QAED,OAAO,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,EAAE,aAAa,QAAQ,WAAW,MAAM,GAAG,EAAE,CAAC;IAC7F,CAAC;CACF"}
|