@swarmtools/evals 0.2.1 → 0.2.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=decision-quality.eval.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"decision-quality.eval.d.ts","sourceRoot":"","sources":["../src/decision-quality.eval.ts"],"names":[],"mappings":""}
@@ -0,0 +1,54 @@
1
+ /**
2
+ * Test cases for decision quality evaluation
3
+ *
4
+ * Each case includes:
5
+ * - input: Decision trace with strategy selection and optional precedent
6
+ * - output: Outcome data with success/failure metrics
7
+ * - expected: Validation criteria for scorer thresholds
8
+ */
9
+ /**
10
+ * Strategy selection fixture with outcome data.
11
+ * Used to test strategySelectionQuality scorer.
12
+ */
13
+ export interface StrategySelectionFixture {
14
+ input: {
15
+ task: string;
16
+ strategy: string;
17
+ precedent_task?: string;
18
+ precedent_strategy?: string;
19
+ };
20
+ output: {
21
+ strategy: string;
22
+ outcome_success: boolean;
23
+ error_count: number;
24
+ duration_ms?: number;
25
+ };
26
+ expected: {
27
+ min_score?: number;
28
+ max_score?: number;
29
+ };
30
+ }
31
+ /**
32
+ * Precedent relevance fixture.
33
+ * Used to test precedentRelevance scorer (LLM-as-judge).
34
+ */
35
+ export interface PrecedentRelevanceFixture {
36
+ input: {
37
+ task: string;
38
+ precedent_task: string;
39
+ precedent_strategy: string;
40
+ };
41
+ expected: {
42
+ min_score?: number;
43
+ max_score?: number;
44
+ };
45
+ }
46
+ /**
47
+ * Strategy selection fixtures - known good and bad outcomes.
48
+ */
49
+ export declare const strategySelectionFixtures: StrategySelectionFixture[];
50
+ /**
51
+ * Precedent relevance fixtures - testing LLM-as-judge similarity scoring.
52
+ */
53
+ export declare const precedentRelevanceFixtures: PrecedentRelevanceFixture[];
54
+ //# sourceMappingURL=decision-quality-fixtures.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"decision-quality-fixtures.d.ts","sourceRoot":"","sources":["../../src/fixtures/decision-quality-fixtures.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH;;;GAGG;AACH,MAAM,WAAW,wBAAwB;IACvC,KAAK,EAAE;QACL,IAAI,EAAE,MAAM,CAAC;QACb,QAAQ,EAAE,MAAM,CAAC;QACjB,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,kBAAkB,CAAC,EAAE,MAAM,CAAC;KAC7B,CAAC;IACF,MAAM,EAAE;QACN,QAAQ,EAAE,MAAM,CAAC;QACjB,eAAe,EAAE,OAAO,CAAC;QACzB,WAAW,EAAE,MAAM,CAAC;QACpB,WAAW,CAAC,EAAE,MAAM,CAAC;KACtB,CAAC;IACF,QAAQ,EAAE;QACR,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAED;;;GAGG;AACH,MAAM,WAAW,yBAAyB;IACxC,KAAK,EAAE;QACL,IAAI,EAAE,MAAM,CAAC;QACb,cAAc,EAAE,MAAM,CAAC;QACvB,kBAAkB,EAAE,MAAM,CAAC;KAC5B,CAAC;IACF,QAAQ,EAAE;QACR,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAED;;GAEG;AACH,eAAO,MAAM,yBAAyB,EAAE,wBAAwB,EAuG/D,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,0BAA0B,EAAE,yBAAyB,EA8FjE,CAAC"}
@@ -0,0 +1,31 @@
1
+ /**
2
+ * Strategy Selection Quality Scorer
3
+ *
4
+ * Evaluates whether the chosen decomposition strategy led to successful outcomes.
5
+ * This is the ultimate test - did the decision actually work?
6
+ *
7
+ * Scoring:
8
+ * - 1.0: Success with no errors
9
+ * - 0.7-0.9: Success with minor errors (error_count 1-2)
10
+ * - 0.4-0.6: Success with significant errors (error_count 3+)
11
+ * - 0.0: Failure
12
+ *
13
+ * Based on calculateDecisionQuality() from swarm-mail/decision-trace-store.ts
14
+ */
15
+ export declare const strategySelectionQuality: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
16
+ /**
17
+ * Precedent Relevance Scorer (LLM-as-Judge)
18
+ *
19
+ * Evaluates whether cited precedent tasks are actually semantically similar
20
+ * to the current task. Uses Claude Haiku to judge relevance.
21
+ *
22
+ * This catches cases where coordinators cite irrelevant precedents,
23
+ * which can lead to poor strategy selection.
24
+ *
25
+ * Scoring:
26
+ * - 0.8-1.0: Highly relevant (same domain, similar requirements)
27
+ * - 0.5-0.7: Moderately relevant (related concepts, different scope)
28
+ * - 0.0-0.4: Irrelevant (different domains, no meaningful overlap)
29
+ */
30
+ export declare const precedentRelevance: import("evalite").Evalite.Scorer<unknown, unknown, unknown>;
31
+ //# sourceMappingURL=decision-quality-scorers.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"decision-quality-scorers.d.ts","sourceRoot":"","sources":["../../src/scorers/decision-quality-scorers.ts"],"names":[],"mappings":"AAaA;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,wBAAwB,6DA+CnC,CAAC;AAEH;;;;;;;;;;;;;GAaG;AACH,eAAO,MAAM,kBAAkB,6DA0E7B,CAAC"}
@@ -13,6 +13,7 @@ export declare const subtaskIndependence: import("evalite").Evalite.Scorer<unkno
13
13
  export { executionSuccess, timeBalance, scopeAccuracy, scopeDrift, noRework, } from "./outcome-scorers.js";
14
14
  export { confidenceAccuracy, contextInjectionCorrectness, requiredPatternsPresent, forbiddenPatternsAbsent, compactionQuality, } from "./compaction-scorers.js";
15
15
  export { violationCount, spawnEfficiency, reviewThoroughness, timeToFirstSpawn, overallDiscipline, } from "./coordinator-discipline.js";
16
+ export { strategySelectionQuality, precedentRelevance, } from "./decision-quality-scorers.js";
16
17
  /**
17
18
  * Checks that subtasks cover the full task scope
18
19
  *
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scorers/index.ts"],"names":[],"mappings":"AAOA;;GAEG;AAEH;;;;;;;GAOG;AACH,eAAO,MAAM,mBAAmB,6DAuC9B,CAAC;AAMH,OAAO,EACL,gBAAgB,EAChB,WAAW,EACX,aAAa,EACb,UAAU,EACV,QAAQ,GACT,MAAM,sBAAsB,CAAC;AAM9B,OAAO,EACL,kBAAkB,EAClB,2BAA2B,EAC3B,uBAAuB,EACvB,uBAAuB,EACvB,iBAAiB,GAClB,MAAM,yBAAyB,CAAC;AAMjC,OAAO,EACL,cAAc,EACd,eAAe,EACf,kBAAkB,EAClB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,6BAA6B,CAAC;AAErC;;;;;;;;;;GAUG;AACH,eAAO,MAAM,oBAAoB,6DAsD/B,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,kBAAkB,6DAsD7B,CAAC;AAMH;;;;;;;;;;GAUG;AACH,eAAO,MAAM,sBAAsB,6DAmFjC,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scorers/index.ts"],"names":[],"mappings":"AAOA;;GAEG;AAEH;;;;;;;GAOG;AACH,eAAO,MAAM,mBAAmB,6DAuC9B,CAAC;AAMH,OAAO,EACL,gBAAgB,EAChB,WAAW,EACX,aAAa,EACb,UAAU,EACV,QAAQ,GACT,MAAM,sBAAsB,CAAC;AAM9B,OAAO,EACL,kBAAkB,EAClB,2BAA2B,EAC3B,uBAAuB,EACvB,uBAAuB,EACvB,iBAAiB,GAClB,MAAM,yBAAyB,CAAC;AAMjC,OAAO,EACL,cAAc,EACd,eAAe,EACf,kBAAkB,EAClB,gBAAgB,EAChB,iBAAiB,GAClB,MAAM,6BAA6B,CAAC;AAMrC,OAAO,EACL,wBAAwB,EACxB,kBAAkB,GACnB,MAAM,+BAA+B,CAAC;AAEvC;;;;;;;;;;GAUG;AACH,eAAO,MAAM,oBAAoB,6DAsD/B,CAAC;AAEH;;;;;;;;;GASG;AACH,eAAO,MAAM,kBAAkB,6DAsD7B,CAAC;AAMH;;;;;;;;;;GAUG;AACH,eAAO,MAAM,sBAAsB,6DAmFjC,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@swarmtools/evals",
3
- "version": "0.2.1",
3
+ "version": "0.2.3",
4
4
  "description": "Evaluation suite for swarm-tools multi-agent coordination",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -33,8 +33,8 @@
33
33
  "dependencies": {
34
34
  "evalite": "^1.0.0-beta.10",
35
35
  "ai": "6.0.0-beta.150",
36
- "opencode-swarm-plugin": "0.44.2",
37
- "swarm-mail": "1.6.0"
36
+ "opencode-swarm-plugin": "0.45.1",
37
+ "swarm-mail": "1.6.1"
38
38
  },
39
39
  "devDependencies": {
40
40
  "@types/node": "^22.19.3",