opencode-swarm-plugin 0.37.0 → 0.39.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +20 -5
  4. package/.hive/memories.jsonl +35 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/.turbo/turbo-build.log +4 -4
  7. package/.turbo/turbo-test.log +319 -319
  8. package/CHANGELOG.md +258 -0
  9. package/README.md +50 -0
  10. package/bin/swarm.test.ts +475 -0
  11. package/bin/swarm.ts +385 -208
  12. package/dist/compaction-hook.d.ts +1 -1
  13. package/dist/compaction-hook.d.ts.map +1 -1
  14. package/dist/compaction-prompt-scoring.d.ts +124 -0
  15. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  16. package/dist/eval-capture.d.ts +81 -1
  17. package/dist/eval-capture.d.ts.map +1 -1
  18. package/dist/eval-gates.d.ts +84 -0
  19. package/dist/eval-gates.d.ts.map +1 -0
  20. package/dist/eval-history.d.ts +117 -0
  21. package/dist/eval-history.d.ts.map +1 -0
  22. package/dist/eval-learning.d.ts +216 -0
  23. package/dist/eval-learning.d.ts.map +1 -0
  24. package/dist/hive.d.ts +59 -0
  25. package/dist/hive.d.ts.map +1 -1
  26. package/dist/index.d.ts +87 -0
  27. package/dist/index.d.ts.map +1 -1
  28. package/dist/index.js +823 -131
  29. package/dist/plugin.js +655 -131
  30. package/dist/post-compaction-tracker.d.ts +133 -0
  31. package/dist/post-compaction-tracker.d.ts.map +1 -0
  32. package/dist/swarm-decompose.d.ts +30 -0
  33. package/dist/swarm-decompose.d.ts.map +1 -1
  34. package/dist/swarm-orchestrate.d.ts +23 -0
  35. package/dist/swarm-orchestrate.d.ts.map +1 -1
  36. package/dist/swarm-prompts.d.ts +25 -1
  37. package/dist/swarm-prompts.d.ts.map +1 -1
  38. package/dist/swarm.d.ts +19 -0
  39. package/dist/swarm.d.ts.map +1 -1
  40. package/evals/README.md +595 -94
  41. package/evals/compaction-prompt.eval.ts +149 -0
  42. package/evals/coordinator-behavior.eval.ts +8 -8
  43. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  44. package/evals/lib/compaction-loader.test.ts +248 -0
  45. package/evals/lib/compaction-loader.ts +320 -0
  46. package/evals/lib/data-loader.test.ts +345 -0
  47. package/evals/lib/data-loader.ts +107 -6
  48. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  49. package/evals/scorers/compaction-scorers.ts +13 -13
  50. package/evals/scorers/coordinator-discipline.evalite-test.ts +3 -2
  51. package/evals/scorers/coordinator-discipline.ts +13 -13
  52. package/examples/plugin-wrapper-template.ts +177 -8
  53. package/package.json +7 -2
  54. package/scripts/migrate-unknown-sessions.ts +349 -0
  55. package/src/compaction-capture.integration.test.ts +257 -0
  56. package/src/compaction-hook.test.ts +139 -2
  57. package/src/compaction-hook.ts +113 -2
  58. package/src/compaction-prompt-scorers.test.ts +299 -0
  59. package/src/compaction-prompt-scoring.ts +298 -0
  60. package/src/eval-capture.test.ts +422 -0
  61. package/src/eval-capture.ts +94 -2
  62. package/src/eval-gates.test.ts +306 -0
  63. package/src/eval-gates.ts +218 -0
  64. package/src/eval-history.test.ts +508 -0
  65. package/src/eval-history.ts +214 -0
  66. package/src/eval-learning.test.ts +378 -0
  67. package/src/eval-learning.ts +360 -0
  68. package/src/index.ts +61 -1
  69. package/src/post-compaction-tracker.test.ts +251 -0
  70. package/src/post-compaction-tracker.ts +237 -0
  71. package/src/swarm-decompose.test.ts +40 -47
  72. package/src/swarm-decompose.ts +2 -2
  73. package/src/swarm-orchestrate.test.ts +270 -7
  74. package/src/swarm-orchestrate.ts +100 -13
  75. package/src/swarm-prompts.test.ts +121 -0
  76. package/src/swarm-prompts.ts +297 -4
  77. package/src/swarm-research.integration.test.ts +157 -0
  78. package/src/swarm-review.ts +3 -3
  79. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
@@ -0,0 +1,124 @@
1
+ /**
2
+ * Compaction Prompt Quality Scoring - Pure Functions
3
+ *
4
+ * Evaluates the quality of continuation prompts generated after context compaction.
5
+ * **Problem**: Post-compaction coordinators often "wake up" confused, forget their role,
6
+ * and start editing files instead of checking worker status.
7
+ *
8
+ * **Solution**: Score prompts on 5 dimensions that predict coordinator success:
9
+ *
10
+ * 1. **Epic ID Specificity (0.20)**: Real IDs (`mjkw...`) not placeholders (`<epic-id>`, `bd-xxx`)
11
+ * - Placeholders = coordinator can't check actual swarm status
12
+ *
13
+ * 2. **Actionability (0.20)**: Tool calls with real values (e.g., `swarm_status(epic_id='mjkw81rkq4c')`)
14
+ * - Generic instructions like "check status" don't work
15
+ *
16
+ * 3. **Coordinator Identity (0.25)**: ASCII header + strong mandates (NEVER/ALWAYS)
17
+ * - Visual + semantic cues reinforce role post-compaction
18
+ *
19
+ * 4. **Forbidden Tools Listed (0.15)**: Explicitly lists Edit, Write, swarmmail_reserve, git commit
20
+ * - Naming forbidden tools reduces violations
21
+ *
22
+ * 5. **Post-Compaction Discipline (0.20)**: First suggested tool is swarm_status or inbox (not Edit)
23
+ * - First tool sets the pattern - "check status" vs "dive into code"
24
+ *
25
+ * **Pure functions**: These can be tested without evalite. The evalite wrappers are in
26
+ * `evals/scorers/compaction-prompt-scorers.ts`.
27
+ *
28
+ * **Data source**: Captured from `captureCompactionEvent()` with `compaction_type: "prompt_generated"`.
29
+ * The payload includes the FULL prompt content (not truncated) for scoring.
30
+ *
31
+ * **Integration**: `compaction-prompt.eval.ts` uses these scorers to track prompt quality over time.
32
+ * Progressive gates enforce quality: bootstrap → stabilization → production.
33
+ *
34
+ * @module compaction-prompt-scoring
35
+ */
36
+ /**
37
+ * Compaction prompt structure (from LLM generation)
38
+ */
39
+ export interface CompactionPrompt {
40
+ content: string;
41
+ }
42
+ /**
43
+ * Scorer result type
44
+ */
45
+ export interface ScorerResult {
46
+ score: number;
47
+ message: string;
48
+ }
49
+ /** Matches real epic/cell IDs (mjkw prefix + 7+ base36 chars) */
50
+ export declare const REAL_EPIC_ID: RegExp;
51
+ /** Matches common placeholder patterns */
52
+ export declare const PLACEHOLDERS: RegExp[];
53
+ /** Matches ASCII box-drawing characters (for headers) */
54
+ export declare const ASCII_BOX: RegExp;
55
+ /** Matches strong mandate language */
56
+ export declare const STRONG_LANGUAGE: RegExp[];
57
+ /**
58
+ * Score epic ID specificity
59
+ *
60
+ * Validates that epic IDs are REAL, not placeholders.
61
+ * Placeholders like <epic-id>, bd-xxx, <path> indicate
62
+ * the prompt generator failed to inject actual values.
63
+ *
64
+ * @returns 1.0 if real IDs, 0.0 if placeholders found
65
+ */
66
+ export declare function scoreEpicIdSpecificity(prompt: CompactionPrompt): ScorerResult;
67
+ /**
68
+ * Score actionability of tool calls
69
+ *
70
+ * Validates that the prompt includes SPECIFIC actionable tool calls.
71
+ * Generic instructions like "check status" are useless.
72
+ * Good: swarm_status(epic_id='mjkw81rkq4c', project_key='/path')
73
+ * Bad: "Check the status of workers"
74
+ *
75
+ * @returns 1.0 if actionable tool calls with real values, 0.0 otherwise
76
+ */
77
+ export declare function scoreActionability(prompt: CompactionPrompt): ScorerResult;
78
+ /**
79
+ * Score coordinator identity reinforcement
80
+ *
81
+ * Validates that the prompt has STRONG coordinator identity reinforcement.
82
+ * Post-compaction coordinators lose their identity without visual+semantic cues.
83
+ *
84
+ * Checks:
85
+ * 1. ASCII box header (visual anchor)
86
+ * 2. Strong language (NEVER/ALWAYS, not "should"/"consider")
87
+ *
88
+ * @returns 1.0 for ASCII header + strong mandates, 0.5 for header only, 0.0 otherwise
89
+ */
90
+ export declare function scoreCoordinatorIdentity(prompt: CompactionPrompt): ScorerResult;
91
+ /**
92
+ * Score forbidden tools listing
93
+ *
94
+ * Validates that the prompt LISTS forbidden tools by name.
95
+ * Coordinators must know exactly which tools to avoid.
96
+ *
97
+ * Required forbidden tools:
98
+ * 1. Edit
99
+ * 2. Write
100
+ * 3. swarmmail_reserve (only workers reserve)
101
+ * 4. git commit (workers commit)
102
+ *
103
+ * @returns ratio of forbidden tools mentioned (0.0 to 1.0)
104
+ */
105
+ export declare function scoreForbiddenToolsPresent(prompt: CompactionPrompt): ScorerResult;
106
+ /**
107
+ * Score post-compaction discipline (first tool correctness)
108
+ *
109
+ * Validates that the FIRST suggested tool is correct.
110
+ * Coordinators should check status FIRST, not edit files.
111
+ *
112
+ * Good first tools:
113
+ * - swarm_status
114
+ * - swarmmail_inbox
115
+ *
116
+ * Bad first tools:
117
+ * - Edit
118
+ * - Write
119
+ * - Read (should check status first)
120
+ *
121
+ * @returns 1.0 if first tool is swarm_status or swarmmail_inbox, 0.0 otherwise
122
+ */
123
+ export declare function scorePostCompactionDiscipline(prompt: CompactionPrompt): ScorerResult;
124
+ //# sourceMappingURL=compaction-prompt-scoring.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compaction-prompt-scoring.d.ts","sourceRoot":"","sources":["../src/compaction-prompt-scoring.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAEH;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAChC,OAAO,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC5B,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;CAChB;AAID,iEAAiE;AACjE,eAAO,MAAM,YAAY,QAAqB,CAAC;AAE/C,0CAA0C;AAC1C,eAAO,MAAM,YAAY,UAKxB,CAAC;AAEF,yDAAyD;AACzD,eAAO,MAAM,SAAS,QAAiB,CAAC;AAExC,sCAAsC;AACtC,eAAO,MAAM,eAAe,UAAoD,CAAC;AAIjF;;;;;;;;GAQG;AACH,wBAAgB,sBAAsB,CAAC,MAAM,EAAE,gBAAgB,GAAG,YAAY,CAuB7E;AAED;;;;;;;;;GASG;AACH,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,gBAAgB,GAAG,YAAY,CA+BzE;AAED;;;;;;;;;;;GAWG;AACH,wBAAgB,wBAAwB,CACvC,MAAM,EAAE,gBAAgB,GACtB,YAAY,CA6Bd;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAgB,0BAA0B,CACzC,MAAM,EAAE,gBAAgB,GACtB,YAAY,CAiCd;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,6BAA6B,CAC5C,MAAM,EAAE,gBAAgB,GACtB,YAAY,CAiCd"}
@@ -70,7 +70,7 @@ export type PartialEvalRecord = Partial<EvalRecord> & {
70
70
  task: string;
71
71
  };
72
72
  /**
73
- * Coordinator Event - captures coordinator decisions, violations, and outcomes
73
+ * Coordinator Event - captures coordinator decisions, violations, outcomes, and compaction lifecycle events
74
74
  */
75
75
  export declare const CoordinatorEventSchema: z.ZodDiscriminatedUnion<[z.ZodObject<{
76
76
  session_id: z.ZodString;
@@ -108,6 +108,19 @@ export declare const CoordinatorEventSchema: z.ZodDiscriminatedUnion<[z.ZodObjec
108
108
  epic_complete: "epic_complete";
109
109
  }>;
110
110
  payload: z.ZodAny;
111
+ }, z.core.$strip>, z.ZodObject<{
112
+ session_id: z.ZodString;
113
+ epic_id: z.ZodString;
114
+ timestamp: z.ZodString;
115
+ event_type: z.ZodLiteral<"COMPACTION">;
116
+ compaction_type: z.ZodEnum<{
117
+ detection_complete: "detection_complete";
118
+ prompt_generated: "prompt_generated";
119
+ context_injected: "context_injected";
120
+ resumption_started: "resumption_started";
121
+ tool_call_tracked: "tool_call_tracked";
122
+ }>;
123
+ payload: z.ZodAny;
111
124
  }, z.core.$strip>], "event_type">;
112
125
  export type CoordinatorEvent = z.infer<typeof CoordinatorEventSchema>;
113
126
  /**
@@ -154,6 +167,19 @@ export declare const CoordinatorSessionSchema: z.ZodObject<{
154
167
  epic_complete: "epic_complete";
155
168
  }>;
156
169
  payload: z.ZodAny;
170
+ }, z.core.$strip>, z.ZodObject<{
171
+ session_id: z.ZodString;
172
+ epic_id: z.ZodString;
173
+ timestamp: z.ZodString;
174
+ event_type: z.ZodLiteral<"COMPACTION">;
175
+ compaction_type: z.ZodEnum<{
176
+ detection_complete: "detection_complete";
177
+ prompt_generated: "prompt_generated";
178
+ context_injected: "context_injected";
179
+ resumption_started: "resumption_started";
180
+ tool_call_tracked: "tool_call_tracked";
181
+ }>;
182
+ payload: z.ZodAny;
157
183
  }, z.core.$strip>], "event_type">>;
158
184
  }, z.core.$strip>;
159
185
  export type CoordinatorSession = z.infer<typeof CoordinatorSessionSchema>;
@@ -294,6 +320,60 @@ export declare function ensureSessionDir(): void;
294
320
  * Appends the event as a JSONL line to ~/.config/swarm-tools/sessions/{session_id}.jsonl
295
321
  */
296
322
  export declare function captureCoordinatorEvent(event: CoordinatorEvent): void;
323
+ /**
324
+ * Capture a compaction event to the session file
325
+ *
326
+ * Helper for capturing COMPACTION events with automatic timestamp generation.
327
+ * Tracks compaction hook lifecycle: detection → prompt generation → context injection → resumption → first tool call.
328
+ *
329
+ * **Part of eval-driven development pipeline:** Compaction events are used by `compaction-prompt.eval.ts`
330
+ * to score prompt quality (ID specificity, actionability, coordinator identity).
331
+ *
332
+ * **Lifecycle stages:**
333
+ * - `detection_complete` - Compaction detected (confidence level, context type)
334
+ * - `prompt_generated` - Continuation prompt created (FULL content stored for eval)
335
+ * - `context_injected` - Prompt injected into OpenCode context
336
+ * - `resumption_started` - Coordinator resumed from checkpoint
337
+ * - `tool_call_tracked` - First tool called post-compaction (measures discipline)
338
+ *
339
+ * @param params - Compaction event parameters
340
+ * @param params.session_id - Coordinator session ID
341
+ * @param params.epic_id - Epic ID being coordinated
342
+ * @param params.compaction_type - Stage of compaction lifecycle
343
+ * @param params.payload - Event-specific data (full prompt content, detection results, etc.)
344
+ *
345
+ * @example
346
+ * // Capture detection complete
347
+ * captureCompactionEvent({
348
+ * session_id: "session-123",
349
+ * epic_id: "bd-456",
350
+ * compaction_type: "detection_complete",
351
+ * payload: {
352
+ * confidence: "high",
353
+ * context_type: "full",
354
+ * epic_id: "bd-456",
355
+ * },
356
+ * });
357
+ *
358
+ * @example
359
+ * // Capture prompt generated (with full content for eval)
360
+ * captureCompactionEvent({
361
+ * session_id: "session-123",
362
+ * epic_id: "bd-456",
363
+ * compaction_type: "prompt_generated",
364
+ * payload: {
365
+ * prompt_length: 5000,
366
+ * full_prompt: "You are a coordinator...", // Full prompt, not truncated - used for quality scoring
367
+ * context_type: "full",
368
+ * },
369
+ * });
370
+ */
371
+ export declare function captureCompactionEvent(params: {
372
+ session_id: string;
373
+ epic_id: string;
374
+ compaction_type: "detection_complete" | "prompt_generated" | "context_injected" | "resumption_started" | "tool_call_tracked";
375
+ payload: any;
376
+ }): void;
297
377
  /**
298
378
  * Read all events from a session file
299
379
  */
@@ -1 +1 @@
1
- {"version":3,"file":"eval-capture.d.ts","sourceRoot":"","sources":["../src/eval-capture.ts"],"names":[],"mappings":"AAmBA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAMxB;;GAEG;AACH,eAAO,MAAM,oBAAoB;;;;;;;;;;iBAmB/B,CAAC;AACH,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,oBAAoB,CAAC,CAAC;AAElE;;GAEG;AACH,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBA2D3B,CAAC;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAE1D;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,OAAO,CAAC,UAAU,CAAC,GAAG;IACpD,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;CACd,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iCA2CjC,CAAC;AACH,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAEtE;;GAEG;AACH,eAAO,MAAM,wBAAwB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAMnC,CAAC;AACH,MAAM,MAAM,kBAAkB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,wBAAwB,CAAC,CAAC;AAM1E;;GAEG;AACH,eAAO,MAAM,sBAAsB,8BAA8B,CAAC;AAElE;;GAEG;AACH,wBAAgB,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,MAAM,CAE3D;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,WAAW,EAAE,MAAM,GAAG,IAAI,CAM3D;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,WAAW,EAAE,MAAM,EACnB,MAAM,EAAE,UAAU,GAAG,iBAAiB,GACrC,IAAI,CAKN;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,EAAE,CAajE;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,WAAW,EAAE,MAAM,GAAG,iBAAiB,EAAE,CAU3E;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,WAAW,EAAE,MAAM,EACnB,EAAE,EAAE,MAAM,EACV,OAAO,EAAE,OAAO,CAAC,UAAU,CAAC,GAC3B,OAAO,CAgBT;AAWD;;;;;GAKG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,EAAE;IAC3C,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,YAAY,GAAG,eAAe,GAAG,YAAY,GAAG,MAAM,CAAC;IACjE,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,EAAE,KAAK,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,KAAK,EAAE,MAAM,EAAE,CAAC;QAChB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;QACxB,oBAAoB,CAAC,EAAE,MAAM,CAAC;KAC/B,CAAC,CAAC;CACJ,GAAG,iBAAiB,CAsBpB;AAED;;;;;GAKG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE;IAC5C,MAAM,EAAE,MAAM,CAAC;
IACf,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,OAAO,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB,GAAG,IAAI,CAwBP;AAED;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,MAAM,EAAE;IACzC,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;CACrB,GAAG,UAAU,GAAG,IAAI,CAoDpB;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,EAAE;IAC3C,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,OAAO,CAAC;IAClB,QAAQ,EAAE,OAAO,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB,GAAG,IAAI,CAMP;AAMD;;;;GAIG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,GAAG,KAAK,CAAC;IAC3D,KAAK,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAC1C,QAAQ,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;QACzB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,CAAC;IACF,MAAM,EAAE,UAAU,CAAC;CACpB,CAAC,CAkBD;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,GAAG;IACrD,YAAY,EAAE,MAAM,CAAC;IACrB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,aAAa,EAAE,MAAM,CAAC;IACtB,gBAAgB,EAAE,MAAM,CAAC;IACzB,cAAc,EAAE,MAAM,CAAC;CACxB,CAuCA;AAMD;;GAEG;AACH,wBAAgB,aAAa,IAAI,MAAM,CAEtC;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAExD;AAED;;GAEG;AACH,wBAAgB,gBAAgB,IAAI,IAAI,CAKvC;AAED;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,gBAAgB,GAAG,IAAI,CAWrE;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,gBAAgB,EAAE,CAavE;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CAAC,MAAM,EAAE;IAClC,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;CACjB,GAAG,kBAAkB,GAAG,IAAI,CAoB5B"}
1
+ {"version":3,"file":"eval-capture.d.ts","sourceRoot":"","sources":["../src/eval-capture.ts"],"names":[],"mappings":"AAyBA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAMxB;;GAEG;AACH,eAAO,MAAM,oBAAoB;;;;;;;;;;iBAmB/B,CAAC;AACH,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,oBAAoB,CAAC,CAAC;AAElE;;GAEG;AACH,eAAO,MAAM,gBAAgB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBA2D3B,CAAC;AACH,MAAM,MAAM,UAAU,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,gBAAgB,CAAC,CAAC;AAE1D;;GAEG;AACH,MAAM,MAAM,iBAAiB,GAAG,OAAO,CAAC,UAAU,CAAC,GAAG;IACpD,EAAE,EAAE,MAAM,CAAC;IACX,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;CACd,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,sBAAsB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iCA0DjC,CAAC;AACH,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,sBAAsB,CAAC,CAAC;AAEtE;;GAEG;AACH,eAAO,MAAM,wBAAwB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;iBAMnC,CAAC;AACH,MAAM,MAAM,kBAAkB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,wBAAwB,CAAC,CAAC;AAM1E;;GAEG;AACH,eAAO,MAAM,sBAAsB,8BAA8B,CAAC;AAElE;;GAEG;AACH,wBAAgB,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,MAAM,CAE3D;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,WAAW,EAAE,MAAM,GAAG,IAAI,CAM3D;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,WAAW,EAAE,MAAM,EACnB,MAAM,EAAE,UAAU,GAAG,iBAAiB,GACrC,IAAI,CAKN;AAED;;GAEG;AACH,wBAAgB,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,UAAU,EAAE,CAajE;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,WAAW,EAAE,MAAM,GAAG,iBAAiB,EAAE,CAU3E;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAC9B,WAAW,EAAE,MAAM,EACnB,EAAE,EAAE,MAAM,EACV,OAAO,EAAE,OAAO,CAAC,UAAU,CAAC,GAC3B,OAAO,CAgBT;AAWD;;;;;GAKG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,EAAE;IAC3C,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,YAAY,GAAG,eAAe,GAAG,YAAY,GAAG,MAAM,CAAC;IACjE,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,EAAE,KAAK,CAAC;QACd,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,KAAK,EAAE,MAAM,EAAE,CAAC;QAChB,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;QACxB,oBAAoB,CAAC,EAAE,MAAM,CAAC;KAC/B,CAAC,CAAC;CACJ,GAAG,iBAAiB,CAsBpB;AAED;;;;;GAKG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE;
IAC5C,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,OAAO,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB,GAAG,IAAI,CAwBP;AAED;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,MAAM,EAAE;IACzC,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;CACrB,GAAG,UAAU,GAAG,IAAI,CAoDpB;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,EAAE;IAC3C,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,OAAO,CAAC;IAClB,QAAQ,EAAE,OAAO,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB,GAAG,IAAI,CAMP;AAMD;;;;GAIG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,GAAG,KAAK,CAAC;IAC3D,KAAK,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAC1C,QAAQ,EAAE;QACR,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;QACzB,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,CAAC;IACF,MAAM,EAAE,UAAU,CAAC;CACpB,CAAC,CAkBD;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,WAAW,EAAE,MAAM,GAAG;IACrD,YAAY,EAAE,MAAM,CAAC;IACrB,eAAe,EAAE,MAAM,CAAC;IACxB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,aAAa,EAAE,MAAM,CAAC;IACtB,gBAAgB,EAAE,MAAM,CAAC;IACzB,cAAc,EAAE,MAAM,CAAC;CACxB,CAuCA;AAMD;;GAEG;AACH,wBAAgB,aAAa,IAAI,MAAM,CAEtC;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAExD;AAED;;GAEG;AACH,wBAAgB,gBAAgB,IAAI,IAAI,CAKvC;AAED;;;;GAIG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,gBAAgB,GAAG,IAAI,CAWrE;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+CG;AACH,wBAAgB,sBAAsB,CAAC,MAAM,EAAE;IAC7C,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;IAChB,eAAe,EACX,oBAAoB,GACpB,kBAAkB,GAClB,kBAAkB,GAClB,oBAAoB,GACpB,mBAAmB,CAAC;IACxB,OAAO,EAAE,GAAG,CAAC;CACd,GAAG,IAAI,CAWP;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,GAAG,gBAAgB,EAAE,CAavE;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CAAC,MAAM,EAAE;IAClC,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,CAAC;CACjB,GAAG,kBAAkB,GAAG,IAAI,CAoB5B"}
@@ -0,0 +1,84 @@
1
+ /**
2
+ * Result from a gate check
3
+ */
4
+ export interface GateResult {
5
+ /** Whether the gate passed */
6
+ passed: boolean;
7
+ /** Current phase */
8
+ phase: "bootstrap" | "stabilization" | "production";
9
+ /** Human-readable message */
10
+ message: string;
11
+ /** Baseline score (mean of history) */
12
+ baseline?: number;
13
+ /** Current score */
14
+ currentScore: number;
15
+ /** Regression percentage (negative = improvement) */
16
+ regressionPercent?: number;
17
+ }
18
+ /**
19
+ * Configuration for gate thresholds
20
+ */
21
+ export interface GateConfig {
22
+ /** Regression threshold for stabilization phase (default: 0.1 = 10%) */
23
+ stabilizationThreshold?: number;
24
+ /** Regression threshold for production phase (default: 0.05 = 5%) */
25
+ productionThreshold?: number;
26
+ }
27
+ /**
28
+ * Default regression thresholds by phase
29
+ */
30
+ export declare const DEFAULT_THRESHOLDS: {
31
+ readonly stabilization: 0.1;
32
+ readonly production: 0.05;
33
+ };
34
+ /**
35
+ * Check if the current eval score passes the quality gate
36
+ *
37
+ * Progressive gates adapt based on data maturity:
38
+ * - **Bootstrap (<10 runs)**: Always pass, focus on collecting baseline data
39
+ * - **Stabilization (10-50 runs)**: Warn on >10% regression (default), but pass
40
+ * - **Production (>50 runs AND variance <0.1)**: Fail on >5% regression (default)
41
+ *
42
+ * **Baseline calculation**: Mean of all historical scores for this eval (not just last run).
43
+ *
44
+ * **Regression formula**: `(baseline - current) / baseline`
45
+ * - Positive = regression (score dropped)
46
+ * - Negative = improvement
47
+ * - Returns 0 if baseline is 0 (avoids division by zero)
48
+ *
49
+ * **Variance threshold (0.1)**: High variance keeps eval in stabilization phase even with >50 runs.
50
+ * This prevents premature production gates when scores are still unstable.
51
+ *
52
+ * **CI Integration**: Production gates can fail PRs. Use `swarm eval status` to check phase before merging.
53
+ *
54
+ * @param projectPath - Absolute path to project root (contains `.opencode/eval-history.jsonl`)
55
+ * @param evalName - Name of the eval (e.g., "swarm-decomposition", "coordinator-behavior")
56
+ * @param currentScore - Current score to check (typically 0-1 range)
57
+ * @param config - Optional threshold configuration (defaults: stabilization=0.1, production=0.05)
58
+ * @returns Gate check result with pass/fail, phase, baseline, regression details
59
+ *
60
+ * @example
61
+ * ```typescript
62
+ * import { checkGate } from "./eval-gates.js";
63
+ *
64
+ * const result = checkGate("/path/to/project", "swarm-decomposition", 0.89);
65
+ *
66
+ * if (!result.passed) {
67
+ * console.error(`❌ Gate FAILED: ${result.message}`);
68
+ * process.exit(1); // Fail CI
69
+ * }
70
+ *
71
+ * console.log(`✅ ${result.phase} phase: ${result.message}`);
72
+ * ```
73
+ *
74
+ * @example
75
+ * ```typescript
76
+ * // Custom thresholds for sensitive eval
77
+ * const result = checkGate("/path", "critical-eval", 0.92, {
78
+ * stabilizationThreshold: 0.05, // 5% threshold in stabilization
79
+ * productionThreshold: 0.02, // 2% threshold in production
80
+ * });
81
+ * ```
82
+ */
83
+ export declare function checkGate(projectPath: string, evalName: string, currentScore: number, config?: GateConfig): GateResult;
84
+ //# sourceMappingURL=eval-gates.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-gates.d.ts","sourceRoot":"","sources":["../src/eval-gates.ts"],"names":[],"mappings":"AAYA;;GAEG;AACH,MAAM,WAAW,UAAU;IAC1B,8BAA8B;IAC9B,MAAM,EAAE,OAAO,CAAC;IAChB,oBAAoB;IACpB,KAAK,EAAE,WAAW,GAAG,eAAe,GAAG,YAAY,CAAC;IACpD,6BAA6B;IAC7B,OAAO,EAAE,MAAM,CAAC;IAChB,uCAAuC;IACvC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,oBAAoB;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,qDAAqD;IACrD,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IAC1B,wEAAwE;IACxE,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAChC,qEAAqE;IACrE,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC7B;AAED;;GAEG;AACH,eAAO,MAAM,kBAAkB;;;CAGrB,CAAC;AAoCX;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgDG;AACH,wBAAgB,SAAS,CACxB,WAAW,EAAE,MAAM,EACnB,QAAQ,EAAE,MAAM,EAChB,YAAY,EAAE,MAAM,EACpB,MAAM,CAAC,EAAE,UAAU,GACjB,UAAU,CAiFZ"}
@@ -0,0 +1,117 @@
1
+ /**
2
+ * Progressive phases based on run count and variance
3
+ */
4
+ export type Phase = "bootstrap" | "stabilization" | "production";
5
+ /**
6
+ * Single eval run record
7
+ */
8
+ export interface EvalRunRecord {
9
+ /** ISO-8601 timestamp */
10
+ timestamp: string;
11
+ /** Name of the eval (e.g., "swarm-decomposition") */
12
+ eval_name: string;
13
+ /** Score (0-1 range typically) */
14
+ score: number;
15
+ /** Run count (monotonically increasing per eval) */
16
+ run_count: number;
17
+ }
18
+ /**
19
+ * Default path for eval history
20
+ */
21
+ export declare const DEFAULT_EVAL_HISTORY_PATH = ".opencode/eval-history.jsonl";
22
+ /**
23
+ * Variance threshold for production phase
24
+ */
25
+ export declare const VARIANCE_THRESHOLD = 0.1;
26
+ /**
27
+ * Run count thresholds for phase transitions
28
+ */
29
+ export declare const BOOTSTRAP_THRESHOLD = 10;
30
+ export declare const STABILIZATION_THRESHOLD = 50;
31
+ /**
32
+ * Get the eval history file path
33
+ */
34
+ export declare function getEvalHistoryPath(projectPath: string): string;
35
+ /**
36
+ * Ensure the eval history directory exists
37
+ */
38
+ export declare function ensureEvalHistoryDir(projectPath: string): void;
39
+ /**
40
+ * Record an eval run to JSONL history
41
+ *
42
+ * Appends atomically to `.opencode/eval-history.jsonl`. Each line is a complete JSON object
43
+ * representing one eval run (timestamp, eval name, score, run count).
44
+ *
45
+ * **Auto-creates directory** if `.opencode/` doesn't exist.
46
+ *
47
+ * **Thread-safe**: Uses `appendFileSync` for atomic writes (safe for concurrent eval runs within one process; atomicity across separate processes depends on the OS's append semantics).
48
+ *
49
+ * **Integration**: Called automatically by evalite runner after each eval completes.
50
+ * Also callable manually for custom eval tracking.
51
+ *
52
+ * @param projectPath - Absolute path to project root
53
+ * @param run - Eval run record with timestamp, eval_name, score, run_count
54
+ *
55
+ * @example
56
+ * ```typescript
57
+ * import { recordEvalRun } from "./eval-history.js";
58
+ *
59
+ * recordEvalRun("/path/to/project", {
60
+ * timestamp: new Date().toISOString(),
61
+ * eval_name: "swarm-decomposition",
62
+ * score: 0.92,
63
+ * run_count: 15,
64
+ * });
65
+ * ```
66
+ */
67
+ export declare function recordEvalRun(projectPath: string, run: EvalRunRecord): void;
68
+ /**
69
+ * Get score history for a specific eval
70
+ *
71
+ * Returns runs in chronological order (oldest first)
72
+ */
73
+ export declare function getScoreHistory(projectPath: string, evalName: string): EvalRunRecord[];
74
+ /**
75
+ * Calculate statistical variance of scores
76
+ *
77
+ * Variance = mean of squared deviations from the mean
78
+ * Formula: Σ((x - μ)²) / n
79
+ */
80
+ export declare function calculateVariance(scores: number[]): number;
81
+ /**
82
+ * Get the current phase for an eval based on run count and score variance
83
+ *
84
+ * Progressive phase logic ensures quality gates adapt to data maturity:
85
+ *
86
+ * - **Bootstrap (<10 runs)**: No gates, just collect baseline data
87
+ * - **Stabilization (10-50 runs)**: Warn on >10% regression (but pass)
88
+ * - **Production (>50 runs AND variance <0.1)**: Fail on >5% regression
89
+ *
90
+ * **Variance check**: If >50 runs but variance ≥0.1, stays in stabilization.
91
+ * This prevents premature production gates when scores are still unstable.
92
+ *
93
+ * **Why variance matters**: An eval with wildly fluctuating scores isn't ready for
94
+ * strict gates. Variance threshold (0.1) ensures the eval is consistent before
95
+ * enforcing production-level quality control.
96
+ *
97
+ * @param projectPath - Absolute path to project root (contains `.opencode/eval-history.jsonl`)
98
+ * @param evalName - Name of the eval (e.g., "swarm-decomposition")
99
+ * @returns Current phase: "bootstrap" | "stabilization" | "production"
100
+ *
101
+ * @example
102
+ * ```typescript
103
+ * import { getPhase } from "./eval-history.js";
104
+ *
105
+ * const phase = getPhase("/path/to/project", "swarm-decomposition");
106
+ *
107
+ * if (phase === "production") {
108
+ * console.log("🚀 Production phase - strict gates enabled");
109
+ * } else if (phase === "stabilization") {
110
+ * console.log("⚙️ Stabilization phase - warnings only");
111
+ * } else {
112
+ * console.log("🌱 Bootstrap phase - collecting data");
113
+ * }
114
+ * ```
115
+ */
116
+ export declare function getPhase(projectPath: string, evalName: string): Phase;
117
+ //# sourceMappingURL=eval-history.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-history.d.ts","sourceRoot":"","sources":["../src/eval-history.ts"],"names":[],"mappings":"AAaA;;GAEG;AACH,MAAM,MAAM,KAAK,GAAG,WAAW,GAAG,eAAe,GAAG,YAAY,CAAC;AAEjE;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,yBAAyB;IACzB,SAAS,EAAE,MAAM,CAAC;IAClB,qDAAqD;IACrD,SAAS,EAAE,MAAM,CAAC;IAClB,kCAAkC;IAClC,KAAK,EAAE,MAAM,CAAC;IACd,oDAAoD;IACpD,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,eAAO,MAAM,yBAAyB,iCAAiC,CAAC;AAExE;;GAEG;AACH,eAAO,MAAM,kBAAkB,MAAM,CAAC;AAEtC;;GAEG;AACH,eAAO,MAAM,mBAAmB,KAAK,CAAC;AACtC,eAAO,MAAM,uBAAuB,KAAK,CAAC;AAE1C;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,WAAW,EAAE,MAAM,GAAG,MAAM,CAE9D;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,WAAW,EAAE,MAAM,GAAG,IAAI,CAM9D;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AACH,wBAAgB,aAAa,CAC3B,WAAW,EAAE,MAAM,EACnB,GAAG,EAAE,aAAa,GACjB,IAAI,CAKN;AAoBD;;;;GAIG;AACH,wBAAgB,eAAe,CAC7B,WAAW,EAAE,MAAM,EACnB,QAAQ,EAAE,MAAM,GACf,aAAa,EAAE,CAIjB;AAED;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,CAa1D;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AACH,wBAAgB,QAAQ,CAAC,WAAW,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,KAAK,CAqBrE"}
@@ -0,0 +1,216 @@
1
+ /**
2
+ * Eval-to-Learning Feedback Loop
3
+ *
4
+ * Automatically stores eval failures to semantic memory for learning.
5
+ * When an eval score drops significantly below its rolling average (default threshold: 15%),
6
+ * stores context to semantic-memory with tags for future prompt generation.
7
+ *
8
+ * ## Usage
9
+ *
10
+ * ```typescript
11
+ * import { learnFromEvalFailure } from "./eval-learning";
12
+ * import { getMemoryAdapter } from "./memory-tools";
13
+ * import { getScoreHistory } from "./eval-history";
14
+ *
15
+ * const memoryAdapter = await getMemoryAdapter();
16
+ * const history = getScoreHistory(projectPath, "compaction-test");
17
+ *
18
+ * const result = await learnFromEvalFailure(
19
+ * "compaction-test",
20
+ * currentScore,
21
+ * history,
22
+ * memoryAdapter
23
+ * );
24
+ *
25
+ * if (result.triggered) {
26
+ * console.log(`📉 Regression detected: ${(result.drop_percentage * 100).toFixed(1)}% drop`);
27
+ * console.log(`Memory ID: ${result.memory_id}`);
28
+ * }
29
+ * ```
30
+ *
31
+ * ## Integration Points
32
+ *
33
+ * - **After each eval run**: Call to detect regressions automatically
34
+ * - **Memory tags**: `eval-failure`, `{eval-name}`, `regression`
35
+ * - **Future prompts**: Query memories with these tags for context
36
+ * - **Scorer context**: Optional detail about which scorer failed
37
+ *
38
+ * ## Customization
39
+ *
40
+ * ```typescript
41
+ * const customConfig = {
42
+ * dropThreshold: 0.10, // 10% threshold (more sensitive)
43
+ * windowSize: 10, // Last 10 runs for baseline
44
+ * };
45
+ *
46
+ * await learnFromEvalFailure(
47
+ * "test",
48
+ * score,
49
+ * history,
50
+ * adapter,
51
+ * { config: customConfig }
52
+ * );
53
+ * ```
54
+ *
55
+ * @module eval-learning
56
+ */
57
+ import type { EvalRunRecord } from "./eval-history";
58
+ import type { MemoryAdapter } from "./memory-tools";
59
+ /**
60
+ * Configuration for eval-to-learning feedback
61
+ */
62
+ export interface EvalLearningConfig {
63
+ /** Threshold for significant drop (0-1, default 0.15 = 15%) */
64
+ dropThreshold: number;
65
+ /** Rolling average window size (default 5 runs) */
66
+ windowSize: number;
67
+ }
68
+ /**
69
+ * Default configuration
70
+ */
71
+ export declare const DEFAULT_EVAL_LEARNING_CONFIG: EvalLearningConfig;
72
+ /**
73
+ * Result from learning check
74
+ */
75
+ export interface LearningResult {
76
+ /** Whether the check triggered memory storage */
77
+ triggered: boolean;
78
+ /** Baseline score from rolling average */
79
+ baseline: number;
80
+ /** Current score */
81
+ current: number;
82
+ /** Drop from baseline expressed as a fraction, not a percent (0-1, e.g., 0.20 = 20% drop) */
83
+ drop_percentage: number;
84
+ /** Memory ID if stored */
85
+ memory_id?: string;
86
+ }
87
+ /**
88
+ * Calculate rolling average of recent scores
89
+ *
90
+ * Uses last N runs (default 5) to establish baseline.
91
+ * If the history is shorter than the window, all available runs are used.
92
+ *
93
+ * @param history - Score history (chronological order)
94
+ * @param windowSize - Number of recent runs to average (default 5)
95
+ * @returns Average score (0 if empty)
96
+ */
97
+ export declare function calculateRollingAverage(history: EvalRunRecord[], windowSize?: number): number;
98
+ /**
99
+ * Check if current score is a significant drop from baseline
100
+ *
101
+ * Significant = drop meets or exceeds threshold (default 15%).
102
+ * Formula: (baseline - current) / baseline >= threshold
103
+ *
104
+ * @param currentScore - Current eval score
105
+ * @param baseline - Baseline score (rolling average)
106
+ * @param threshold - Drop threshold (default 0.15 = 15%)
107
+ * @returns True if drop is significant
108
+ */
109
+ export declare function isSignificantDrop(currentScore: number, baseline: number, threshold?: number): boolean;
110
+ /**
111
+ * Format failure context for semantic memory storage
112
+ *
113
+ * Creates human-readable description of the failure with
114
+ * quantified metrics and optional scorer context.
115
+ *
116
+ * @param evalName - Name of eval that failed
117
+ * @param currentScore - Current score
118
+ * @param baseline - Baseline score
119
+ * @param scorerContext - Optional context about which scorer failed
120
+ * @returns Formatted context string
121
+ */
122
+ export declare function formatFailureContext(evalName: string, currentScore: number, baseline: number, scorerContext?: string): string;
123
+ /**
124
+ * Main learning function - automatically stores eval failures to semantic memory
125
+ *
126
+ * **Closed-loop learning**: When eval scores drop significantly from baseline,
127
+ * this function stores failure context to semantic memory. Future prompt generation
128
+ * queries these memories for context, preventing repeated mistakes.
129
+ *
130
+ * **Trigger condition**: Score drops by at least 15% (default) from the rolling-average baseline.
131
+ * Uses the last 5 runs (default) to establish the baseline, not just the previous run.
132
+ *
133
+ * **What gets stored**:
134
+ * - Eval name, baseline score, current score, drop percentage
135
+ * - Scorer-specific context (which scorer failed, why)
136
+ * - Timestamp and metadata for querying
137
+ * - Tags: `eval-failure`, `{eval-name}`, `regression`
138
+ *
139
+ * **Future use**: Before generating prompts for the same eval, query semantic memory
140
+ * with tags to inject learnings from past failures.
141
+ *
142
+ * **Integration points**:
143
+ * - After each eval run (in evalite runner or CI)
144
+ * - In `checkGate()` when regression detected
145
+ * - Manual calls for custom eval tracking
146
+ *
147
+ * @param evalName - Name of eval (e.g., "compaction-test", "coordinator-behavior")
148
+ * @param currentScore - Current eval score (typically 0-1 range)
149
+ * @param history - Score history in chronological order (oldest first)
150
+ * @param memoryAdapter - Semantic memory adapter (from `getMemoryAdapter()`)
151
+ * @param options - Optional config (thresholds, window size) and scorer context
152
+ * @param options.config - Custom thresholds (dropThreshold, windowSize)
153
+ * @param options.scorerContext - Details about which scorer failed (for context)
154
+ * @returns Learning result with trigger status, baseline, drop percentage, memory ID
155
+ *
156
+ * @example
157
+ * ```typescript
158
+ * import { learnFromEvalFailure } from "./eval-learning.js";
159
+ * import { getMemoryAdapter } from "./memory-tools.js";
160
+ * import { getScoreHistory } from "./eval-history.js";
161
+ *
162
+ * const memoryAdapter = await getMemoryAdapter();
163
+ * const history = getScoreHistory("/path/to/project", "coordinator-behavior");
164
+ *
165
+ * const result = await learnFromEvalFailure(
166
+ * "coordinator-behavior",
167
+ * 0.68, // Current score
168
+ * history,
169
+ * memoryAdapter,
170
+ * { scorerContext: "violationCount: 5 violations (coordinator edited files)" }
171
+ * );
172
+ *
173
+ * if (result.triggered) {
174
+ * console.log(`📉 Regression detected: ${(result.drop_percentage * 100).toFixed(1)}% drop`);
175
+ * console.log(`Stored to memory: ${result.memory_id}`);
176
+ * }
177
+ * ```
178
+ *
179
+ * @example
180
+ * ```typescript
181
+ * // Custom threshold (more sensitive)
182
+ * const result = await learnFromEvalFailure(
183
+ * "critical-eval",
184
+ * 0.85,
185
+ * history,
186
+ * memoryAdapter,
187
+ * {
188
+ * config: {
189
+ * dropThreshold: 0.10, // 10% threshold (default is 15%)
190
+ * windowSize: 10, // Last 10 runs for baseline (default is 5)
191
+ * },
192
+ * }
193
+ * );
194
+ * ```
195
+ */
196
+ export declare function learnFromEvalFailure(evalName: string, currentScore: number, history: EvalRunRecord[], memoryAdapter: MemoryAdapter, options?: {
197
+ config?: EvalLearningConfig;
198
+ scorerContext?: string;
199
+ }): Promise<LearningResult>;
200
+ /**
201
+ * Create custom learning config with specific threshold
202
+ *
203
+ * Helper for common use case: custom drop threshold.
204
+ *
205
+ * @param dropThreshold - Drop threshold (0-1)
206
+ * @param windowSize - Optional window size (default 5)
207
+ * @returns Custom config
208
+ *
209
+ * @example
210
+ * ```typescript
211
+ * const config = createLearningConfig(0.10); // 10% threshold
212
+ * await learnFromEvalFailure("test", score, history, adapter, { config });
213
+ * ```
214
+ */
215
+ export declare function createLearningConfig(dropThreshold: number, windowSize?: number): EvalLearningConfig;
216
+ //# sourceMappingURL=eval-learning.d.ts.map