opencode-swarm-plugin 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +27 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +182 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +881 -0
  9. package/bin/swarm.ts +686 -0
  10. package/dist/compaction-hook.d.ts +8 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-observability.d.ts +173 -0
  13. package/dist/compaction-observability.d.ts.map +1 -0
  14. package/dist/compaction-prompt-scoring.d.ts +124 -0
  15. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  16. package/dist/eval-capture.d.ts +174 -1
  17. package/dist/eval-capture.d.ts.map +1 -1
  18. package/dist/eval-gates.d.ts +84 -0
  19. package/dist/eval-gates.d.ts.map +1 -0
  20. package/dist/eval-history.d.ts +117 -0
  21. package/dist/eval-history.d.ts.map +1 -0
  22. package/dist/eval-learning.d.ts +216 -0
  23. package/dist/eval-learning.d.ts.map +1 -0
  24. package/dist/hive.d.ts.map +1 -1
  25. package/dist/index.d.ts +80 -1
  26. package/dist/index.d.ts.map +1 -1
  27. package/dist/index.js +16098 -651
  28. package/dist/plugin.js +16012 -756
  29. package/dist/post-compaction-tracker.d.ts +133 -0
  30. package/dist/post-compaction-tracker.d.ts.map +1 -0
  31. package/dist/schemas/task.d.ts +3 -3
  32. package/dist/swarm-orchestrate.d.ts +23 -0
  33. package/dist/swarm-orchestrate.d.ts.map +1 -1
  34. package/dist/swarm-prompts.d.ts +25 -1
  35. package/dist/swarm-prompts.d.ts.map +1 -1
  36. package/dist/swarm.d.ts +4 -0
  37. package/dist/swarm.d.ts.map +1 -1
  38. package/evals/README.md +702 -105
  39. package/evals/compaction-prompt.eval.ts +149 -0
  40. package/evals/coordinator-behavior.eval.ts +8 -8
  41. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  42. package/evals/lib/compaction-loader.test.ts +248 -0
  43. package/evals/lib/compaction-loader.ts +320 -0
  44. package/evals/lib/data-loader.test.ts +345 -0
  45. package/evals/lib/data-loader.ts +107 -6
  46. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  47. package/evals/scorers/compaction-scorers.ts +13 -13
  48. package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
  49. package/evals/scorers/coordinator-discipline.ts +348 -15
  50. package/evals/scorers/index.test.ts +146 -0
  51. package/evals/scorers/index.ts +104 -0
  52. package/evals/swarm-decomposition.eval.ts +9 -2
  53. package/examples/commands/swarm.md +291 -21
  54. package/examples/plugin-wrapper-template.ts +117 -0
  55. package/package.json +7 -5
  56. package/scripts/migrate-unknown-sessions.ts +349 -0
  57. package/src/compaction-capture.integration.test.ts +257 -0
  58. package/src/compaction-hook.test.ts +42 -0
  59. package/src/compaction-hook.ts +315 -86
  60. package/src/compaction-observability.integration.test.ts +139 -0
  61. package/src/compaction-observability.test.ts +187 -0
  62. package/src/compaction-observability.ts +324 -0
  63. package/src/compaction-prompt-scorers.test.ts +299 -0
  64. package/src/compaction-prompt-scoring.ts +298 -0
  65. package/src/eval-capture.test.ts +626 -1
  66. package/src/eval-capture.ts +286 -2
  67. package/src/eval-gates.test.ts +306 -0
  68. package/src/eval-gates.ts +218 -0
  69. package/src/eval-history.test.ts +508 -0
  70. package/src/eval-history.ts +214 -0
  71. package/src/eval-learning.test.ts +378 -0
  72. package/src/eval-learning.ts +360 -0
  73. package/src/eval-runner.test.ts +96 -0
  74. package/src/eval-runner.ts +356 -0
  75. package/src/hive.ts +34 -0
  76. package/src/index.ts +115 -2
  77. package/src/memory.test.ts +110 -0
  78. package/src/memory.ts +34 -0
  79. package/src/post-compaction-tracker.test.ts +251 -0
  80. package/src/post-compaction-tracker.ts +237 -0
  81. package/src/swarm-decompose.ts +2 -2
  82. package/src/swarm-orchestrate.ts +2 -2
  83. package/src/swarm-prompts.ts +2 -2
  84. package/src/swarm-review.ts +3 -3
  85. package/dist/beads.d.ts +0 -386
  86. package/dist/beads.d.ts.map +0 -1
  87. package/dist/schemas/bead-events.d.ts +0 -698
  88. package/dist/schemas/bead-events.d.ts.map +0 -1
  89. package/dist/schemas/bead.d.ts +0 -255
  90. package/dist/schemas/bead.d.ts.map +0 -1
  91. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
package/.env ADDED
@@ -0,0 +1,2 @@
1
+ NPM_1P_ITEM=yeu4tbknx5crxmudtu3pfg3eba
2
+ AI_GATEWAY_API_KEY=vck_2w2KCfF5YskBaxnsIaOqnr87kAOIyL6HpPwtLCTWtn7DFyKXEP4IJsKA
@@ -0,0 +1,26 @@
1
+ {
2
+ "compaction-prompt": {
3
+ "passed": true,
4
+ "phase": "bootstrap",
5
+ "message": "Bootstrap phase (1/10 runs) - collecting data",
6
+ "currentScore": 0.85
7
+ },
8
+ "coordinator-behavior": {
9
+ "passed": true,
10
+ "phase": "bootstrap",
11
+ "message": "Bootstrap phase (1/10 runs) - collecting data",
12
+ "currentScore": 0.85
13
+ },
14
+ "coordinator-session": {
15
+ "passed": true,
16
+ "phase": "bootstrap",
17
+ "message": "Bootstrap phase (1/10 runs) - collecting data",
18
+ "currentScore": 0.85
19
+ },
20
+ "swarm-decomposition": {
21
+ "passed": true,
22
+ "phase": "bootstrap",
23
+ "message": "Bootstrap phase (1/10 runs) - collecting data",
24
+ "currentScore": 0.85
25
+ }
26
+ }
@@ -26,3 +26,30 @@
26
26
  {"id":"opencode-swarm-plugin--ys7z8-mjkn5xp1blq","title":"Wire captureSubtaskOutcome() into swarm_complete","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-24T23:25:06.133Z","updated_at":"2025-12-24T23:52:01.496Z","closed_at":"2025-12-24T23:52:01.496Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkn5xocowf","dependencies":[],"labels":[],"comments":[]}
27
27
  {"id":"opencode-swarm-plugin--ys7z8-mjkn5xp41f2","title":"Wire finalizeEvalRecord() into swarm_record_outcome","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-24T23:25:06.136Z","updated_at":"2025-12-24T23:52:02.719Z","closed_at":"2025-12-24T23:52:02.719Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkn5xocowf","dependencies":[],"labels":[],"comments":[]}
28
28
  {"id":"opencode-swarm-plugin--ys7z8-mjkn5xp793w","title":"Add eval scripts to package.json and update README","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-24T23:25:06.139Z","updated_at":"2025-12-24T23:52:04.385Z","closed_at":"2025-12-24T23:52:04.385Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkn5xocowf","dependencies":[],"labels":[],"comments":[]}
29
+ {"id":"opencode-swarm-plugin--ys7z8-mjkvvysb1bk","title":"Fix Eval Pipeline: Database Table + Scorer API","description":"Two fixes: 1) Ensure eval_records table is created when swarm-mail database initializes, 2) Fix composite scorer API usage in evalite tests","status":"closed","priority":1,"issue_type":"epic","created_at":"2025-12-25T03:29:17.531Z","updated_at":"2025-12-25T03:42:14.497Z","closed_at":"2025-12-25T03:42:14.497Z","dependencies":[],"labels":[],"comments":[]}
30
+ {"id":"opencode-swarm-plugin--ys7z8-mjkvvysjyrv","title":"Verify eval_records table creation in swarm-mail","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T03:29:17.539Z","updated_at":"2025-12-25T03:40:16.396Z","closed_at":"2025-12-25T03:40:16.396Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkvvysb1bk","dependencies":[],"labels":[],"comments":[]}
31
+ {"id":"opencode-swarm-plugin--ys7z8-mjkvvysl8ye","title":"Fix composite scorer API in coordinator-discipline.ts","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T03:29:17.541Z","updated_at":"2025-12-25T03:40:17.922Z","closed_at":"2025-12-25T03:40:17.922Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkvvysb1bk","dependencies":[],"labels":[],"comments":[]}
32
+ {"id":"opencode-swarm-plugin--ys7z8-mjkvvysnzae","title":"Fix composite scorer API in compaction-scorers.ts","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T03:29:17.543Z","updated_at":"2025-12-25T03:40:19.200Z","closed_at":"2025-12-25T03:40:19.200Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkvvysb1bk","dependencies":[],"labels":[],"comments":[]}
33
+ {"id":"opencode-swarm-plugin--ys7z8-mjkvvysrwgk","title":"Fix composite scorer API in coordinator-behavior.eval.ts","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T03:29:17.547Z","updated_at":"2025-12-25T03:42:04.249Z","closed_at":"2025-12-25T03:42:04.249Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkvvysb1bk","dependencies":[],"labels":[],"comments":[]}
34
+ {"id":"opencode-swarm-plugin--ys7z8-mjkwt9rqf2s","title":"Fix session ID propagation in eval capture","description":"Session IDs not flowing to captureCoordinatorEvent - 82% of events orphaned in unknown.jsonl. Root cause: swarm tools use process.env.OPENCODE_SESSION_ID which is not set, instead of ctx.sessionID which IS available.","status":"closed","priority":1,"issue_type":"epic","created_at":"2025-12-25T03:55:11.414Z","updated_at":"2025-12-25T04:14:23.283Z","closed_at":"2025-12-25T04:14:23.283Z","dependencies":[],"labels":[],"comments":[]}
35
+ {"id":"opencode-swarm-plugin--ys7z8-mjkwt9rzlw3","title":"Add sessionId parameter to captureCoordinatorEvent and update call sites","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-25T03:55:11.423Z","updated_at":"2025-12-25T04:05:28.792Z","closed_at":"2025-12-25T04:05:28.792Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkwt9rqf2s","dependencies":[],"labels":[],"comments":[]}
36
+ {"id":"opencode-swarm-plugin--ys7z8-mjkwt9s2boa","title":"Create migration script to re-attribute unknown.jsonl events to proper sessions","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T03:55:11.426Z","updated_at":"2025-12-25T04:05:29.764Z","closed_at":"2025-12-25T04:05:29.764Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkwt9rqf2s","dependencies":[],"labels":[],"comments":[]}
37
+ {"id":"opencode-swarm-plugin--ys7z8-mjkwt9s6xoa","title":"Run migration and verify data integrity","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T03:55:11.430Z","updated_at":"2025-12-25T04:14:16.676Z","closed_at":"2025-12-25T04:14:16.676Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkwt9rqf2s","dependencies":[],"labels":[],"comments":[]}
38
+ {"id":"opencode-swarm-plugin--ys7z8-mjkyhrqmecc","title":"Add quality gate filters to eval session loader","description":"Filter eval sessions by quality signals: minEvents >= 3, hasWorkerSpawn, hasReviewCompleted. Currently 67 of 82 sessions are noise (<3 events). Quality gate will keep ~15 high-signal sessions.","status":"closed","priority":1,"issue_type":"epic","created_at":"2025-12-25T04:42:14.062Z","updated_at":"2025-12-25T04:49:40.809Z","closed_at":"2025-12-25T04:49:40.809Z","dependencies":[],"labels":[],"comments":[]}
39
+ {"id":"opencode-swarm-plugin--ys7z8-mjkyhrr2qm7","title":"Add quality filter options to loadCapturedSessions with TDD","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-25T04:42:14.078Z","updated_at":"2025-12-25T04:49:39.904Z","closed_at":"2025-12-25T04:49:39.904Z","parent_id":"opencode-swarm-plugin--ys7z8-mjkyhrqmecc","dependencies":[],"labels":[],"comments":[]}
40
+ {"id":"opencode-swarm-plugin--ys7z8-mjl04zmvv7c","title":"Eval System Improvements: Tool + Event Capture + Scorers","description":"Improve eval system with:\n1. Plugin tool for running evals (eval_run)\n2. Capture decomposition_complete events\n3. Capture VIOLATION events\n4. Improve compaction prompt structure\n5. Add review efficiency scorer\n6. Enforce knowledge gathering validation\n\nTarget: 70% → 85% overall eval score","status":"open","priority":1,"issue_type":"epic","created_at":"2025-12-25T05:28:16.999Z","updated_at":"2025-12-25T05:28:16.999Z","dependencies":[],"labels":[],"comments":[]}
41
+ {"id":"opencode-swarm-plugin--ys7z8-mjl04zn4u31","title":"Add eval_run plugin tool","status":"open","priority":1,"issue_type":"task","created_at":"2025-12-25T05:28:17.008Z","updated_at":"2025-12-25T05:28:17.008Z","parent_id":"opencode-swarm-plugin--ys7z8-mjl04zmvv7c","dependencies":[],"labels":[],"comments":[]}
42
+ {"id":"opencode-swarm-plugin--ys7z8-mjl04znglws","title":"Capture VIOLATION events for coordinator discipline","status":"open","priority":1,"issue_type":"task","created_at":"2025-12-25T05:28:17.020Z","updated_at":"2025-12-25T05:28:17.020Z","parent_id":"opencode-swarm-plugin--ys7z8-mjl04zmvv7c","dependencies":[],"labels":[],"comments":[]}
43
+ {"id":"opencode-swarm-plugin--ys7z8-mjl04znlxzw","title":"Improve compaction prompt structure","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-25T05:28:17.025Z","updated_at":"2025-12-25T05:28:17.025Z","parent_id":"opencode-swarm-plugin--ys7z8-mjl04zmvv7c","dependencies":[],"labels":[],"comments":[]}
44
+ {"id":"opencode-swarm-plugin--ys7z8-mjl04zn8by5","title":"Capture decomposition_complete event","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-25T05:28:17.012Z","updated_at":"2025-12-25T05:38:07.026Z","closed_at":"2025-12-25T05:38:07.026Z","parent_id":"opencode-swarm-plugin--ys7z8-mjl04zmvv7c","dependencies":[],"labels":[],"comments":[]}
45
+ {"id":"opencode-swarm-plugin--ys7z8-mjl04znn0uk","title":"Add review efficiency scorer","status":"in_progress","priority":2,"issue_type":"task","created_at":"2025-12-25T05:28:17.027Z","updated_at":"2025-12-25T05:37:03.084Z","parent_id":"opencode-swarm-plugin--ys7z8-mjl04zmvv7c","dependencies":[],"labels":[],"comments":[]}
46
+ {"id":"opencode-swarm-plugin--ys7z8-mjl04znqie9","title":"Update spawnEfficiency scorer fallback","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-25T05:28:17.030Z","updated_at":"2025-12-25T05:29:34.561Z","closed_at":"2025-12-25T05:29:34.561Z","parent_id":"opencode-swarm-plugin--ys7z8-mjl04zmvv7c","dependencies":[],"labels":[],"comments":[]}
47
+ {"id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","title":"ADR-009: Pattern Catalog and Innovation Documentation","description":"Comprehensive analysis of opencode-swarm-plugin to document all patterns, innovations, and ideas. Each research worker analyzes a specific domain, then a synthesis worker consolidates findings into ADR-009.","status":"open","priority":1,"issue_type":"epic","created_at":"2025-12-25T14:24:21.120Z","updated_at":"2025-12-25T14:24:21.120Z","dependencies":[],"labels":[],"comments":[]}
48
+ {"id":"opencode-swarm-plugin--ys7z8-mjljadmu3bx","title":"Research: Learning Systems (confidence decay, pattern maturity, anti-patterns)","status":"open","priority":1,"issue_type":"task","created_at":"2025-12-25T14:24:21.126Z","updated_at":"2025-12-25T14:24:21.126Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
49
+ {"id":"opencode-swarm-plugin--ys7z8-mjljadmw66u","title":"Research: Swarm Coordination (decomposition, orchestration, review, worktree)","status":"open","priority":1,"issue_type":"task","created_at":"2025-12-25T14:24:21.128Z","updated_at":"2025-12-25T14:24:21.128Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
50
+ {"id":"opencode-swarm-plugin--ys7z8-mjljadmyadr","title":"Research: Memory & Context Preservation (compaction, semantic memory)","status":"open","priority":1,"issue_type":"task","created_at":"2025-12-25T14:24:21.130Z","updated_at":"2025-12-25T14:24:21.130Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
51
+ {"id":"opencode-swarm-plugin--ys7z8-mjljadn06xp","title":"Research: Observability & Evaluation (logging, eval capture, gates)","status":"open","priority":1,"issue_type":"task","created_at":"2025-12-25T14:24:21.132Z","updated_at":"2025-12-25T14:24:21.132Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
52
+ {"id":"opencode-swarm-plugin--ys7z8-mjljadn1c2k","title":"Research: Skills System & Knowledge Injection","status":"open","priority":1,"issue_type":"task","created_at":"2025-12-25T14:24:21.133Z","updated_at":"2025-12-25T14:24:21.133Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
53
+ {"id":"opencode-swarm-plugin--ys7z8-mjljadn7knk","title":"Research: Mandates, Guardrails & Structured Output","status":"open","priority":1,"issue_type":"task","created_at":"2025-12-25T14:24:21.139Z","updated_at":"2025-12-25T14:24:21.139Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
54
+ {"id":"opencode-swarm-plugin--ys7z8-mjljadn8e6u","title":"Research: Existing ADRs & Documentation Gaps","status":"open","priority":1,"issue_type":"task","created_at":"2025-12-25T14:24:21.140Z","updated_at":"2025-12-25T14:24:21.140Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
55
+ {"id":"opencode-swarm-plugin--ys7z8-mjljadnaj6o","title":"Synthesize: Write ADR-009 Pattern Catalog","status":"open","priority":0,"issue_type":"task","created_at":"2025-12-25T14:24:21.142Z","updated_at":"2025-12-25T14:24:21.142Z","parent_id":"opencode-swarm-plugin--ys7z8-mjljadmo9mg","dependencies":[],"labels":[],"comments":[]}
@@ -546,4 +546,26 @@
546
546
  {"id":"mem_mjkifog0_kyrf1i8","information":"Findable test memory with unique keyword xyztest123","created_at":"2025-12-24T21:12:42.624Z"}
547
547
  {"id":"mem_mjkifrmb_cfzpsbl","information":"Test memory for adapter wiring verification","created_at":"2025-12-24T21:12:46.739Z","tags":"test,memory"}
548
548
  {"id":"mem_mjkifrp8_6p3hyc0","information":"OAuth refresh tokens need 5min buffer before expiry","created_at":"2025-12-24T21:12:46.844Z","metadata":"{\"raw\":\"auth,tokens,oauth\"}","tags":"auth,integration-test"}
549
- {"id":"mem_mjkifrty_n2obcci","information":"Smoke test verified full tool adapter wiring works end-to-end","created_at":"2025-12-24T21:12:47.014Z","tags":"test,verification"}
549
+ {"id":"mem_mjkifrty_n2obcci","information":"Smoke test verified full tool adapter wiring works end-to-end","created_at":"2025-12-24T21:12:47.014Z","tags":"test,verification"}
550
+ {"id":"mem_mjkvzysv_sc2t9vz","information":"Test memory for tools integration","created_at":"2025-12-25T03:32:24.175Z","tags":"test"}
551
+ {"id":"mem_mjkvzzi6_1p6e6a9","information":"Findable test memory with unique keyword xyztest123","created_at":"2025-12-25T03:32:25.086Z"}
552
+ {"id":"mem_mjkw8n77_qjdsp7f","information":"Test memory for tools integration","created_at":"2025-12-25T03:39:09.043Z","tags":"test"}
553
+ {"id":"mem_mjkw8njx_i8h8cyh","information":"Findable test memory with unique keyword xyztest123","created_at":"2025-12-25T03:39:09.501Z"}
554
+ {"id":"mem_mjkw8rmk_f6hitx1","information":"Test memory for adapter wiring verification","created_at":"2025-12-25T03:39:14.780Z","tags":"test,memory"}
555
+ {"id":"mem_mjkw8rpm_lje9arh","information":"OAuth refresh tokens need 5min buffer before expiry","created_at":"2025-12-25T03:39:14.890Z","metadata":"{\"raw\":\"auth,tokens,oauth\"}","tags":"auth,integration-test"}
556
+ {"id":"mem_mjkw8rtm_adjnpml","information":"Smoke test verified full tool adapter wiring works end-to-end","created_at":"2025-12-25T03:39:15.034Z","tags":"test,verification"}
557
+ {"id":"mem_mjkwmbkm_33rhosw","information":"Test memory for tools integration","created_at":"2025-12-25T03:49:47.158Z","tags":"test"}
558
+ {"id":"mem_mjkwmc55_9oi3pyz","information":"Findable test memory with unique keyword xyztest123","created_at":"2025-12-25T03:49:47.897Z"}
559
+ {"id":"mem_mjkwmg5h_07q5cqq","information":"Test memory for adapter wiring verification","created_at":"2025-12-25T03:49:53.093Z","tags":"test,memory"}
560
+ {"id":"mem_mjkwmg9a_evvx6t6","information":"OAuth refresh tokens need 5min buffer before expiry","created_at":"2025-12-25T03:49:53.230Z","metadata":"{\"raw\":\"auth,tokens,oauth\"}","tags":"auth,integration-test"}
561
+ {"id":"mem_mjkwmge4_2pkurm7","information":"Smoke test verified full tool adapter wiring works end-to-end","created_at":"2025-12-25T03:49:53.404Z","tags":"test,verification"}
562
+ {"id":"mem_mjkx05sw_izlcsfs","information":"Test memory for tools integration","created_at":"2025-12-25T04:00:32.864Z","tags":"test"}
563
+ {"id":"mem_mjkx067y_b9hn5qi","information":"Findable test memory with unique keyword xyztest123","created_at":"2025-12-25T04:00:33.406Z"}
564
+ {"id":"mem_mjkx09hf_ygskd44","information":"Test memory for adapter wiring verification","created_at":"2025-12-25T04:00:37.635Z","tags":"test,memory"}
565
+ {"id":"mem_mjkx09lg_hwd8wid","information":"OAuth refresh tokens need 5min buffer before expiry","created_at":"2025-12-25T04:00:37.780Z","metadata":"{\"raw\":\"auth,tokens,oauth\"}","tags":"auth,integration-test"}
566
+ {"id":"mem_mjkx09p9_lc3whf6","information":"Smoke test verified full tool adapter wiring works end-to-end","created_at":"2025-12-25T04:00:37.917Z","tags":"test,verification"}
567
+ {"id":"mem_mjkxgljy_xvyprn1","information":"Test memory for tools integration","created_at":"2025-12-25T04:13:19.774Z","tags":"test"}
568
+ {"id":"mem_mjkxglqg_5ojok3n","information":"Findable test memory with unique keyword xyztest123","created_at":"2025-12-25T04:13:20.008Z"}
569
+ {"id":"mem_mjkxgogk_48pml1f","information":"Test memory for adapter wiring verification","created_at":"2025-12-25T04:13:23.540Z","tags":"test,memory"}
570
+ {"id":"mem_mjkxgomk_mm0hvqg","information":"OAuth refresh tokens need 5min buffer before expiry","created_at":"2025-12-25T04:13:23.756Z","metadata":"{\"raw\":\"auth,tokens,oauth\"}","tags":"auth,integration-test"}
571
+ {"id":"mem_mjkxgopz_mqvrw0z","information":"Smoke test verified full tool adapter wiring works end-to-end","created_at":"2025-12-25T04:13:23.879Z","tags":"test,verification"}
@@ -0,0 +1,12 @@
1
+ {"timestamp":"2025-12-25T04:28:42.041Z","eval_name":"compaction-prompt","score":0.85,"run_count":1}
2
+ {"timestamp":"2025-12-25T04:28:42.041Z","eval_name":"coordinator-behavior","score":0.85,"run_count":1}
3
+ {"timestamp":"2025-12-25T04:28:42.042Z","eval_name":"coordinator-session","score":0.85,"run_count":1}
4
+ {"timestamp":"2025-12-25T04:28:42.042Z","eval_name":"swarm-decomposition","score":0.85,"run_count":1}
5
+ {"timestamp":"2025-12-25T04:28:52.405Z","eval_name":"compaction-prompt","score":0.85,"run_count":2}
6
+ {"timestamp":"2025-12-25T04:28:52.405Z","eval_name":"coordinator-behavior","score":0.85,"run_count":2}
7
+ {"timestamp":"2025-12-25T04:28:52.405Z","eval_name":"coordinator-session","score":0.85,"run_count":2}
8
+ {"timestamp":"2025-12-25T04:28:52.405Z","eval_name":"swarm-decomposition","score":0.85,"run_count":2}
9
+ {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"compaction-prompt","score":0.85,"run_count":3}
10
+ {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"coordinator-behavior","score":0.85,"run_count":3}
11
+ {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"coordinator-session","score":0.85,"run_count":3}
12
+ {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"swarm-decomposition","score":0.85,"run_count":3}
package/CHANGELOG.md CHANGED
@@ -1,5 +1,187 @@
1
1
  # opencode-swarm-plugin
2
2
 
3
+ ## 0.40.0
4
+
5
+ ### Minor Changes
6
+
7
+ - [`948e031`](https://github.com/joelhooks/swarm-tools/commit/948e0318fe5e2c1a5d695a56533fc2a2a7753887) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🔭 Observability Swarm: See What the Bees Are Doing
8
+
9
+ > "The unexamined swarm is not worth coordinating." — Socrates, probably
10
+
11
+ Four parallel workers descended on the observability stack and emerged victorious. The compaction hook no longer runs in darkness, coordinator sessions are now viewable, and the docs finally explain what all those JSONL files are for.
12
+
13
+ ### What's New
14
+
15
+ **Compaction Observability** (`src/compaction-observability.ts`)
16
+
17
+ - Metrics collector tracks phases: START → GATHER → DETECT → INJECT → COMPLETE
18
+ - Pattern extraction/skipping with reasons ("why didn't this get captured?")
19
+ - Timing breakdown per phase (analysis vs extraction vs storage)
20
+ - 15 tests (11 unit + 4 integration)
21
+
22
+ **`swarm log sessions` CLI**
23
+
24
+ - `swarm log sessions` — list all captured coordinator sessions
25
+ - `swarm log sessions <id>` — view events for a session (partial ID matching)
26
+ - `swarm log sessions --latest` — quick access to most recent
27
+ - `--type`, `--since`, `--limit`, `--json` filters
28
+ - 64 tests covering parsing, listing, filtering
29
+
30
+ **Coordinator Observability Docs**
31
+
32
+ - AGENTS.md: overview with quick commands
33
+ - evals/README.md: deep dive with ASCII flow diagrams, event type reference, JSONL examples, jq recipes
34
+
35
+ **Research: Coordinator Prompt Eval** (`.hive/analysis/coordinator-prompt-eval-research.md`)
36
+
37
+ - 26KB analysis of prompt iteration strategies
38
+ - Recommends: versioning + evalite (defer LLM-as-Judge to v0.34+)
39
+ - Implementation plan with effort estimates
40
+
41
+ ### The Observability Story
42
+
43
+ ```
44
+ CAPTURE ──────────► VIEW ──────────► SCORE
45
+ (eval-capture.ts) (swarm log (coordinator
46
+ sessions) evals)
47
+ ```
48
+
49
+ Now you can answer:
50
+
51
+ - "What did the last 10 compaction runs extract?"
52
+ - "Why didn't this pattern get captured?"
53
+ - "Which coordinator sessions had violations?"
54
+
55
+ ## 0.39.1
56
+
57
+ ### Patch Changes
58
+
59
+ - [`19a6557`](https://github.com/joelhooks/swarm-tools/commit/19a6557cee9878858e7f61e2aba86b37a3ec10ad) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🐝 Eval Quality Gates: Signal Over Noise
60
+
61
+ The eval system now filters coordinator sessions to focus on high-quality data.
62
+
63
+ **Problem:** 67 of 82 captured sessions had <3 events - noise from aborted runs, test pokes, and incomplete swarms. This diluted eval scores and made metrics unreliable.
64
+
65
+ **Solution:** Quality filters applied BEFORE sampling:
66
+
67
+ | Filter | Default | Purpose |
68
+ | -------------------- | ------- | --------------------------------- |
69
+ | `minEvents` | 3 | Skip incomplete/aborted sessions |
70
+ | `requireWorkerSpawn` | true | Ensure coordinator delegated work |
71
+ | `requireReview` | true | Ensure full swarm lifecycle |
72
+
73
+ **Impact:**
74
+
75
+ - Filters 93 noisy sessions automatically
76
+ - Overall eval score: 63% → 71% (true signal, not diluted)
77
+ - Coordinator discipline: 47% → 57% (accurate measurement)
78
+
79
+ **Usage:**
80
+
81
+ ```typescript
82
+ // Default: high-quality sessions only
83
+ const sessions = await loadCapturedSessions();
84
+
85
+ // Override for specific analysis
86
+ const allSessions = await loadCapturedSessions({
87
+ minEvents: 1,
88
+ requireWorkerSpawn: false,
89
+ requireReview: false,
90
+ });
91
+ ```
92
+
93
+ Includes 7 unit tests covering filter logic and edge cases.
94
+
95
+ ## 0.39.0
96
+
97
+ ### Minor Changes
98
+
99
+ - [`aa12943`](https://github.com/joelhooks/swarm-tools/commit/aa12943f3edc8d5e23878b22f44073e4c71367c5) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🐝 Eval-Driven Development: The System That Scores Itself
100
+
101
+ > "What gets measured gets managed." — Peter Drucker
102
+ > "What gets scored gets improved." — The Swarm
103
+
104
+ The plugin now evaluates its own output quality through a progressive gate system. Every compaction prompt gets scored, tracked, and learned from. Regressions become impossible to ignore.
105
+
106
+ ### The Pipeline
107
+
108
+ ```
109
+ CAPTURE → SCORE → STORE → GATE → LEARN → IMPROVE
110
+ ↑ ↓
111
+ └──────────────────────────────────────┘
112
+ ```
113
+
114
+ ### What's New
115
+
116
+ **Event Capture** (5 integration points)
117
+
118
+ - `detection_triggered` - When compaction is detected
119
+ - `prompt_generated` - Full LLM prompt captured
120
+ - `context_injected` - Final content before injection
121
+ - All events stored to `~/.config/swarm-tools/sessions/{session_id}.jsonl`
122
+
123
+ **5 Compaction Prompt Scorers**
124
+
125
+ - `epicIdSpecificity` - Real IDs, not placeholders (20%)
126
+ - `actionability` - Specific tool calls with values (20%)
127
+ - `coordinatorIdentity` - ASCII header + mandates (25%)
128
+ - `forbiddenToolsPresent` - Lists what NOT to do (15%)
129
+ - `postCompactionDiscipline` - First tool is correct (20%)
130
+
131
+ **Progressive Gates**
132
+ | Phase | Threshold | Behavior |
133
+ |-------|-----------|----------|
134
+ | Bootstrap | N/A | Always pass, building baseline |
135
+ | Stabilization | 0.6 | Warn but pass |
136
+ | Production | 0.7 | Fail CI on regression |
137
+
138
+ **CLI Commands**
139
+
140
+ ```bash
141
+ swarm eval status # Current phase, thresholds, scores
142
+ swarm eval history # Trends with sparklines ▁▂▃▄▅▆▇█
143
+ swarm eval run [--ci] # Execute evals, gate check
144
+ ```
145
+
146
+ **CI Integration**
147
+
148
+ - Runs after tests pass
149
+ - Posts results as PR comment with emoji status
150
+ - Only fails in production phase with actual regression
151
+
152
+ **Learning Feedback Loop**
153
+
154
+ - Significant score drops auto-stored to semantic memory
155
+ - Future agents learn from past failures
156
+ - Pattern maturity tracking
157
+
158
+ ### Breaking Changes
159
+
160
+ None. All new functionality is additive.
161
+
162
+ ### Files Changed
163
+
164
+ - `src/eval-capture.ts` - Event capture with Zod schemas
165
+ - `src/eval-gates.ts` - Progressive gate logic
166
+ - `src/eval-history.ts` - Score tracking over time
167
+ - `src/eval-learning.ts` - Failure-to-learning extraction
168
+ - `src/compaction-prompt-scoring.ts` - 5 pure scoring functions
169
+ - `evals/compaction-prompt.eval.ts` - Evalite integration
170
+ - `bin/swarm.ts` - CLI commands
171
+ - `.github/workflows/ci.yml` - CI integration
172
+
173
+ ### Test Coverage
174
+
175
+ - 422 new tests for eval-capture
176
+ - 48 CLI tests
177
+ - 7 integration tests for capture wiring
178
+ - All existing tests still passing
179
+
180
+ ### Patch Changes
181
+
182
+ - Updated dependencies [[`aa12943`](https://github.com/joelhooks/swarm-tools/commit/aa12943f3edc8d5e23878b22f44073e4c71367c5)]:
183
+ - swarm-mail@1.5.2
184
+
3
185
  ## 0.38.0
4
186
 
5
187
  ### Minor Changes
package/README.md CHANGED
@@ -242,27 +242,44 @@ bun run eval:run
242
242
  # Run specific suites
243
243
  bun run eval:decomposition # Task decomposition quality
244
244
  bun run eval:coordinator # Coordinator protocol compliance
245
+ bun run eval:compaction # Compaction prompt quality
246
+
247
+ # Check eval status (progressive gates)
248
+ swarm eval status [eval-name]
249
+
250
+ # View history with trends
251
+ swarm eval history
252
+ ```
253
+
254
+ **Progressive Gates:**
255
+
256
+ ```
257
+ Phase Runs Gate Behavior
258
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
259
+ Bootstrap <10 ✅ Always pass (collect data)
260
+ Stabilization 10-50 ⚠️ Warn on >10% regression
261
+ Production >50 ❌ Fail on >5% regression
245
262
  ```
246
263
 
247
264
  **What gets evaluated:**
248
265
 
249
- | Eval Suite | Measures | Data Source |
250
- |------------|----------|-------------|
251
- | `swarm-decomposition` | Subtask independence, complexity balance, coverage, clarity | Fixtures + captured real decompositions |
252
- | `coordinator-session` | Violation count, spawn efficiency, review thoroughness | Real sessions from `~/.config/swarm-tools/sessions/` |
266
+ | Eval Suite | Measures | Data Source |
267
+ | --------------------- | ------------------------------------------------------------- | ------------------------------------------------ |
268
+ | `swarm-decomposition` | Subtask independence, complexity balance, coverage, clarity | Fixtures + `.opencode/eval-data.jsonl` |
269
+ | `coordinator-session` | Violation count, spawn efficiency, review thoroughness | `~/.config/swarm-tools/sessions/*.jsonl` |
270
+ | `compaction-prompt` | ID specificity, actionability, identity, forbidden tools | Session compaction events |
271
+
272
+ **Learning Feedback Loop:**
273
+
274
+ When eval scores drop >15% from baseline, failure context is automatically stored to semantic memory. Future prompts query these learnings for context.
253
275
 
254
276
  **Data capture locations:**
255
277
  - Decomposition inputs/outputs: `.opencode/eval-data.jsonl`
278
+ - Eval history: `.opencode/eval-history.jsonl`
256
279
  - Coordinator sessions: `~/.config/swarm-tools/sessions/*.jsonl`
257
- - Subtask outcomes: swarm-mail database (used for pattern learning)
258
-
259
- **Custom scorers:**
260
- - Subtask independence (0-1): Files don't overlap between subtasks
261
- - Complexity balance (0-1): Subtasks have similar estimated complexity
262
- - Coverage completeness (0-1): Required files are covered
263
- - Instruction clarity (0-1): Descriptions are specific and actionable
280
+ - Subtask outcomes: swarm-mail database
264
281
 
265
- See [evals/README.md](./evals/README.md) for scorer details and how to write new evals.
282
+ See **[evals/README.md](./evals/README.md)** for full architecture, scorer details, CI integration, and how to write new evals.
266
283
 
267
284
  ---
268
285