opencode-swarm-plugin 0.40.0 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
  2. package/.hive/analysis/session-data-quality-audit.md +320 -0
  3. package/.hive/eval-results.json +481 -24
  4. package/.hive/issues.jsonl +65 -16
  5. package/.hive/memories.jsonl +159 -1
  6. package/.opencode/eval-history.jsonl +315 -0
  7. package/.turbo/turbo-build.log +5 -5
  8. package/CHANGELOG.md +155 -0
  9. package/README.md +2 -0
  10. package/SCORER-ANALYSIS.md +598 -0
  11. package/bin/eval-gate.test.ts +158 -0
  12. package/bin/eval-gate.ts +74 -0
  13. package/bin/swarm.test.ts +661 -732
  14. package/bin/swarm.ts +274 -0
  15. package/dist/compaction-hook.d.ts +7 -5
  16. package/dist/compaction-hook.d.ts.map +1 -1
  17. package/dist/compaction-prompt-scoring.d.ts +1 -0
  18. package/dist/compaction-prompt-scoring.d.ts.map +1 -1
  19. package/dist/eval-runner.d.ts +134 -0
  20. package/dist/eval-runner.d.ts.map +1 -0
  21. package/dist/hive.d.ts.map +1 -1
  22. package/dist/index.d.ts +29 -0
  23. package/dist/index.d.ts.map +1 -1
  24. package/dist/index.js +99741 -58858
  25. package/dist/memory-tools.d.ts +70 -2
  26. package/dist/memory-tools.d.ts.map +1 -1
  27. package/dist/memory.d.ts +37 -0
  28. package/dist/memory.d.ts.map +1 -1
  29. package/dist/observability-tools.d.ts +64 -0
  30. package/dist/observability-tools.d.ts.map +1 -1
  31. package/dist/plugin.js +99356 -58318
  32. package/dist/swarm-orchestrate.d.ts.map +1 -1
  33. package/dist/swarm-prompts.d.ts +32 -1
  34. package/dist/swarm-prompts.d.ts.map +1 -1
  35. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
  36. package/evals/ARCHITECTURE.md +1189 -0
  37. package/evals/example.eval.ts +3 -4
  38. package/evals/fixtures/compaction-prompt-cases.ts +6 -0
  39. package/evals/scorers/coordinator-discipline.ts +0 -253
  40. package/evals/swarm-decomposition.eval.ts +4 -2
  41. package/package.json +4 -3
  42. package/src/compaction-prompt-scorers.test.ts +10 -9
  43. package/src/compaction-prompt-scoring.ts +7 -5
  44. package/src/eval-runner.test.ts +128 -1
  45. package/src/eval-runner.ts +46 -0
  46. package/src/hive.ts +43 -42
  47. package/src/memory-tools.test.ts +84 -0
  48. package/src/memory-tools.ts +68 -3
  49. package/src/memory.test.ts +2 -112
  50. package/src/memory.ts +88 -49
  51. package/src/observability-tools.test.ts +13 -0
  52. package/src/observability-tools.ts +277 -0
  53. package/src/swarm-orchestrate.test.ts +162 -0
  54. package/src/swarm-orchestrate.ts +7 -5
  55. package/src/swarm-prompts.test.ts +168 -4
  56. package/src/swarm-prompts.ts +228 -7
  57. package/.env +0 -2
  58. package/.turbo/turbo-test.log +0 -481
  59. package/.turbo/turbo-typecheck.log +0 -1
@@ -10,3 +10,318 @@
10
10
  {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"coordinator-behavior","score":0.85,"run_count":3}
11
11
  {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"coordinator-session","score":0.85,"run_count":3}
12
12
  {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"swarm-decomposition","score":0.85,"run_count":3}
13
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":1}
14
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":1}
15
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":1}
16
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Swarm Decomposition Quality","score":0.7213888888888889,"run_count":1}
17
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":1}
18
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":1}
19
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":1}
20
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"No False Positives","score":1,"run_count":1}
21
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Example: Basic scorer test","score":1,"run_count":1}
22
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":1}
23
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Placeholder Detection","score":0,"run_count":1}
24
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Generic Instructions Fail","score":0,"run_count":1}
25
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"First Tool Discipline","score":0,"run_count":1}
26
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":1}
27
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":1}
28
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Coordinator Behavior After Compaction","score":0.8645833333333333,"run_count":1}
29
+ {"timestamp":"2025-12-25T16:30:43.088Z","eval_name":"Example: Basic scorer test","score":1,"run_count":2}
30
+ {"timestamp":"2025-12-25T16:30:43.202Z","eval_name":"Example: Basic scorer test","score":1,"run_count":3}
31
+ {"timestamp":"2025-12-25T16:30:43.316Z","eval_name":"Example: Basic scorer test","score":1,"run_count":4}
32
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":2}
33
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":2}
34
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":2}
35
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Swarm Decomposition Quality","score":0.6748148148148146,"run_count":2}
36
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":2}
37
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":2}
38
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":2}
39
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"No False Positives","score":1,"run_count":2}
40
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Example: Basic scorer test","score":1,"run_count":5}
41
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":2}
42
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":2}
43
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":2}
44
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Placeholder Detection","score":0,"run_count":2}
45
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Generic Instructions Fail","score":0,"run_count":2}
46
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"First Tool Discipline","score":0,"run_count":2}
47
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":2}
48
+ {"timestamp":"2025-12-25T16:31:17.874Z","eval_name":"Example: Basic scorer test","score":1,"run_count":6}
49
+ {"timestamp":"2025-12-25T16:31:17.995Z","eval_name":"Example: Basic scorer test","score":1,"run_count":7}
50
+ {"timestamp":"2025-12-25T16:31:18.113Z","eval_name":"Example: Basic scorer test","score":1,"run_count":8}
51
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":3}
52
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":3}
53
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":3}
54
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Swarm Decomposition Quality","score":0.6988888888888889,"run_count":3}
55
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":3}
56
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":3}
57
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":3}
58
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"No False Positives","score":1,"run_count":3}
59
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Example: Basic scorer test","score":1,"run_count":9}
60
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":3}
61
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":3}
62
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Placeholder Detection","score":0,"run_count":3}
63
+ {"timestamp":"2025-12-25T16:32:49.808Z","eval_name":"Generic Instructions Fail","score":0,"run_count":3}
64
+ {"timestamp":"2025-12-25T16:32:49.808Z","eval_name":"First Tool Discipline","score":0,"run_count":3}
65
+ {"timestamp":"2025-12-25T16:32:49.808Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":3}
66
+ {"timestamp":"2025-12-25T16:32:49.808Z","eval_name":"Coordinator Resists Direct Implementation","score":0.9375,"run_count":3}
67
+ {"timestamp":"2025-12-25T16:32:49.943Z","eval_name":"Example: Basic scorer test","score":1,"run_count":10}
68
+ {"timestamp":"2025-12-25T16:32:50.073Z","eval_name":"Example: Basic scorer test","score":1,"run_count":11}
69
+ {"timestamp":"2025-12-25T16:32:50.199Z","eval_name":"Example: Basic scorer test","score":1,"run_count":12}
70
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":4}
71
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":4}
72
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":4}
73
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Swarm Decomposition Quality","score":0.6798611111111109,"run_count":4}
74
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":4}
75
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":4}
76
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":4}
77
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"No False Positives","score":1,"run_count":4}
78
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Example: Basic scorer test","score":1,"run_count":13}
79
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":4}
80
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":4}
81
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Placeholder Detection","score":0,"run_count":4}
82
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Generic Instructions Fail","score":0,"run_count":4}
83
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"First Tool Discipline","score":0,"run_count":4}
84
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Coordinator Resists Direct Implementation","score":0.9375,"run_count":4}
85
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Coordinator Behavior After Compaction","score":0.8645833333333333,"run_count":4}
86
+ {"timestamp":"2025-12-25T16:35:31.221Z","eval_name":"Example: Basic scorer test","score":1,"run_count":14}
87
+ {"timestamp":"2025-12-25T16:35:31.329Z","eval_name":"Example: Basic scorer test","score":1,"run_count":15}
88
+ {"timestamp":"2025-12-25T16:35:31.444Z","eval_name":"Example: Basic scorer test","score":1,"run_count":16}
89
+ {"timestamp":"2025-12-25T16:35:31.685Z","eval_name":"Example: Basic scorer test","score":1,"run_count":17}
90
+ {"timestamp":"2025-12-25T16:35:31.843Z","eval_name":"Example: Basic scorer test","score":1,"run_count":18}
91
+ {"timestamp":"2025-12-25T16:35:31.962Z","eval_name":"Example: Basic scorer test","score":1,"run_count":19}
92
+ {"timestamp":"2025-12-25T16:35:32.076Z","eval_name":"Example: Basic scorer test","score":1,"run_count":20}
93
+ {"timestamp":"2025-12-25T16:36:03.596Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":5}
94
+ {"timestamp":"2025-12-25T16:36:03.596Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":5}
95
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":5}
96
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Swarm Decomposition Quality","score":0.6845833333333333,"run_count":5}
97
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":5}
98
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":5}
99
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":5}
100
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"No False Positives","score":1,"run_count":5}
101
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Example: Basic scorer test","score":1,"run_count":21}
102
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":5}
103
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":5}
104
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Placeholder Detection","score":0,"run_count":5}
105
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Generic Instructions Fail","score":0,"run_count":5}
106
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"First Tool Discipline","score":0,"run_count":5}
107
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":5}
108
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":5}
109
+ {"timestamp":"2025-12-25T16:36:03.727Z","eval_name":"Example: Basic scorer test","score":1,"run_count":22}
110
+ {"timestamp":"2025-12-25T16:36:03.842Z","eval_name":"Example: Basic scorer test","score":1,"run_count":23}
111
+ {"timestamp":"2025-12-25T16:36:03.951Z","eval_name":"Example: Basic scorer test","score":1,"run_count":24}
112
+ {"timestamp":"2025-12-25T16:36:04.183Z","eval_name":"Example: Basic scorer test","score":1,"run_count":25}
113
+ {"timestamp":"2025-12-25T16:36:04.330Z","eval_name":"Example: Basic scorer test","score":1,"run_count":26}
114
+ {"timestamp":"2025-12-25T16:36:04.445Z","eval_name":"Example: Basic scorer test","score":1,"run_count":27}
115
+ {"timestamp":"2025-12-25T16:36:04.555Z","eval_name":"Example: Basic scorer test","score":1,"run_count":28}
116
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":6}
117
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":6}
118
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":6}
119
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Swarm Decomposition Quality","score":0.6852777777777778,"run_count":6}
120
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":6}
121
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":6}
122
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"No False Positives","score":1,"run_count":6}
123
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Example: Basic scorer test","score":1,"run_count":29}
124
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":6}
125
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":6}
126
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Placeholder Detection","score":0,"run_count":6}
127
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Generic Instructions Fail","score":0,"run_count":6}
128
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"First Tool Discipline","score":0,"run_count":6}
129
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Coordinator Behavior After Compaction","score":0.978125,"run_count":6}
130
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":6}
131
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":6}
132
+ {"timestamp":"2025-12-25T16:36:32.305Z","eval_name":"Example: Basic scorer test","score":1,"run_count":30}
133
+ {"timestamp":"2025-12-25T16:36:32.416Z","eval_name":"Example: Basic scorer test","score":1,"run_count":31}
134
+ {"timestamp":"2025-12-25T16:36:32.527Z","eval_name":"Example: Basic scorer test","score":1,"run_count":32}
135
+ {"timestamp":"2025-12-25T16:36:32.755Z","eval_name":"Example: Basic scorer test","score":1,"run_count":33}
136
+ {"timestamp":"2025-12-25T16:36:32.957Z","eval_name":"Example: Basic scorer test","score":1,"run_count":34}
137
+ {"timestamp":"2025-12-25T16:36:33.071Z","eval_name":"Example: Basic scorer test","score":1,"run_count":35}
138
+ {"timestamp":"2025-12-25T16:36:33.180Z","eval_name":"Example: Basic scorer test","score":1,"run_count":36}
139
+ {"timestamp":"2025-12-25T16:38:02.146Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":7}
140
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":7}
141
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":7}
142
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Swarm Decomposition Quality","score":0.6726388888888888,"run_count":7}
143
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":7}
144
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":7}
145
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":7}
146
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"No False Positives","score":1,"run_count":7}
147
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Example: Basic scorer test","score":1,"run_count":37}
148
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":7}
149
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":7}
150
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Placeholder Detection","score":0,"run_count":7}
151
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Generic Instructions Fail","score":0,"run_count":7}
152
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"First Tool Discipline","score":0,"run_count":7}
153
+ {"timestamp":"2025-12-25T16:38:02.148Z","eval_name":"Coordinator Behavior After Compaction","score":0.8645833333333333,"run_count":7}
154
+ {"timestamp":"2025-12-25T16:38:02.148Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":7}
155
+ {"timestamp":"2025-12-25T16:38:02.276Z","eval_name":"Example: Basic scorer test","score":1,"run_count":38}
156
+ {"timestamp":"2025-12-25T16:38:02.428Z","eval_name":"Example: Basic scorer test","score":1,"run_count":39}
157
+ {"timestamp":"2025-12-25T16:38:02.547Z","eval_name":"Example: Basic scorer test","score":1,"run_count":40}
158
+ {"timestamp":"2025-12-25T16:38:02.782Z","eval_name":"Example: Basic scorer test","score":1,"run_count":41}
159
+ {"timestamp":"2025-12-25T16:38:02.933Z","eval_name":"Example: Basic scorer test","score":1,"run_count":42}
160
+ {"timestamp":"2025-12-25T16:38:03.050Z","eval_name":"Example: Basic scorer test","score":1,"run_count":43}
161
+ {"timestamp":"2025-12-25T16:38:03.165Z","eval_name":"Example: Basic scorer test","score":1,"run_count":44}
162
+ {"timestamp":"2025-12-25T16:38:52.756Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":8}
163
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":8}
164
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":8}
165
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Swarm Decomposition Quality","score":0.695,"run_count":8}
166
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":8}
167
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":8}
168
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":8}
169
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"No False Positives","score":1,"run_count":8}
170
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Example: Basic scorer test","score":1,"run_count":45}
171
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":8}
172
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Placeholder Detection","score":0,"run_count":8}
173
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Generic Instructions Fail","score":0,"run_count":8}
174
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"First Tool Discipline","score":0,"run_count":8}
175
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":8}
176
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Coordinator Behavior After Compaction","score":0.9526041666666667,"run_count":8}
177
+ {"timestamp":"2025-12-25T16:38:52.758Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":8}
178
+ {"timestamp":"2025-12-25T16:38:52.903Z","eval_name":"Example: Basic scorer test","score":1,"run_count":46}
179
+ {"timestamp":"2025-12-25T16:38:53.020Z","eval_name":"Example: Basic scorer test","score":1,"run_count":47}
180
+ {"timestamp":"2025-12-25T16:38:53.136Z","eval_name":"Example: Basic scorer test","score":1,"run_count":48}
181
+ {"timestamp":"2025-12-25T16:38:53.367Z","eval_name":"Example: Basic scorer test","score":1,"run_count":49}
182
+ {"timestamp":"2025-12-25T16:38:53.511Z","eval_name":"Example: Basic scorer test","score":1,"run_count":50}
183
+ {"timestamp":"2025-12-25T16:38:53.624Z","eval_name":"Example: Basic scorer test","score":1,"run_count":51}
184
+ {"timestamp":"2025-12-25T16:38:53.737Z","eval_name":"Example: Basic scorer test","score":1,"run_count":52}
185
+ {"timestamp":"2025-12-25T16:40:39.219Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":9}
186
+ {"timestamp":"2025-12-25T16:40:39.219Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":9}
187
+ {"timestamp":"2025-12-25T16:40:39.219Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":9}
188
+ {"timestamp":"2025-12-25T16:40:39.219Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":9}
189
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Swarm Decomposition Quality","score":0.7020833333333334,"run_count":9}
190
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":9}
191
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"No False Positives","score":1,"run_count":9}
192
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":9}
193
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Example: Basic scorer test","score":1,"run_count":53}
194
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Coordinator Behavior After Compaction","score":0.7291666666666666,"run_count":9}
195
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":9}
196
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":9}
197
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":9}
198
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Placeholder Detection","score":0,"run_count":9}
199
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Generic Instructions Fail","score":0,"run_count":9}
200
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"First Tool Discipline","score":0,"run_count":9}
201
+ {"timestamp":"2025-12-25T16:40:39.352Z","eval_name":"Example: Basic scorer test","score":1,"run_count":54}
202
+ {"timestamp":"2025-12-25T16:40:39.460Z","eval_name":"Example: Basic scorer test","score":1,"run_count":55}
203
+ {"timestamp":"2025-12-25T16:40:39.572Z","eval_name":"Example: Basic scorer test","score":1,"run_count":56}
204
+ {"timestamp":"2025-12-25T16:40:39.816Z","eval_name":"Example: Basic scorer test","score":1,"run_count":57}
205
+ {"timestamp":"2025-12-25T16:40:39.947Z","eval_name":"Example: Basic scorer test","score":1,"run_count":58}
206
+ {"timestamp":"2025-12-25T16:40:40.084Z","eval_name":"Example: Basic scorer test","score":1,"run_count":59}
207
+ {"timestamp":"2025-12-25T16:40:40.202Z","eval_name":"Example: Basic scorer test","score":1,"run_count":60}
208
+ {"timestamp":"2025-12-25T16:43:12.851Z","eval_name":"Example: Basic scorer test","score":1,"run_count":61}
209
+ {"timestamp":"2025-12-25T16:43:43.041Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":10}
210
+ {"timestamp":"2025-12-25T16:43:43.041Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":10}
211
+ {"timestamp":"2025-12-25T16:43:43.041Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":10}
212
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Swarm Decomposition Quality","score":0.6909722222222222,"run_count":10}
213
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":10}
214
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":10}
215
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":10}
216
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"No False Positives","score":1,"run_count":10}
217
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Example: Basic scorer test","score":1,"run_count":62}
218
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":10}
219
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Placeholder Detection","score":0,"run_count":10}
220
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Generic Instructions Fail","score":0,"run_count":10}
221
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"First Tool Discipline","score":0,"run_count":10}
222
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":10}
223
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":10}
224
+ {"timestamp":"2025-12-25T16:43:43.043Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":10}
225
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":11}
226
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":11}
227
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":11}
228
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Swarm Decomposition Quality","score":0.6720833333333333,"run_count":11}
229
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":11}
230
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Example: Basic scorer test","score":1,"run_count":63}
231
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":11}
232
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":11}
233
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"No False Positives","score":1,"run_count":11}
234
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Coordinator Behavior After Compaction","score":0.9796875,"run_count":11}
235
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":11}
236
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":11}
237
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":11}
238
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Placeholder Detection","score":0,"run_count":11}
239
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Generic Instructions Fail","score":0,"run_count":11}
240
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"First Tool Discipline","score":0,"run_count":11}
241
+ {"timestamp":"2025-12-25T16:49:55.548Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":12}
242
+ {"timestamp":"2025-12-25T16:49:55.549Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":12}
243
+ {"timestamp":"2025-12-25T16:49:55.555Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":12}
244
+ {"timestamp":"2025-12-25T16:49:55.555Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":12}
245
+ {"timestamp":"2025-12-25T16:49:55.555Z","eval_name":"Swarm Decomposition Quality","score":0.7001388888888888,"run_count":12}
246
+ {"timestamp":"2025-12-25T16:49:55.555Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":12}
247
+ {"timestamp":"2025-12-25T16:49:55.556Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":12}
248
+ {"timestamp":"2025-12-25T16:49:55.557Z","eval_name":"No False Positives","score":1,"run_count":12}
249
+ {"timestamp":"2025-12-25T16:49:55.557Z","eval_name":"Example: Basic scorer test","score":1,"run_count":64}
250
+ {"timestamp":"2025-12-25T16:49:55.557Z","eval_name":"Generic Instructions Fail","score":0,"run_count":12}
251
+ {"timestamp":"2025-12-25T16:49:55.557Z","eval_name":"First Tool Discipline","score":0,"run_count":12}
252
+ {"timestamp":"2025-12-25T16:49:55.561Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":12}
253
+ {"timestamp":"2025-12-25T16:49:55.561Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":12}
254
+ {"timestamp":"2025-12-25T16:49:55.561Z","eval_name":"Placeholder Detection","score":0,"run_count":12}
255
+ {"timestamp":"2025-12-25T16:49:55.561Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":12}
256
+ {"timestamp":"2025-12-25T16:49:55.565Z","eval_name":"Coordinator Resists Direct Implementation","score":0.9375,"run_count":12}
257
+ {"timestamp":"2025-12-25T16:49:55.697Z","eval_name":"Example: Basic scorer test","score":1,"run_count":65}
258
+ {"timestamp":"2025-12-25T16:49:55.813Z","eval_name":"Example: Basic scorer test","score":1,"run_count":66}
259
+ {"timestamp":"2025-12-25T16:49:55.934Z","eval_name":"Example: Basic scorer test","score":1,"run_count":67}
260
+ {"timestamp":"2025-12-25T16:49:56.178Z","eval_name":"Example: Basic scorer test","score":1,"run_count":68}
261
+ {"timestamp":"2025-12-25T16:49:56.327Z","eval_name":"Example: Basic scorer test","score":1,"run_count":69}
262
+ {"timestamp":"2025-12-25T16:49:56.446Z","eval_name":"Example: Basic scorer test","score":1,"run_count":70}
263
+ {"timestamp":"2025-12-25T16:49:56.556Z","eval_name":"Example: Basic scorer test","score":1,"run_count":71}
264
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":13}
265
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":13}
266
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":13}
267
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":13}
268
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Swarm Decomposition Quality","score":0.6847222222222221,"run_count":13}
269
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":13}
270
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":13}
271
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"No False Positives","score":1,"run_count":13}
272
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Example: Basic scorer test","score":1,"run_count":72}
273
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":13}
274
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Placeholder Detection","score":0,"run_count":13}
275
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Generic Instructions Fail","score":0,"run_count":13}
276
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"First Tool Discipline","score":0,"run_count":13}
277
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":13}
278
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Coordinator Behavior After Compaction","score":0.8645833333333333,"run_count":13}
279
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Coordinator Resists Direct Implementation","score":0.9375,"run_count":13}
280
+ {"timestamp":"2025-12-25T18:58:44.923Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":14}
281
+ {"timestamp":"2025-12-25T18:58:44.923Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":14}
282
+ {"timestamp":"2025-12-25T18:58:44.923Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":14}
283
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Swarm Decomposition Quality","score":0.7095833333333333,"run_count":14}
284
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":14}
285
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":14}
286
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":14}
287
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"No False Positives","score":1,"run_count":14}
288
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Example: Basic scorer test","score":1,"run_count":73}
289
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Generic Instructions Fail","score":0,"run_count":14}
290
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"First Tool Discipline","score":0,"run_count":14}
291
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":14}
292
+ {"timestamp":"2025-12-25T18:58:44.925Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":14}
293
+ {"timestamp":"2025-12-25T18:58:44.925Z","eval_name":"Placeholder Detection","score":0,"run_count":14}
294
+ {"timestamp":"2025-12-25T18:58:44.925Z","eval_name":"Coordinator Behavior After Compaction","score":0.9375,"run_count":14}
295
+ {"timestamp":"2025-12-25T18:58:44.925Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":14}
296
+ {"timestamp":"2025-12-25T18:59:58.928Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":15}
297
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":15}
298
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":15}
299
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Swarm Decomposition Quality","score":0.6944444444444443,"run_count":15}
300
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":15}
301
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":15}
302
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":15}
303
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"No False Positives","score":1,"run_count":15}
304
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Example: Basic scorer test","score":1,"run_count":74}
305
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":15}
306
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":15}
307
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"Placeholder Detection","score":0,"run_count":15}
308
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"Generic Instructions Fail","score":0,"run_count":15}
309
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"First Tool Discipline","score":0,"run_count":15}
310
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"Coordinator Behavior After Compaction","score":0.9171875,"run_count":15}
311
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":15}
312
+ {"timestamp":"2025-12-25T19:00:48.709Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":16}
313
+ {"timestamp":"2025-12-25T19:00:48.709Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":16}
314
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":16}
315
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Swarm Decomposition Quality","score":0.5464583333333334,"run_count":16}
316
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":16}
317
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":16}
318
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":16}
319
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"No False Positives","score":1,"run_count":16}
320
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Example: Basic scorer test","score":1,"run_count":75}
321
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":16}
322
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Placeholder Detection","score":0,"run_count":16}
323
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Generic Instructions Fail","score":0,"run_count":16}
324
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"First Tool Discipline","score":0,"run_count":16}
325
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":16}
326
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":16}
327
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":16}
@@ -1,9 +1,9 @@
1
- $ bun build ./src/index.ts --outdir ./dist --target node --external @electric-sql/pglite --external swarm-mail && bun build ./src/plugin.ts --outfile ./dist/plugin.js --target node --external @electric-sql/pglite --external swarm-mail && tsc
2
- Bundled 917 modules in 237ms
1
+ $ bun build ./src/index.ts --outdir ./dist --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && bun build ./src/plugin.ts --outfile ./dist/plugin.js --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && tsc
2
+ Bundled 1348 modules in 205ms
3
3
 
4
- index.js 2.16 MB (entry point)
4
+ index.js 4.33 MB (entry point)
5
5
 
6
- Bundled 918 modules in 82ms
6
+ Bundled 1349 modules in 196ms
7
7
 
8
- plugin.js 2.12 MB (entry point)
8
+ plugin.js 4.30 MB (entry point)
9
9
 
package/CHANGELOG.md CHANGED
@@ -1,5 +1,160 @@
1
1
  # opencode-swarm-plugin
2
2
 
3
+ ## 0.42.0
4
+
5
+ ### Minor Changes
6
+
7
+ - [`a79e04b`](https://github.com/joelhooks/swarm-tools/commit/a79e04b1bb3b40c09c5265b5d11739864799e4e2) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🔭 Swarm Observability: See What Your Bees Are Doing
8
+
9
+ > "Observability is about instrumenting your system in a way that ensures sufficient information about a system's runtime is collected and analyzed so that when something goes wrong, it can help you understand why."
10
+ > — Chip Huyen, _AI Engineering_
11
+
12
+ New CLI commands to understand swarm health and history:
13
+
14
+ ### `swarm stats`
15
+
16
+ ```
17
+ ┌─────────────────────────────────────────┐
18
+ │ 🐝 SWARM STATISTICS 🐝 │
19
+ ├─────────────────────────────────────────┤
20
+ │ Total Swarms: 46 Success: 85% │
21
+ │ Avg Duration: 4.2min │
22
+ ├─────────────────────────────────────────┤
23
+ │ BY STRATEGY │
24
+ │ ├─ file-based 92% (23/25) │
25
+ │ ├─ feature-based 78% (14/18) │
26
+ │ ├─ risk-based 67% (2/3) │
27
+ ├─────────────────────────────────────────┤
28
+ │ COORDINATOR HEALTH │
29
+ │ Violation Rate: 2% │
30
+ │ Spawn Efficiency: 94% │
31
+ │ Review Rate: 88% │
32
+ └─────────────────────────────────────────┘
33
+ ```
34
+
35
+ Options: `--since 24h/7d/30d`, `--json`
36
+
37
+ ### `swarm history`
38
+
39
+ Timeline of recent swarm activity with filtering:
40
+
41
+ - `--status success/failed/in_progress`
42
+ - `--strategy file-based/feature-based/risk-based`
43
+ - `--verbose` for subtask details
44
+
45
+ ### Prompt Insights Integration
46
+
47
+ Coordinators and workers now receive injected insights from past swarm outcomes:
48
+
49
+ - Strategy success rates as markdown tables
50
+ - Anti-pattern warnings for low-success strategies
51
+ - File/domain-specific learnings from semantic memory
52
+
53
+ This creates a feedback loop where swarms learn from their own history.
54
+
55
+ ### Also in this release
56
+
57
+ - **swarm-dashboard** (WIP): React/Vite visualizer scaffold
58
+ - **ADR-006**: Swarm PTY decision document
59
+ - **CI fix**: Smarter changeset detection prevents empty PR errors
60
+
61
+ ### Patch Changes
62
+
63
+ - Updated dependencies [[`a79e04b`](https://github.com/joelhooks/swarm-tools/commit/a79e04b1bb3b40c09c5265b5d11739864799e4e2)]:
64
+ - swarm-mail@1.5.4
65
+
66
+ ## 0.41.0
67
+
68
+ ### Minor Changes
69
+
70
+ - [`179b3f0`](https://github.com/joelhooks/swarm-tools/commit/179b3f0e49c7959f8d754c1274d301d0b3845a79) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🐝 Compaction Prompt Now Speaks Swarm
71
+
72
+ > _"Memory is essential for communication: we recall past interactions, infer preferences, and construct evolving mental models of those we engage with."_
73
+ > — Mem0: Building Production-Ready AI Agents with Scalable Long-Term Memory
74
+
75
+ When context compacts mid-swarm, coordinators were waking up confused. They had state information but no protocol guidance. Now the compaction prompt includes a condensed version of the swarm command template.
76
+
77
+ **What's New:**
78
+
79
+ The `SWARM_COMPACTION_CONTEXT` now includes:
80
+
81
+ 1. **What Good Looks Like** - Behavioral examples showing ideal coordinator behavior
82
+
83
+ - ✅ Spawned researcher for unfamiliar tech → got summary → stored in semantic-memory
84
+ - ✅ Checked inbox every 5-10 minutes → caught blocked worker → unblocked in 2min
85
+ - ❌ Called context7 directly → dumped 50KB → context exhaustion
86
+
87
+ 2. **Mandatory Behaviors Checklist** - Post-compaction protocol
88
+ - Inbox monitoring (every 5-10 min with intervention triggers)
89
+ - Skill loading (before spawning workers)
90
+ - Worker review (after every worker returns, 3-strike rule)
91
+ - Research spawning (never call context7/pdf-brain directly)
92
+
93
+ **Why This Matters:**
94
+
95
+ Coordinators resuming from compaction now have:
96
+
97
+ - Clear behavioral guidance (not just state)
98
+ - Actionable tool call examples
99
+ - Anti-patterns to avoid
100
+ - The same protocol as fresh `/swarm` invocations
101
+
102
+ **Backward Compatible:** Existing compaction hooks continue to work. This adds guidance; it doesn't change the hook signature.
103
+
104
+ ### Patch Changes
105
+
106
+ - [`3e7c126`](https://github.com/joelhooks/swarm-tools/commit/3e7c126b11aa6ad909ebcb2ab3cf77883f9acfe4) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🧪 Bulletproof Test Suite
107
+
108
+ > "Setting up our tests to run synchronously and using mocking libraries will greatly speed up our testing"
109
+ > — ng-book
110
+
111
+ Fixed test isolation issues that caused 19 tests to fail when run together but pass in isolation.
112
+
113
+ ### The Culprits
114
+
115
+ **1. Global fetch pollution** (`ollama.test.ts`)
116
+
117
+ ```typescript
118
+ // BEFORE: Replaced global.fetch, never restored it
119
+ global.fetch = mockFetch;
120
+
121
+ // AFTER: Save and restore
122
+ const originalFetch = global.fetch;
123
+ afterEach(() => {
124
+ global.fetch = originalFetch;
125
+ });
126
+ ```
127
+
128
+ **2. Port conflicts** (`durable-server.test.ts`)
129
+
130
+ - Tests used hardcoded ports (4483, 4484, 4485)
131
+ - Parallel test runs fought over the same ports
132
+ - Fixed: Use `port: 0` for OS-assigned ports, made `server.url` a getter
133
+
134
+ **3. AI SDK schema incompatibility** (`memory-operations.ts`)
135
+
136
+ - `z.discriminatedUnion` creates `oneOf` at top level
137
+ - Anthropic API requires `type: object` at top level
138
+ - Fixed: Flat object schema with optional fields
139
+
140
+ ### Test Stats
141
+
142
+ ```
143
+ Before: 19 failures when run together
144
+ After: 0 failures, 1406 tests pass
145
+ ```
146
+
147
+ ### Files Changed
148
+
149
+ - `src/memory/ollama.test.ts` - Restore global.fetch after each test
150
+ - `src/streams/durable-server.ts` - Dynamic port getter
151
+ - `src/streams/durable-server.test.ts` - Use port 0, rewrite for isolation
152
+ - `src/memory/memory-operations.ts` - Flat schema for Anthropic compatibility
153
+ - Renamed `memory-operations.test.ts` → `memory-operations.integration.test.ts`
154
+
155
+ - Updated dependencies [[`3e7c126`](https://github.com/joelhooks/swarm-tools/commit/3e7c126b11aa6ad909ebcb2ab3cf77883f9acfe4)]:
156
+ - swarm-mail@1.5.3
157
+
3
158
  ## 0.40.0
4
159
 
5
160
  ### Minor Changes
package/README.md CHANGED
@@ -5,6 +5,8 @@
5
5
  **🌐 Website:** [swarmtools.ai](https://swarmtools.ai)
6
6
  **📚 Full Documentation:** [swarmtools.ai/docs](https://swarmtools.ai/docs)
7
7
 
8
+ [![Eval Gate](https://github.com/joelhooks/opencode-swarm-plugin/actions/workflows/eval-gate.yml/badge.svg)](https://github.com/joelhooks/opencode-swarm-plugin/actions/workflows/eval-gate.yml)
9
+
8
10
  ```
9
11
  ███████╗██╗ ██╗ █████╗ ██████╗ ███╗ ███╗
10
12
  ██╔════╝██║ ██║██╔══██╗██╔══██╗████╗ ████║