opencode-swarm-plugin 0.40.0 → 0.42.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/.hive/analysis/eval-failure-analysis-2025-12-25.md +331 -0
  2. package/.hive/analysis/session-data-quality-audit.md +320 -0
  3. package/.hive/eval-results.json +481 -24
  4. package/.hive/issues.jsonl +67 -16
  5. package/.hive/memories.jsonl +159 -1
  6. package/.opencode/eval-history.jsonl +315 -0
  7. package/.turbo/turbo-build.log +5 -5
  8. package/CHANGELOG.md +165 -0
  9. package/README.md +2 -0
  10. package/SCORER-ANALYSIS.md +598 -0
  11. package/bin/eval-gate.test.ts +158 -0
  12. package/bin/eval-gate.ts +74 -0
  13. package/bin/swarm.serve.test.ts +46 -0
  14. package/bin/swarm.test.ts +661 -732
  15. package/bin/swarm.ts +335 -0
  16. package/dist/compaction-hook.d.ts +7 -5
  17. package/dist/compaction-hook.d.ts.map +1 -1
  18. package/dist/compaction-prompt-scoring.d.ts +1 -0
  19. package/dist/compaction-prompt-scoring.d.ts.map +1 -1
  20. package/dist/eval-runner.d.ts +134 -0
  21. package/dist/eval-runner.d.ts.map +1 -0
  22. package/dist/hive.d.ts.map +1 -1
  23. package/dist/index.d.ts +29 -0
  24. package/dist/index.d.ts.map +1 -1
  25. package/dist/index.js +99741 -58858
  26. package/dist/memory-tools.d.ts +70 -2
  27. package/dist/memory-tools.d.ts.map +1 -1
  28. package/dist/memory.d.ts +37 -0
  29. package/dist/memory.d.ts.map +1 -1
  30. package/dist/observability-tools.d.ts +64 -0
  31. package/dist/observability-tools.d.ts.map +1 -1
  32. package/dist/plugin.js +99356 -58318
  33. package/dist/swarm-orchestrate.d.ts.map +1 -1
  34. package/dist/swarm-prompts.d.ts +32 -1
  35. package/dist/swarm-prompts.d.ts.map +1 -1
  36. package/docs/planning/ADR-009-oh-my-opencode-patterns.md +353 -0
  37. package/evals/ARCHITECTURE.md +1189 -0
  38. package/evals/example.eval.ts +3 -4
  39. package/evals/fixtures/compaction-prompt-cases.ts +6 -0
  40. package/evals/scorers/coordinator-discipline.evalite-test.ts +1 -162
  41. package/evals/scorers/coordinator-discipline.ts +0 -323
  42. package/evals/swarm-decomposition.eval.ts +4 -2
  43. package/package.json +4 -3
  44. package/src/compaction-prompt-scorers.test.ts +185 -9
  45. package/src/compaction-prompt-scoring.ts +7 -5
  46. package/src/eval-runner.test.ts +128 -1
  47. package/src/eval-runner.ts +46 -0
  48. package/src/hive.ts +43 -42
  49. package/src/memory-tools.test.ts +84 -0
  50. package/src/memory-tools.ts +68 -3
  51. package/src/memory.test.ts +2 -112
  52. package/src/memory.ts +88 -49
  53. package/src/observability-tools.test.ts +13 -0
  54. package/src/observability-tools.ts +277 -0
  55. package/src/swarm-orchestrate.test.ts +162 -0
  56. package/src/swarm-orchestrate.ts +7 -5
  57. package/src/swarm-prompts.test.ts +168 -4
  58. package/src/swarm-prompts.ts +228 -7
  59. package/.env +0 -2
  60. package/.turbo/turbo-test.log +0 -481
  61. package/.turbo/turbo-typecheck.log +0 -1
@@ -10,3 +10,318 @@
10
10
  {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"coordinator-behavior","score":0.85,"run_count":3}
11
11
  {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"coordinator-session","score":0.85,"run_count":3}
12
12
  {"timestamp":"2025-12-25T05:11:18.469Z","eval_name":"swarm-decomposition","score":0.85,"run_count":3}
13
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":1}
14
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":1}
15
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":1}
16
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Swarm Decomposition Quality","score":0.7213888888888889,"run_count":1}
17
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":1}
18
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":1}
19
+ {"timestamp":"2025-12-25T16:30:42.957Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":1}
20
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"No False Positives","score":1,"run_count":1}
21
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Example: Basic scorer test","score":1,"run_count":1}
22
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":1}
23
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Placeholder Detection","score":0,"run_count":1}
24
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Generic Instructions Fail","score":0,"run_count":1}
25
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"First Tool Discipline","score":0,"run_count":1}
26
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":1}
27
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":1}
28
+ {"timestamp":"2025-12-25T16:30:42.958Z","eval_name":"Coordinator Behavior After Compaction","score":0.8645833333333333,"run_count":1}
29
+ {"timestamp":"2025-12-25T16:30:43.088Z","eval_name":"Example: Basic scorer test","score":1,"run_count":2}
30
+ {"timestamp":"2025-12-25T16:30:43.202Z","eval_name":"Example: Basic scorer test","score":1,"run_count":3}
31
+ {"timestamp":"2025-12-25T16:30:43.316Z","eval_name":"Example: Basic scorer test","score":1,"run_count":4}
32
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":2}
33
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":2}
34
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":2}
35
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Swarm Decomposition Quality","score":0.6748148148148146,"run_count":2}
36
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":2}
37
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":2}
38
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":2}
39
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"No False Positives","score":1,"run_count":2}
40
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Example: Basic scorer test","score":1,"run_count":5}
41
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":2}
42
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":2}
43
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":2}
44
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Placeholder Detection","score":0,"run_count":2}
45
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Generic Instructions Fail","score":0,"run_count":2}
46
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"First Tool Discipline","score":0,"run_count":2}
47
+ {"timestamp":"2025-12-25T16:31:17.738Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":2}
48
+ {"timestamp":"2025-12-25T16:31:17.874Z","eval_name":"Example: Basic scorer test","score":1,"run_count":6}
49
+ {"timestamp":"2025-12-25T16:31:17.995Z","eval_name":"Example: Basic scorer test","score":1,"run_count":7}
50
+ {"timestamp":"2025-12-25T16:31:18.113Z","eval_name":"Example: Basic scorer test","score":1,"run_count":8}
51
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":3}
52
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":3}
53
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":3}
54
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Swarm Decomposition Quality","score":0.6988888888888889,"run_count":3}
55
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":3}
56
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":3}
57
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":3}
58
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"No False Positives","score":1,"run_count":3}
59
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Example: Basic scorer test","score":1,"run_count":9}
60
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":3}
61
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":3}
62
+ {"timestamp":"2025-12-25T16:32:49.807Z","eval_name":"Placeholder Detection","score":0,"run_count":3}
63
+ {"timestamp":"2025-12-25T16:32:49.808Z","eval_name":"Generic Instructions Fail","score":0,"run_count":3}
64
+ {"timestamp":"2025-12-25T16:32:49.808Z","eval_name":"First Tool Discipline","score":0,"run_count":3}
65
+ {"timestamp":"2025-12-25T16:32:49.808Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":3}
66
+ {"timestamp":"2025-12-25T16:32:49.808Z","eval_name":"Coordinator Resists Direct Implementation","score":0.9375,"run_count":3}
67
+ {"timestamp":"2025-12-25T16:32:49.943Z","eval_name":"Example: Basic scorer test","score":1,"run_count":10}
68
+ {"timestamp":"2025-12-25T16:32:50.073Z","eval_name":"Example: Basic scorer test","score":1,"run_count":11}
69
+ {"timestamp":"2025-12-25T16:32:50.199Z","eval_name":"Example: Basic scorer test","score":1,"run_count":12}
70
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":4}
71
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":4}
72
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":4}
73
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Swarm Decomposition Quality","score":0.6798611111111109,"run_count":4}
74
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":4}
75
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":4}
76
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":4}
77
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"No False Positives","score":1,"run_count":4}
78
+ {"timestamp":"2025-12-25T16:35:31.083Z","eval_name":"Example: Basic scorer test","score":1,"run_count":13}
79
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":4}
80
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":4}
81
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Placeholder Detection","score":0,"run_count":4}
82
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Generic Instructions Fail","score":0,"run_count":4}
83
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"First Tool Discipline","score":0,"run_count":4}
84
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Coordinator Resists Direct Implementation","score":0.9375,"run_count":4}
85
+ {"timestamp":"2025-12-25T16:35:31.084Z","eval_name":"Coordinator Behavior After Compaction","score":0.8645833333333333,"run_count":4}
86
+ {"timestamp":"2025-12-25T16:35:31.221Z","eval_name":"Example: Basic scorer test","score":1,"run_count":14}
87
+ {"timestamp":"2025-12-25T16:35:31.329Z","eval_name":"Example: Basic scorer test","score":1,"run_count":15}
88
+ {"timestamp":"2025-12-25T16:35:31.444Z","eval_name":"Example: Basic scorer test","score":1,"run_count":16}
89
+ {"timestamp":"2025-12-25T16:35:31.685Z","eval_name":"Example: Basic scorer test","score":1,"run_count":17}
90
+ {"timestamp":"2025-12-25T16:35:31.843Z","eval_name":"Example: Basic scorer test","score":1,"run_count":18}
91
+ {"timestamp":"2025-12-25T16:35:31.962Z","eval_name":"Example: Basic scorer test","score":1,"run_count":19}
92
+ {"timestamp":"2025-12-25T16:35:32.076Z","eval_name":"Example: Basic scorer test","score":1,"run_count":20}
93
+ {"timestamp":"2025-12-25T16:36:03.596Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":5}
94
+ {"timestamp":"2025-12-25T16:36:03.596Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":5}
95
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":5}
96
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Swarm Decomposition Quality","score":0.6845833333333333,"run_count":5}
97
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":5}
98
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":5}
99
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":5}
100
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"No False Positives","score":1,"run_count":5}
101
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Example: Basic scorer test","score":1,"run_count":21}
102
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":5}
103
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":5}
104
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Placeholder Detection","score":0,"run_count":5}
105
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Generic Instructions Fail","score":0,"run_count":5}
106
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"First Tool Discipline","score":0,"run_count":5}
107
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":5}
108
+ {"timestamp":"2025-12-25T16:36:03.597Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":5}
109
+ {"timestamp":"2025-12-25T16:36:03.727Z","eval_name":"Example: Basic scorer test","score":1,"run_count":22}
110
+ {"timestamp":"2025-12-25T16:36:03.842Z","eval_name":"Example: Basic scorer test","score":1,"run_count":23}
111
+ {"timestamp":"2025-12-25T16:36:03.951Z","eval_name":"Example: Basic scorer test","score":1,"run_count":24}
112
+ {"timestamp":"2025-12-25T16:36:04.183Z","eval_name":"Example: Basic scorer test","score":1,"run_count":25}
113
+ {"timestamp":"2025-12-25T16:36:04.330Z","eval_name":"Example: Basic scorer test","score":1,"run_count":26}
114
+ {"timestamp":"2025-12-25T16:36:04.445Z","eval_name":"Example: Basic scorer test","score":1,"run_count":27}
115
+ {"timestamp":"2025-12-25T16:36:04.555Z","eval_name":"Example: Basic scorer test","score":1,"run_count":28}
116
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":6}
117
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":6}
118
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":6}
119
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Swarm Decomposition Quality","score":0.6852777777777778,"run_count":6}
120
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":6}
121
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":6}
122
+ {"timestamp":"2025-12-25T16:36:32.176Z","eval_name":"No False Positives","score":1,"run_count":6}
123
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Example: Basic scorer test","score":1,"run_count":29}
124
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":6}
125
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":6}
126
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Placeholder Detection","score":0,"run_count":6}
127
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Generic Instructions Fail","score":0,"run_count":6}
128
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"First Tool Discipline","score":0,"run_count":6}
129
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Coordinator Behavior After Compaction","score":0.978125,"run_count":6}
130
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":6}
131
+ {"timestamp":"2025-12-25T16:36:32.177Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":6}
132
+ {"timestamp":"2025-12-25T16:36:32.305Z","eval_name":"Example: Basic scorer test","score":1,"run_count":30}
133
+ {"timestamp":"2025-12-25T16:36:32.416Z","eval_name":"Example: Basic scorer test","score":1,"run_count":31}
134
+ {"timestamp":"2025-12-25T16:36:32.527Z","eval_name":"Example: Basic scorer test","score":1,"run_count":32}
135
+ {"timestamp":"2025-12-25T16:36:32.755Z","eval_name":"Example: Basic scorer test","score":1,"run_count":33}
136
+ {"timestamp":"2025-12-25T16:36:32.957Z","eval_name":"Example: Basic scorer test","score":1,"run_count":34}
137
+ {"timestamp":"2025-12-25T16:36:33.071Z","eval_name":"Example: Basic scorer test","score":1,"run_count":35}
138
+ {"timestamp":"2025-12-25T16:36:33.180Z","eval_name":"Example: Basic scorer test","score":1,"run_count":36}
139
+ {"timestamp":"2025-12-25T16:38:02.146Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":7}
140
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":7}
141
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":7}
142
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Swarm Decomposition Quality","score":0.6726388888888888,"run_count":7}
143
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":7}
144
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":7}
145
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":7}
146
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"No False Positives","score":1,"run_count":7}
147
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Example: Basic scorer test","score":1,"run_count":37}
148
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":7}
149
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":7}
150
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Placeholder Detection","score":0,"run_count":7}
151
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"Generic Instructions Fail","score":0,"run_count":7}
152
+ {"timestamp":"2025-12-25T16:38:02.147Z","eval_name":"First Tool Discipline","score":0,"run_count":7}
153
+ {"timestamp":"2025-12-25T16:38:02.148Z","eval_name":"Coordinator Behavior After Compaction","score":0.8645833333333333,"run_count":7}
154
+ {"timestamp":"2025-12-25T16:38:02.148Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":7}
155
+ {"timestamp":"2025-12-25T16:38:02.276Z","eval_name":"Example: Basic scorer test","score":1,"run_count":38}
156
+ {"timestamp":"2025-12-25T16:38:02.428Z","eval_name":"Example: Basic scorer test","score":1,"run_count":39}
157
+ {"timestamp":"2025-12-25T16:38:02.547Z","eval_name":"Example: Basic scorer test","score":1,"run_count":40}
158
+ {"timestamp":"2025-12-25T16:38:02.782Z","eval_name":"Example: Basic scorer test","score":1,"run_count":41}
159
+ {"timestamp":"2025-12-25T16:38:02.933Z","eval_name":"Example: Basic scorer test","score":1,"run_count":42}
160
+ {"timestamp":"2025-12-25T16:38:03.050Z","eval_name":"Example: Basic scorer test","score":1,"run_count":43}
161
+ {"timestamp":"2025-12-25T16:38:03.165Z","eval_name":"Example: Basic scorer test","score":1,"run_count":44}
162
+ {"timestamp":"2025-12-25T16:38:52.756Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":8}
163
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":8}
164
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":8}
165
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Swarm Decomposition Quality","score":0.695,"run_count":8}
166
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":8}
167
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":8}
168
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":8}
169
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"No False Positives","score":1,"run_count":8}
170
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Example: Basic scorer test","score":1,"run_count":45}
171
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":8}
172
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Placeholder Detection","score":0,"run_count":8}
173
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Generic Instructions Fail","score":0,"run_count":8}
174
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"First Tool Discipline","score":0,"run_count":8}
175
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":8}
176
+ {"timestamp":"2025-12-25T16:38:52.757Z","eval_name":"Coordinator Behavior After Compaction","score":0.9526041666666667,"run_count":8}
177
+ {"timestamp":"2025-12-25T16:38:52.758Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":8}
178
+ {"timestamp":"2025-12-25T16:38:52.903Z","eval_name":"Example: Basic scorer test","score":1,"run_count":46}
179
+ {"timestamp":"2025-12-25T16:38:53.020Z","eval_name":"Example: Basic scorer test","score":1,"run_count":47}
180
+ {"timestamp":"2025-12-25T16:38:53.136Z","eval_name":"Example: Basic scorer test","score":1,"run_count":48}
181
+ {"timestamp":"2025-12-25T16:38:53.367Z","eval_name":"Example: Basic scorer test","score":1,"run_count":49}
182
+ {"timestamp":"2025-12-25T16:38:53.511Z","eval_name":"Example: Basic scorer test","score":1,"run_count":50}
183
+ {"timestamp":"2025-12-25T16:38:53.624Z","eval_name":"Example: Basic scorer test","score":1,"run_count":51}
184
+ {"timestamp":"2025-12-25T16:38:53.737Z","eval_name":"Example: Basic scorer test","score":1,"run_count":52}
185
+ {"timestamp":"2025-12-25T16:40:39.219Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":9}
186
+ {"timestamp":"2025-12-25T16:40:39.219Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":9}
187
+ {"timestamp":"2025-12-25T16:40:39.219Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":9}
188
+ {"timestamp":"2025-12-25T16:40:39.219Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":9}
189
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Swarm Decomposition Quality","score":0.7020833333333334,"run_count":9}
190
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":9}
191
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"No False Positives","score":1,"run_count":9}
192
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":9}
193
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Example: Basic scorer test","score":1,"run_count":53}
194
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Coordinator Behavior After Compaction","score":0.7291666666666666,"run_count":9}
195
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":9}
196
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":9}
197
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":9}
198
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Placeholder Detection","score":0,"run_count":9}
199
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"Generic Instructions Fail","score":0,"run_count":9}
200
+ {"timestamp":"2025-12-25T16:40:39.220Z","eval_name":"First Tool Discipline","score":0,"run_count":9}
201
+ {"timestamp":"2025-12-25T16:40:39.352Z","eval_name":"Example: Basic scorer test","score":1,"run_count":54}
202
+ {"timestamp":"2025-12-25T16:40:39.460Z","eval_name":"Example: Basic scorer test","score":1,"run_count":55}
203
+ {"timestamp":"2025-12-25T16:40:39.572Z","eval_name":"Example: Basic scorer test","score":1,"run_count":56}
204
+ {"timestamp":"2025-12-25T16:40:39.816Z","eval_name":"Example: Basic scorer test","score":1,"run_count":57}
205
+ {"timestamp":"2025-12-25T16:40:39.947Z","eval_name":"Example: Basic scorer test","score":1,"run_count":58}
206
+ {"timestamp":"2025-12-25T16:40:40.084Z","eval_name":"Example: Basic scorer test","score":1,"run_count":59}
207
+ {"timestamp":"2025-12-25T16:40:40.202Z","eval_name":"Example: Basic scorer test","score":1,"run_count":60}
208
+ {"timestamp":"2025-12-25T16:43:12.851Z","eval_name":"Example: Basic scorer test","score":1,"run_count":61}
209
+ {"timestamp":"2025-12-25T16:43:43.041Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":10}
210
+ {"timestamp":"2025-12-25T16:43:43.041Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":10}
211
+ {"timestamp":"2025-12-25T16:43:43.041Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":10}
212
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Swarm Decomposition Quality","score":0.6909722222222222,"run_count":10}
213
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":10}
214
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":10}
215
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":10}
216
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"No False Positives","score":1,"run_count":10}
217
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Example: Basic scorer test","score":1,"run_count":62}
218
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":10}
219
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Placeholder Detection","score":0,"run_count":10}
220
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Generic Instructions Fail","score":0,"run_count":10}
221
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"First Tool Discipline","score":0,"run_count":10}
222
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":10}
223
+ {"timestamp":"2025-12-25T16:43:43.042Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":10}
224
+ {"timestamp":"2025-12-25T16:43:43.043Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":10}
225
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":11}
226
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":11}
227
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":11}
228
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Swarm Decomposition Quality","score":0.6720833333333333,"run_count":11}
229
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":11}
230
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Example: Basic scorer test","score":1,"run_count":63}
231
+ {"timestamp":"2025-12-25T16:44:12.471Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":11}
232
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":11}
233
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"No False Positives","score":1,"run_count":11}
234
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Coordinator Behavior After Compaction","score":0.9796875,"run_count":11}
235
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":11}
236
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":11}
237
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":11}
238
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Placeholder Detection","score":0,"run_count":11}
239
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"Generic Instructions Fail","score":0,"run_count":11}
240
+ {"timestamp":"2025-12-25T16:44:12.472Z","eval_name":"First Tool Discipline","score":0,"run_count":11}
241
+ {"timestamp":"2025-12-25T16:49:55.548Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":12}
242
+ {"timestamp":"2025-12-25T16:49:55.549Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":12}
243
+ {"timestamp":"2025-12-25T16:49:55.555Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":12}
244
+ {"timestamp":"2025-12-25T16:49:55.555Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":12}
245
+ {"timestamp":"2025-12-25T16:49:55.555Z","eval_name":"Swarm Decomposition Quality","score":0.7001388888888888,"run_count":12}
246
+ {"timestamp":"2025-12-25T16:49:55.555Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":12}
247
+ {"timestamp":"2025-12-25T16:49:55.556Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":12}
248
+ {"timestamp":"2025-12-25T16:49:55.557Z","eval_name":"No False Positives","score":1,"run_count":12}
249
+ {"timestamp":"2025-12-25T16:49:55.557Z","eval_name":"Example: Basic scorer test","score":1,"run_count":64}
250
+ {"timestamp":"2025-12-25T16:49:55.557Z","eval_name":"Generic Instructions Fail","score":0,"run_count":12}
251
+ {"timestamp":"2025-12-25T16:49:55.557Z","eval_name":"First Tool Discipline","score":0,"run_count":12}
252
+ {"timestamp":"2025-12-25T16:49:55.561Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":12}
253
+ {"timestamp":"2025-12-25T16:49:55.561Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":12}
254
+ {"timestamp":"2025-12-25T16:49:55.561Z","eval_name":"Placeholder Detection","score":0,"run_count":12}
255
+ {"timestamp":"2025-12-25T16:49:55.561Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":12}
256
+ {"timestamp":"2025-12-25T16:49:55.565Z","eval_name":"Coordinator Resists Direct Implementation","score":0.9375,"run_count":12}
257
+ {"timestamp":"2025-12-25T16:49:55.697Z","eval_name":"Example: Basic scorer test","score":1,"run_count":65}
258
+ {"timestamp":"2025-12-25T16:49:55.813Z","eval_name":"Example: Basic scorer test","score":1,"run_count":66}
259
+ {"timestamp":"2025-12-25T16:49:55.934Z","eval_name":"Example: Basic scorer test","score":1,"run_count":67}
260
+ {"timestamp":"2025-12-25T16:49:56.178Z","eval_name":"Example: Basic scorer test","score":1,"run_count":68}
261
+ {"timestamp":"2025-12-25T16:49:56.327Z","eval_name":"Example: Basic scorer test","score":1,"run_count":69}
262
+ {"timestamp":"2025-12-25T16:49:56.446Z","eval_name":"Example: Basic scorer test","score":1,"run_count":70}
263
+ {"timestamp":"2025-12-25T16:49:56.556Z","eval_name":"Example: Basic scorer test","score":1,"run_count":71}
264
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":13}
265
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":13}
266
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":13}
267
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":13}
268
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Swarm Decomposition Quality","score":0.6847222222222221,"run_count":13}
269
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":13}
270
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":13}
271
+ {"timestamp":"2025-12-25T17:06:10.610Z","eval_name":"No False Positives","score":1,"run_count":13}
272
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Example: Basic scorer test","score":1,"run_count":72}
273
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":13}
274
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Placeholder Detection","score":0,"run_count":13}
275
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Generic Instructions Fail","score":0,"run_count":13}
276
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"First Tool Discipline","score":0,"run_count":13}
277
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":13}
278
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Coordinator Behavior After Compaction","score":0.8645833333333333,"run_count":13}
279
+ {"timestamp":"2025-12-25T17:06:10.611Z","eval_name":"Coordinator Resists Direct Implementation","score":0.9375,"run_count":13}
280
+ {"timestamp":"2025-12-25T18:58:44.923Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":14}
281
+ {"timestamp":"2025-12-25T18:58:44.923Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":14}
282
+ {"timestamp":"2025-12-25T18:58:44.923Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":14}
283
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Swarm Decomposition Quality","score":0.7095833333333333,"run_count":14}
284
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":14}
285
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":14}
286
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":14}
287
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"No False Positives","score":1,"run_count":14}
288
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Example: Basic scorer test","score":1,"run_count":73}
289
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Generic Instructions Fail","score":0,"run_count":14}
290
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"First Tool Discipline","score":0,"run_count":14}
291
+ {"timestamp":"2025-12-25T18:58:44.924Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":14}
292
+ {"timestamp":"2025-12-25T18:58:44.925Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":14}
293
+ {"timestamp":"2025-12-25T18:58:44.925Z","eval_name":"Placeholder Detection","score":0,"run_count":14}
294
+ {"timestamp":"2025-12-25T18:58:44.925Z","eval_name":"Coordinator Behavior After Compaction","score":0.9375,"run_count":14}
295
+ {"timestamp":"2025-12-25T18:58:44.925Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":14}
296
+ {"timestamp":"2025-12-25T18:59:58.928Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":15}
297
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":15}
298
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":15}
299
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Swarm Decomposition Quality","score":0.6944444444444443,"run_count":15}
300
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":15}
301
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":15}
302
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":15}
303
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"No False Positives","score":1,"run_count":15}
304
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Example: Basic scorer test","score":1,"run_count":74}
305
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":15}
306
+ {"timestamp":"2025-12-25T18:59:58.929Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":15}
307
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"Placeholder Detection","score":0,"run_count":15}
308
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"Generic Instructions Fail","score":0,"run_count":15}
309
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"First Tool Discipline","score":0,"run_count":15}
310
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"Coordinator Behavior After Compaction","score":0.9171875,"run_count":15}
311
+ {"timestamp":"2025-12-25T18:59:58.930Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":15}
312
+ {"timestamp":"2025-12-25T19:00:48.709Z","eval_name":"Coordinator Discipline - Synthetic Fixtures","score":0.6354444444444443,"run_count":16}
313
+ {"timestamp":"2025-12-25T19:00:48.709Z","eval_name":"Coordinator Discipline - Real Sessions","score":0,"run_count":16}
314
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Coordinator Discipline - Perfect vs Bad","score":0.5416666666666666,"run_count":16}
315
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Swarm Decomposition Quality","score":0.5464583333333334,"run_count":16}
316
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Decomposition Edge Cases","score":0.775,"run_count":16}
317
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Compaction Hook Coordinator Resumption","score":0.95,"run_count":16}
318
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Epic ID Specificity","score":0.5,"run_count":16}
319
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"No False Positives","score":1,"run_count":16}
320
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Example: Basic scorer test","score":1,"run_count":75}
321
+ {"timestamp":"2025-12-25T19:00:48.710Z","eval_name":"Perfect Prompt Scores 100%","score":1,"run_count":16}
322
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Placeholder Detection","score":0,"run_count":16}
323
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Generic Instructions Fail","score":0,"run_count":16}
324
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"First Tool Discipline","score":0,"run_count":16}
325
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Compaction Prompt Quality","score":0.6342857142857142,"run_count":16}
326
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Coordinator Behavior After Compaction","score":1,"run_count":16}
327
+ {"timestamp":"2025-12-25T19:00:48.711Z","eval_name":"Coordinator Resists Direct Implementation","score":1,"run_count":16}
@@ -1,9 +1,9 @@
1
- $ bun build ./src/index.ts --outdir ./dist --target node --external @electric-sql/pglite --external swarm-mail && bun build ./src/plugin.ts --outfile ./dist/plugin.js --target node --external @electric-sql/pglite --external swarm-mail && tsc
2
- Bundled 917 modules in 237ms
1
+ $ bun build ./src/index.ts --outdir ./dist --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && bun build ./src/plugin.ts --outfile ./dist/plugin.js --target node --external @electric-sql/pglite --external swarm-mail --external vitest --external @vitest/ui --external lightningcss && tsc
2
+ Bundled 1348 modules in 198ms
3
3
 
4
- index.js 2.16 MB (entry point)
4
+ index.js 4.33 MB (entry point)
5
5
 
6
- Bundled 918 modules in 82ms
6
+ Bundled 1349 modules in 190ms
7
7
 
8
- plugin.js 2.12 MB (entry point)
8
+ plugin.js 4.30 MB (entry point)
9
9
 
package/CHANGELOG.md CHANGED
@@ -1,5 +1,170 @@
1
1
  # opencode-swarm-plugin
2
2
 
3
+ ## 0.42.1
4
+
5
+ ### Patch Changes
6
+
7
+ - [`f6707d5`](https://github.com/joelhooks/swarm-tools/commit/f6707d53eb92021b6976212e903994c98c798483) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🐦 @swarmtoolsai Now Tweets Releases
8
+
9
+ Automated release announcements are live! When packages publish to npm, Claude summarizes the changelog into a tweet and posts from @swarmtoolsai.
10
+
11
+ No more manual "hey we shipped" posts - the bees handle it now.
12
+
13
+ ## 0.42.0
14
+
15
+ ### Minor Changes
16
+
17
+ - [`a79e04b`](https://github.com/joelhooks/swarm-tools/commit/a79e04b1bb3b40c09c5265b5d11739864799e4e2) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🔭 Swarm Observability: See What Your Bees Are Doing
18
+
19
+ > "Observability is about instrumenting your system in a way that ensures sufficient information about a system's runtime is collected and analyzed so that when something goes wrong, it can help you understand why."
20
+ > — Chip Huyen, _AI Engineering_
21
+
22
+ New CLI commands to understand swarm health and history:
23
+
24
+ ### `swarm stats`
25
+
26
+ ```
27
+ ┌─────────────────────────────────────────┐
28
+ │ 🐝 SWARM STATISTICS 🐝 │
29
+ ├─────────────────────────────────────────┤
30
+ │ Total Swarms: 42 Success: 87% │
31
+ │ Avg Duration: 4.2min │
32
+ ├─────────────────────────────────────────┤
33
+ │ BY STRATEGY │
34
+ │ ├─ file-based 92% (23/25) │
35
+ │ ├─ feature-based 78% (14/18) │
36
+ │ ├─ risk-based 67% (2/3) │
37
+ ├─────────────────────────────────────────┤
38
+ │ COORDINATOR HEALTH │
39
+ │ Violation Rate: 2% │
40
+ │ Spawn Efficiency: 94% │
41
+ │ Review Rate: 88% │
42
+ └─────────────────────────────────────────┘
43
+ ```
44
+
45
+ Options: `--since 24h/7d/30d`, `--json`
46
+
47
+ ### `swarm history`
48
+
49
+ Timeline of recent swarm activity with filtering:
50
+
51
+ - `--status success/failed/in_progress`
52
+ - `--strategy file-based/feature-based/risk-based`
53
+ - `--verbose` for subtask details
54
+
55
+ ### Prompt Insights Integration
56
+
57
+ Coordinators and workers now receive injected insights from past swarm outcomes:
58
+
59
+ - Strategy success rates as markdown tables
60
+ - Anti-pattern warnings for low-success strategies
61
+ - File/domain-specific learnings from semantic memory
62
+
63
+ This creates a feedback loop where swarms learn from their own history.
64
+
65
+ ### Also in this release
66
+
67
+ - **swarm-dashboard** (WIP): React/Vite visualizer scaffold
68
+ - **ADR-006**: Swarm PTY decision document
69
+ - **CI fix**: Smarter changeset detection prevents empty PR errors
70
+
71
+ ### Patch Changes
72
+
73
+ - Updated dependencies [[`a79e04b`](https://github.com/joelhooks/swarm-tools/commit/a79e04b1bb3b40c09c5265b5d11739864799e4e2)]:
74
+ - swarm-mail@1.5.4
75
+
76
+ ## 0.41.0
77
+
78
+ ### Minor Changes
79
+
80
+ - [`179b3f0`](https://github.com/joelhooks/swarm-tools/commit/179b3f0e49c7959f8d754c1274d301d0b3845a79) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🐝 Compaction Prompt Now Speaks Swarm
81
+
82
+ > _"Memory is essential for communication: we recall past interactions, infer preferences, and construct evolving mental models of those we engage with."_
83
+ > — Mem0: Building Production-Ready AI Agents with Scalable Long-Term Memory
84
+
85
+ When context compacted mid-swarm, coordinators were waking up confused. They had state information but no protocol guidance. Now the compaction prompt includes a condensed version of the swarm command template.
86
+
87
+ **What's New:**
88
+
89
+ The `SWARM_COMPACTION_CONTEXT` now includes:
90
+
91
+ 1. **What Good Looks Like** - Behavioral examples showing ideal coordinator behavior
92
+
93
+ - ✅ Spawned researcher for unfamiliar tech → got summary → stored in semantic-memory
94
+ - ✅ Checked inbox every 5-10 minutes → caught blocked worker → unblocked in 2min
95
+ - ❌ Called context7 directly → dumped 50KB → context exhaustion
96
+
97
+ 2. **Mandatory Behaviors Checklist** - Post-compaction protocol
98
+ - Inbox monitoring (every 5-10 min with intervention triggers)
99
+ - Skill loading (before spawning workers)
100
+ - Worker review (after every worker returns, 3-strike rule)
101
+ - Research spawning (never call context7/pdf-brain directly)
102
+
103
+ **Why This Matters:**
104
+
105
+ Coordinators resuming from compaction now have:
106
+
107
+ - Clear behavioral guidance (not just state)
108
+ - Actionable tool call examples
109
+ - Anti-patterns to avoid
110
+ - The same protocol as fresh `/swarm` invocations
111
+
112
+ **Backward Compatible:** Existing compaction hooks continue to work. This adds guidance; it doesn't change the hook signature.
113
+
114
+ ### Patch Changes
115
+
116
+ - [`3e7c126`](https://github.com/joelhooks/swarm-tools/commit/3e7c126b11aa6ad909ebcb2ab3cf77883f9acfe4) Thanks [@joelhooks](https://github.com/joelhooks)! - ## 🧪 Bulletproof Test Suite
117
+
118
+ > "Setting up our tests to run synchronously and using mocking libraries will greatly speed up our testing"
119
+ > — ng-book
120
+
121
+ Fixed test isolation issues that caused 19 tests to fail when run together but pass in isolation.
122
+
123
+ ### The Culprits
124
+
125
+ **1. Global fetch pollution** (`ollama.test.ts`)
126
+
127
+ ```typescript
128
+ // BEFORE: Replaced global.fetch, never restored it
129
+ global.fetch = mockFetch;
130
+
131
+ // AFTER: Save and restore
132
+ const originalFetch = global.fetch;
133
+ afterEach(() => {
134
+ global.fetch = originalFetch;
135
+ });
136
+ ```
137
+
138
+ **2. Port conflicts** (`durable-server.test.ts`)
139
+
140
+ - Tests used hardcoded ports (4483, 4484, 4485)
141
+ - Parallel test runs fought over the same ports
142
+ - Fixed: Use `port: 0` for OS-assigned ports and make `server.url` a getter
143
+
144
+ **3. AI SDK schema incompatibility** (`memory-operations.ts`)
145
+
146
+ - `z.discriminatedUnion` creates `oneOf` at top level
147
+ - Anthropic API requires `type: object` at top level
148
+ - Fixed: Flat object schema with optional fields
149
+
150
+ ### Test Stats
151
+
152
+ ```
153
+ Before: 19 failures when run together
154
+ After: 0 failures, 1406 tests pass
155
+ ```
156
+
157
+ ### Files Changed
158
+
159
+ - `src/memory/ollama.test.ts` - Restore global.fetch after each test
160
+ - `src/streams/durable-server.ts` - Dynamic port getter
161
+ - `src/streams/durable-server.test.ts` - Use port 0, rewrite for isolation
162
+ - `src/memory/memory-operations.ts` - Flat schema for Anthropic compatibility
163
+ - Renamed `memory-operations.test.ts` → `memory-operations.integration.test.ts`
164
+
165
+ - Updated dependencies [[`3e7c126`](https://github.com/joelhooks/swarm-tools/commit/3e7c126b11aa6ad909ebcb2ab3cf77883f9acfe4)]:
166
+ - swarm-mail@1.5.3
167
+
3
168
  ## 0.40.0
4
169
 
5
170
  ### Minor Changes
package/README.md CHANGED
@@ -5,6 +5,8 @@
5
5
  **🌐 Website:** [swarmtools.ai](https://swarmtools.ai)
6
6
  **📚 Full Documentation:** [swarmtools.ai/docs](https://swarmtools.ai/docs)
7
7
 
8
+ [![Eval Gate](https://github.com/joelhooks/opencode-swarm-plugin/actions/workflows/eval-gate.yml/badge.svg)](https://github.com/joelhooks/opencode-swarm-plugin/actions/workflows/eval-gate.yml)
9
+
8
10
  ```
9
11
  ███████╗██╗ ██╗ █████╗ ██████╗ ███╗ ███╗
10
12
  ██╔════╝██║ ██║██╔══██╗██╔══██╗████╗ ████║