opencode-swarm-plugin 0.39.1 → 0.40.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.hive/issues.jsonl +16 -0
- package/CHANGELOG.md +52 -0
- package/bin/swarm.test.ts +406 -0
- package/bin/swarm.ts +303 -0
- package/dist/compaction-hook.d.ts +8 -1
- package/dist/compaction-hook.d.ts.map +1 -1
- package/dist/compaction-observability.d.ts +173 -0
- package/dist/compaction-observability.d.ts.map +1 -0
- package/dist/eval-capture.d.ts +93 -0
- package/dist/eval-capture.d.ts.map +1 -1
- package/dist/hive.d.ts.map +1 -1
- package/dist/index.d.ts +36 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +15670 -580
- package/dist/plugin.js +15623 -557
- package/dist/schemas/task.d.ts +3 -3
- package/evals/README.md +113 -0
- package/evals/scorers/coordinator-discipline.evalite-test.ts +163 -0
- package/evals/scorers/coordinator-discipline.ts +335 -2
- package/evals/scorers/index.test.ts +146 -0
- package/evals/scorers/index.ts +104 -0
- package/evals/swarm-decomposition.eval.ts +9 -2
- package/examples/commands/swarm.md +291 -21
- package/package.json +1 -1
- package/src/compaction-hook.ts +258 -110
- package/src/compaction-observability.integration.test.ts +139 -0
- package/src/compaction-observability.test.ts +187 -0
- package/src/compaction-observability.ts +324 -0
- package/src/eval-capture.test.ts +204 -1
- package/src/eval-capture.ts +194 -2
- package/src/eval-runner.test.ts +96 -0
- package/src/eval-runner.ts +356 -0
- package/src/hive.ts +34 -0
- package/src/index.ts +54 -1
- package/src/memory.test.ts +110 -0
- package/src/memory.ts +34 -0
- package/dist/beads.d.ts +0 -386
- package/dist/beads.d.ts.map +0 -1
- package/dist/schemas/bead-events.d.ts +0 -698
- package/dist/schemas/bead-events.d.ts.map +0 -1
- package/dist/schemas/bead.d.ts +0 -255
- package/dist/schemas/bead.d.ts.map +0 -1
package/src/eval-capture.test.ts
CHANGED
@@ -13,7 +13,7 @@ import {
   captureCoordinatorEvent,
   captureCompactionEvent,
   saveSession,
-} from "./eval-capture.
+} from "./eval-capture.ts";
 
 describe("CoordinatorEvent schemas", () => {
   describe("DECISION events", () => {
@@ -82,6 +82,110 @@ describe("CoordinatorEvent schemas", () => {
 
       expect(() => CoordinatorEventSchema.parse(event)).not.toThrow();
     });
+
+    test("validates researcher_spawned event", () => {
+      const event: CoordinatorEvent = {
+        session_id: "test-session",
+        epic_id: "bd-123",
+        timestamp: new Date().toISOString(),
+        event_type: "DECISION",
+        decision_type: "researcher_spawned",
+        payload: {
+          researcher_id: "BlueLake",
+          research_topic: "Next.js Cache Components",
+          tools_used: ["pdf-brain", "context7"],
+        },
+      };
+
+      expect(() => CoordinatorEventSchema.parse(event)).not.toThrow();
+    });
+
+    test("validates skill_loaded event", () => {
+      const event: CoordinatorEvent = {
+        session_id: "test-session",
+        epic_id: "bd-123",
+        timestamp: new Date().toISOString(),
+        event_type: "DECISION",
+        decision_type: "skill_loaded",
+        payload: {
+          skill_name: "testing-patterns",
+          context: "Adding tests to legacy code",
+        },
+      };
+
+      expect(() => CoordinatorEventSchema.parse(event)).not.toThrow();
+    });
+
+    test("validates inbox_checked event", () => {
+      const event: CoordinatorEvent = {
+        session_id: "test-session",
+        epic_id: "bd-123",
+        timestamp: new Date().toISOString(),
+        event_type: "DECISION",
+        decision_type: "inbox_checked",
+        payload: {
+          message_count: 3,
+          urgent_count: 1,
+        },
+      };
+
+      expect(() => CoordinatorEventSchema.parse(event)).not.toThrow();
+    });
+
+    test("validates blocker_resolved event", () => {
+      const event: CoordinatorEvent = {
+        session_id: "test-session",
+        epic_id: "bd-123",
+        timestamp: new Date().toISOString(),
+        event_type: "DECISION",
+        decision_type: "blocker_resolved",
+        payload: {
+          worker_id: "GreenStorm",
+          subtask_id: "bd-123.2",
+          blocker_type: "dependency",
+          resolution: "Unblocked via coordinator action",
+        },
+      };
+
+      expect(() => CoordinatorEventSchema.parse(event)).not.toThrow();
+    });
+
+    test("validates scope_change_approved event", () => {
+      const event: CoordinatorEvent = {
+        session_id: "test-session",
+        epic_id: "bd-123",
+        timestamp: new Date().toISOString(),
+        event_type: "DECISION",
+        decision_type: "scope_change_approved",
+        payload: {
+          worker_id: "BlueLake",
+          subtask_id: "bd-123.1",
+          original_scope: "Add auth service",
+          new_scope: "Add auth service + email validation",
+          estimated_time_add: 900000, // 15 min in ms
+        },
+      };
+
+      expect(() => CoordinatorEventSchema.parse(event)).not.toThrow();
+    });
+
+    test("validates scope_change_rejected event", () => {
+      const event: CoordinatorEvent = {
+        session_id: "test-session",
+        epic_id: "bd-123",
+        timestamp: new Date().toISOString(),
+        event_type: "DECISION",
+        decision_type: "scope_change_rejected",
+        payload: {
+          worker_id: "BlueLake",
+          subtask_id: "bd-123.1",
+          requested_scope: "Add auth service + OAuth + SSO",
+          rejection_reason: "Too large for single subtask",
+        },
+      };
+
+      expect(() => CoordinatorEventSchema.parse(event)).not.toThrow();
+    });
   });
 
   describe("VIOLATION events", () => {
@@ -215,6 +319,25 @@ describe("CoordinatorEvent schemas", () => {
 
       expect(() => CoordinatorEventSchema.parse(event)).not.toThrow();
     });
+
+    test("validates blocker_detected event", () => {
+      const event: CoordinatorEvent = {
+        session_id: "test-session",
+        epic_id: "bd-123",
+        timestamp: new Date().toISOString(),
+        event_type: "OUTCOME",
+        outcome_type: "blocker_detected",
+        payload: {
+          worker_id: "GreenStorm",
+          subtask_id: "bd-123.2",
+          blocker_type: "dependency",
+          blocker_description: "Waiting for database schema from bd-123.1",
+          reported_at: new Date().toISOString(),
+        },
+      };
+
+      expect(() => CoordinatorEventSchema.parse(event)).not.toThrow();
+    });
   });
 });
 
@@ -810,3 +933,83 @@ describe("captureCompactionEvent", () => {
     expect(capturedEvents[4].compaction_type).toBe("tool_call_tracked");
   });
 });
+
+describe("hive_create_epic integration - decomposition_complete event", () => {
+  let sessionDir: string;
+  let sessionId: string;
+  const testProjectPath = "/tmp/test-epic-decomposition";
+
+  beforeEach(() => {
+    sessionDir = path.join(os.homedir(), ".config", "swarm-tools", "sessions");
+    sessionId = `test-epic-${Date.now()}`;
+  });
+
+  afterEach(() => {
+    // Clean up test session file
+    const sessionPath = path.join(sessionDir, `${sessionId}.jsonl`);
+    if (fs.existsSync(sessionPath)) {
+      fs.unlinkSync(sessionPath);
+    }
+  });
+
+  test("captures decomposition_complete event after hive_create_epic succeeds", async () => {
+    // Test the event capture by calling captureCoordinatorEvent directly
+    // Testing hive_create_epic directly would require full plugin infrastructure
+
+    // GIVEN: We simulate what hive_create_epic does after epic creation
+    const epicId = `test-epic-${Date.now()}`;
+    const subtasks = [
+      { title: "Subtask 1", files: ["src/a.ts"] },
+      { title: "Subtask 2", files: ["src/b.ts", "src/c.ts"] },
+      { title: "Subtask 3", files: ["src/d.ts"] },
+    ];
+
+    // Build files_per_subtask map (same logic as hive.ts)
+    const filesPerSubtask: Record<number, string[]> = {};
+    subtasks.forEach((subtask, index) => {
+      if (subtask.files && subtask.files.length > 0) {
+        filesPerSubtask[index] = subtask.files;
+      }
+    });
+
+    // WHEN: decomposition_complete event is captured
+    captureCoordinatorEvent({
+      session_id: sessionId,
+      epic_id: epicId,
+      timestamp: new Date().toISOString(),
+      event_type: "DECISION",
+      decision_type: "decomposition_complete",
+      payload: {
+        subtask_count: subtasks.length,
+        strategy_used: "file-based",
+        files_per_subtask: filesPerSubtask,
+        epic_title: "Test Epic for Event Capture",
+        task: "Original task description",
+      },
+    });
+
+    // THEN: Event should be written to session file
+    const sessionPath = path.join(sessionDir, `${sessionId}.jsonl`);
+    expect(fs.existsSync(sessionPath)).toBe(true);
+
+    const content = fs.readFileSync(sessionPath, "utf-8");
+    const lines = content.trim().split("\n").filter(Boolean);
+    expect(lines.length).toBe(1);
+
+    // Verify event structure
+    const event = JSON.parse(lines[0]);
+    expect(event.session_id).toBe(sessionId);
+    expect(event.epic_id).toBe(epicId);
+    expect(event.event_type).toBe("DECISION");
+    expect(event.decision_type).toBe("decomposition_complete");
+    expect(event.payload.subtask_count).toBe(3);
+    expect(event.payload.strategy_used).toBe("file-based");
+    expect(event.payload.files_per_subtask).toEqual({
+      0: ["src/a.ts"],
+      1: ["src/b.ts", "src/c.ts"],
+      2: ["src/d.ts"],
+    });
+    expect(event.payload.epic_title).toBe("Test Epic for Event Capture");
+    expect(event.payload.task).toBe("Original task description");
+  });
+});
package/src/eval-capture.ts
CHANGED
@@ -13,9 +13,9 @@
  * 6. Session capture: full coordinator session to ~/.config/swarm-tools/sessions/
  *
  * Event types:
- * - DECISION: strategy_selected, worker_spawned, review_completed, decomposition_complete
+ * - DECISION: strategy_selected, worker_spawned, review_completed, decomposition_complete, researcher_spawned, skill_loaded, inbox_checked, blocker_resolved, scope_change_approved, scope_change_rejected
  * - VIOLATION: coordinator_edited_file, coordinator_ran_tests, coordinator_reserved_files, no_worker_spawned
- * - OUTCOME: subtask_success, subtask_retry, subtask_failed, epic_complete
+ * - OUTCOME: subtask_success, subtask_retry, subtask_failed, epic_complete, blocker_detected
  * - COMPACTION: detection_complete, prompt_generated, context_injected, resumption_started, tool_call_tracked
  *
  * @module eval-capture
@@ -143,6 +143,12 @@ export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
       "worker_spawned",
       "review_completed",
       "decomposition_complete",
+      "researcher_spawned",
+      "skill_loaded",
+      "inbox_checked",
+      "blocker_resolved",
+      "scope_change_approved",
+      "scope_change_rejected",
     ]),
     payload: z.any(),
   }),
@@ -171,6 +177,7 @@ export const CoordinatorEventSchema = z.discriminatedUnion("event_type", [
       "subtask_retry",
       "subtask_failed",
       "epic_complete",
+      "blocker_detected",
     ]),
     payload: z.any(),
   }),
@@ -687,6 +694,191 @@ export function captureCompactionEvent(params: {
   captureCoordinatorEvent(event);
 }
 
+/**
+ * Capture a researcher spawned event
+ *
+ * Called when coordinator spawns a swarm-researcher to handle unfamiliar technology
+ * or gather documentation before decomposition.
+ */
+export function captureResearcherSpawned(params: {
+  session_id: string;
+  epic_id: string;
+  researcher_id: string;
+  research_topic: string;
+  tools_used?: string[];
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: "researcher_spawned",
+    payload: {
+      researcher_id: params.researcher_id,
+      research_topic: params.research_topic,
+      tools_used: params.tools_used || [],
+    },
+  };
+
+  captureCoordinatorEvent(event);
+}
+
+/**
+ * Capture a skill loaded event
+ *
+ * Called when coordinator loads domain knowledge via skills_use().
+ */
+export function captureSkillLoaded(params: {
+  session_id: string;
+  epic_id: string;
+  skill_name: string;
+  context?: string;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: "skill_loaded",
+    payload: {
+      skill_name: params.skill_name,
+      context: params.context,
+    },
+  };
+
+  captureCoordinatorEvent(event);
+}
+
+/**
+ * Capture an inbox checked event
+ *
+ * Called when coordinator checks swarmmail inbox for worker messages.
+ * Tracks monitoring frequency and responsiveness.
+ */
+export function captureInboxChecked(params: {
+  session_id: string;
+  epic_id: string;
+  message_count: number;
+  urgent_count: number;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: "inbox_checked",
+    payload: {
+      message_count: params.message_count,
+      urgent_count: params.urgent_count,
+    },
+  };
+
+  captureCoordinatorEvent(event);
+}
+
+/**
+ * Capture a blocker resolved event
+ *
+ * Called when coordinator successfully unblocks a worker.
+ */
+export function captureBlockerResolved(params: {
+  session_id: string;
+  epic_id: string;
+  worker_id: string;
+  subtask_id: string;
+  blocker_type: string;
+  resolution: string;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: "blocker_resolved",
+    payload: {
+      worker_id: params.worker_id,
+      subtask_id: params.subtask_id,
+      blocker_type: params.blocker_type,
+      resolution: params.resolution,
+    },
+  };
+
+  captureCoordinatorEvent(event);
+}
+
+/**
+ * Capture a scope change decision event
+ *
+ * Called when coordinator approves or rejects a worker's scope expansion request.
+ */
+export function captureScopeChangeDecision(params: {
+  session_id: string;
+  epic_id: string;
+  worker_id: string;
+  subtask_id: string;
+  approved: boolean;
+  original_scope?: string;
+  new_scope?: string;
+  requested_scope?: string;
+  rejection_reason?: string;
+  estimated_time_add?: number;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "DECISION",
+    decision_type: params.approved ? "scope_change_approved" : "scope_change_rejected",
+    payload: params.approved
+      ? {
+          worker_id: params.worker_id,
+          subtask_id: params.subtask_id,
+          original_scope: params.original_scope,
+          new_scope: params.new_scope,
+          estimated_time_add: params.estimated_time_add,
+        }
+      : {
+          worker_id: params.worker_id,
+          subtask_id: params.subtask_id,
+          requested_scope: params.requested_scope,
+          rejection_reason: params.rejection_reason,
+        },
+  };
+
+  captureCoordinatorEvent(event);
+}
+
+/**
+ * Capture a blocker detected event
+ *
+ * Called when a worker reports being blocked (OUTCOME event, not DECISION).
+ */
+export function captureBlockerDetected(params: {
+  session_id: string;
+  epic_id: string;
+  worker_id: string;
+  subtask_id: string;
+  blocker_type: string;
+  blocker_description: string;
+}): void {
+  const event: CoordinatorEvent = {
+    session_id: params.session_id,
+    epic_id: params.epic_id,
+    timestamp: new Date().toISOString(),
+    event_type: "OUTCOME",
+    outcome_type: "blocker_detected",
+    payload: {
+      worker_id: params.worker_id,
+      subtask_id: params.subtask_id,
+      blocker_type: params.blocker_type,
+      blocker_description: params.blocker_description,
+      reported_at: new Date().toISOString(),
+    },
+  };
+
+  captureCoordinatorEvent(event);
+}
+
 /**
  * Read all events from a session file
  */
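Each function added above is a thin, typed wrapper that stamps an ISO timestamp, selects the event and decision/outcome type, and forwards to captureCoordinatorEvent. A usage sketch with illustrative ids and scopes (the function names and parameter shapes come from the diff above; everything else is hypothetical):

import {
  captureScopeChangeDecision,
  captureBlockerDetected,
} from "./eval-capture.ts";

// Approval path: records a DECISION event with decision_type
// "scope_change_approved" and the approved-scope payload shape.
captureScopeChangeDecision({
  session_id: "demo-session", // illustrative
  epic_id: "bd-999",          // illustrative
  worker_id: "BlueLake",
  subtask_id: "bd-999.1",
  approved: true,
  original_scope: "Add auth service",
  new_scope: "Add auth service + email validation",
  estimated_time_add: 900000, // ms
});

// A worker-reported blocker is recorded as an OUTCOME event, not a DECISION.
captureBlockerDetected({
  session_id: "demo-session",
  epic_id: "bd-999",
  worker_id: "GreenStorm",
  subtask_id: "bd-999.2",
  blocker_type: "dependency",
  blocker_description: "Waiting on schema from bd-999.1",
});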
package/src/eval-runner.test.ts
ADDED

@@ -0,0 +1,96 @@
+/**
+ * Tests for eval-runner - Programmatic evalite execution
+ *
+ * TDD: These tests MUST fail initially, then pass after implementation.
+ */
+
+import { describe, test, expect, beforeAll } from "bun:test";
+import { runEvals } from "./eval-runner";
+import path from "node:path";
+
+// Use project root for all tests
+const PROJECT_ROOT = path.resolve(import.meta.dir, "..");
+
+describe("runEvals", () => {
+  test("runs all evals when no suite filter provided", async () => {
+    const result = await runEvals({
+      cwd: PROJECT_ROOT,
+    });
+
+    // Even if some evals fail, we should get results
+    expect(typeof result.success).toBe("boolean");
+    expect(typeof result.totalSuites).toBe("number");
+    expect(typeof result.totalEvals).toBe("number");
+    expect(typeof result.averageScore).toBe("number");
+    expect(Array.isArray(result.suites)).toBe(true);
+
+    // Should have at least the example.eval.ts suite
+    expect(result.totalSuites).toBeGreaterThan(0);
+    expect(result.suites.length).toBeGreaterThan(0);
+  }, 60000); // 60s timeout for full eval run
+
+  test("filters evals by suite name", async () => {
+    const result = await runEvals({
+      cwd: PROJECT_ROOT,
+      suiteFilter: "example",
+    });
+
+    expect(result.success).toBe(true);
+    // All suite filepaths should contain "example"
+    for (const suite of result.suites) {
+      expect(suite.filepath.toLowerCase()).toContain("example");
+    }
+  }, 30000);
+
+  test("respects score threshold", async () => {
+    const result = await runEvals({
+      cwd: PROJECT_ROOT,
+      suiteFilter: "example", // Known good eval
+      scoreThreshold: 0, // Very low threshold, should pass
+    });
+
+    expect(result.success).toBe(true);
+    expect(result.averageScore).toBeGreaterThanOrEqual(0);
+  }, 30000);
+
+  test("returns structured suite results with scores", async () => {
+    const result = await runEvals({
+      cwd: PROJECT_ROOT,
+      suiteFilter: "example",
+    });
+
+    expect(result.suites.length).toBeGreaterThan(0);
+
+    const suite = result.suites[0];
+    expect(suite).toMatchObject({
+      name: expect.any(String),
+      filepath: expect.any(String),
+      status: expect.stringMatching(/^(success|fail|running)$/),
+      duration: expect.any(Number),
+      averageScore: expect.any(Number),
+      evalCount: expect.any(Number),
+    });
+  }, 30000);
+
+  test("handles errors gracefully", async () => {
+    const result = await runEvals({
+      cwd: "/nonexistent/path",
+    });
+
+    expect(result.success).toBe(false);
+    expect(result.error).toBeDefined();
+    expect(result.suites).toEqual([]);
+  }, 10000);
+
+  test("returns empty results when no evals match filter", async () => {
+    const result = await runEvals({
+      cwd: PROJECT_ROOT,
+      suiteFilter: "nonexistent-eval-name-xyz",
+    });
+
+    // Should succeed but with no suites
+    expect(result.success).toBe(true);
+    expect(result.totalSuites).toBe(0);
+    expect(result.suites).toEqual([]);
+  }, 10000);
+});