opencode-swarm-plugin 0.38.0 → 0.40.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/.env +2 -0
  2. package/.hive/eval-results.json +26 -0
  3. package/.hive/issues.jsonl +27 -0
  4. package/.hive/memories.jsonl +23 -1
  5. package/.opencode/eval-history.jsonl +12 -0
  6. package/CHANGELOG.md +182 -0
  7. package/README.md +29 -12
  8. package/bin/swarm.test.ts +881 -0
  9. package/bin/swarm.ts +686 -0
  10. package/dist/compaction-hook.d.ts +8 -1
  11. package/dist/compaction-hook.d.ts.map +1 -1
  12. package/dist/compaction-observability.d.ts +173 -0
  13. package/dist/compaction-observability.d.ts.map +1 -0
  14. package/dist/compaction-prompt-scoring.d.ts +124 -0
  15. package/dist/compaction-prompt-scoring.d.ts.map +1 -0
  16. package/dist/eval-capture.d.ts +174 -1
  17. package/dist/eval-capture.d.ts.map +1 -1
  18. package/dist/eval-gates.d.ts +84 -0
  19. package/dist/eval-gates.d.ts.map +1 -0
  20. package/dist/eval-history.d.ts +117 -0
  21. package/dist/eval-history.d.ts.map +1 -0
  22. package/dist/eval-learning.d.ts +216 -0
  23. package/dist/eval-learning.d.ts.map +1 -0
  24. package/dist/hive.d.ts.map +1 -1
  25. package/dist/index.d.ts +80 -1
  26. package/dist/index.d.ts.map +1 -1
  27. package/dist/index.js +16098 -651
  28. package/dist/plugin.js +16012 -756
  29. package/dist/post-compaction-tracker.d.ts +133 -0
  30. package/dist/post-compaction-tracker.d.ts.map +1 -0
  31. package/dist/schemas/task.d.ts +3 -3
  32. package/dist/swarm-orchestrate.d.ts +23 -0
  33. package/dist/swarm-orchestrate.d.ts.map +1 -1
  34. package/dist/swarm-prompts.d.ts +25 -1
  35. package/dist/swarm-prompts.d.ts.map +1 -1
  36. package/dist/swarm.d.ts +4 -0
  37. package/dist/swarm.d.ts.map +1 -1
  38. package/evals/README.md +702 -105
  39. package/evals/compaction-prompt.eval.ts +149 -0
  40. package/evals/coordinator-behavior.eval.ts +8 -8
  41. package/evals/fixtures/compaction-prompt-cases.ts +305 -0
  42. package/evals/lib/compaction-loader.test.ts +248 -0
  43. package/evals/lib/compaction-loader.ts +320 -0
  44. package/evals/lib/data-loader.test.ts +345 -0
  45. package/evals/lib/data-loader.ts +107 -6
  46. package/evals/scorers/compaction-prompt-scorers.ts +145 -0
  47. package/evals/scorers/compaction-scorers.ts +13 -13
  48. package/evals/scorers/coordinator-discipline.evalite-test.ts +166 -2
  49. package/evals/scorers/coordinator-discipline.ts +348 -15
  50. package/evals/scorers/index.test.ts +146 -0
  51. package/evals/scorers/index.ts +104 -0
  52. package/evals/swarm-decomposition.eval.ts +9 -2
  53. package/examples/commands/swarm.md +291 -21
  54. package/examples/plugin-wrapper-template.ts +117 -0
  55. package/package.json +7 -5
  56. package/scripts/migrate-unknown-sessions.ts +349 -0
  57. package/src/compaction-capture.integration.test.ts +257 -0
  58. package/src/compaction-hook.test.ts +42 -0
  59. package/src/compaction-hook.ts +315 -86
  60. package/src/compaction-observability.integration.test.ts +139 -0
  61. package/src/compaction-observability.test.ts +187 -0
  62. package/src/compaction-observability.ts +324 -0
  63. package/src/compaction-prompt-scorers.test.ts +299 -0
  64. package/src/compaction-prompt-scoring.ts +298 -0
  65. package/src/eval-capture.test.ts +626 -1
  66. package/src/eval-capture.ts +286 -2
  67. package/src/eval-gates.test.ts +306 -0
  68. package/src/eval-gates.ts +218 -0
  69. package/src/eval-history.test.ts +508 -0
  70. package/src/eval-history.ts +214 -0
  71. package/src/eval-learning.test.ts +378 -0
  72. package/src/eval-learning.ts +360 -0
  73. package/src/eval-runner.test.ts +96 -0
  74. package/src/eval-runner.ts +356 -0
  75. package/src/hive.ts +34 -0
  76. package/src/index.ts +115 -2
  77. package/src/memory.test.ts +110 -0
  78. package/src/memory.ts +34 -0
  79. package/src/post-compaction-tracker.test.ts +251 -0
  80. package/src/post-compaction-tracker.ts +237 -0
  81. package/src/swarm-decompose.ts +2 -2
  82. package/src/swarm-orchestrate.ts +2 -2
  83. package/src/swarm-prompts.ts +2 -2
  84. package/src/swarm-review.ts +3 -3
  85. package/dist/beads.d.ts +0 -386
  86. package/dist/beads.d.ts.map +0 -1
  87. package/dist/schemas/bead-events.d.ts +0 -698
  88. package/dist/schemas/bead-events.d.ts.map +0 -1
  89. package/dist/schemas/bead.d.ts +0 -255
  90. package/dist/schemas/bead.d.ts.map +0 -1
  91. /package/evals/{evalite.config.ts → evalite.config.ts.bak} +0 -0
@@ -5,6 +5,7 @@ import { describe, expect, it } from "bun:test";
5
5
  import type { CoordinatorSession } from "../../src/eval-capture.js";
6
6
  import {
7
7
  overallDiscipline,
8
+ reviewEfficiency,
8
9
  reviewThoroughness,
9
10
  spawnEfficiency,
10
11
  timeToFirstSpawn,
@@ -12,7 +13,7 @@ import {
12
13
  } from "./coordinator-discipline.js";
13
14
 
14
15
  describe("violationCount", () => {
15
- it("scores 1.0 for zero violations", () => {
16
+ it("scores 1.0 for zero violations", async () => {
16
17
  const session: CoordinatorSession = {
17
18
  session_id: "test-session",
18
19
  epic_id: "test-epic",
@@ -30,9 +31,10 @@ describe("violationCount", () => {
30
31
  ],
31
32
  };
32
33
 
33
- const result = violationCount.scorer({
34
+ const result = await violationCount({
34
35
  output: JSON.stringify(session),
35
36
  expected: {},
37
+ input: undefined,
36
38
  });
37
39
 
38
40
  expect(result.score).toBe(1.0);
@@ -534,3 +536,165 @@ describe("overallDiscipline", () => {
534
536
  expect(result.message).toContain("Speed:");
535
537
  });
536
538
  });
539
+
540
+ describe("reviewEfficiency", () => {
541
+ it("scores 1.0 for ideal 1:1 ratio (one review per spawn)", async () => {
542
+ const session: CoordinatorSession = {
543
+ session_id: "test-session",
544
+ epic_id: "test-epic",
545
+ start_time: "2025-01-01T00:00:00Z",
546
+ events: [
547
+ {
548
+ session_id: "test-session",
549
+ epic_id: "test-epic",
550
+ timestamp: "2025-01-01T00:00:10Z",
551
+ event_type: "DECISION",
552
+ decision_type: "worker_spawned",
553
+ payload: { bead_id: "bd-1" },
554
+ },
555
+ {
556
+ session_id: "test-session",
557
+ epic_id: "test-epic",
558
+ timestamp: "2025-01-01T00:00:20Z",
559
+ event_type: "DECISION",
560
+ decision_type: "worker_spawned",
561
+ payload: { bead_id: "bd-2" },
562
+ },
563
+ {
564
+ session_id: "test-session",
565
+ epic_id: "test-epic",
566
+ timestamp: "2025-01-01T00:10:00Z",
567
+ event_type: "DECISION",
568
+ decision_type: "review_completed",
569
+ payload: { bead_id: "bd-1" },
570
+ },
571
+ {
572
+ session_id: "test-session",
573
+ epic_id: "test-epic",
574
+ timestamp: "2025-01-01T00:10:10Z",
575
+ event_type: "DECISION",
576
+ decision_type: "review_completed",
577
+ payload: { bead_id: "bd-2" },
578
+ },
579
+ ],
580
+ };
581
+
582
+ const result = await reviewEfficiency({
583
+ output: JSON.stringify(session),
584
+ expected: {},
585
+ input: undefined,
586
+ });
587
+
588
+ expect(result.score).toBe(1.0);
589
+ expect(result.message).toContain("2 reviews / 2 spawns");
590
+ });
591
+
592
+ it("penalizes over-reviewing (>2:1 ratio)", async () => {
593
+ // 6 reviews for 2 spawns = 3:1 ratio (over-reviewing)
594
+ const session: CoordinatorSession = {
595
+ session_id: "test-session",
596
+ epic_id: "test-epic",
597
+ start_time: "2025-01-01T00:00:00Z",
598
+ events: [
599
+ {
600
+ session_id: "test-session",
601
+ epic_id: "test-epic",
602
+ timestamp: "2025-01-01T00:00:10Z",
603
+ event_type: "DECISION",
604
+ decision_type: "worker_spawned",
605
+ payload: { bead_id: "bd-1" },
606
+ },
607
+ {
608
+ session_id: "test-session",
609
+ epic_id: "test-epic",
610
+ timestamp: "2025-01-01T00:00:20Z",
611
+ event_type: "DECISION",
612
+ decision_type: "worker_spawned",
613
+ payload: { bead_id: "bd-2" },
614
+ },
615
+ ...Array.from({ length: 6 }, (_, i) => ({
616
+ session_id: "test-session",
617
+ epic_id: "test-epic",
618
+ timestamp: `2025-01-01T00:10:${String(i * 10).padStart(2, "0")}Z`,
619
+ event_type: "DECISION" as const,
620
+ decision_type: "review_completed" as const,
621
+ payload: { bead_id: `bd-${(i % 2) + 1}` },
622
+ })),
623
+ ],
624
+ };
625
+
626
+ const result = await reviewEfficiency({
627
+ output: JSON.stringify(session),
628
+ expected: {},
629
+ input: undefined,
630
+ });
631
+
632
+ // 3:1 ratio should be penalized (score < 0.5)
633
+ expect(result.score).toBeLessThan(0.5);
634
+ expect(result.message).toContain("6 reviews / 2 spawns");
635
+ });
636
+
637
+ it("handles no spawns gracefully", async () => {
638
+ const session: CoordinatorSession = {
639
+ session_id: "test-session",
640
+ epic_id: "test-epic",
641
+ start_time: "2025-01-01T00:00:00Z",
642
+ events: [
643
+ {
644
+ session_id: "test-session",
645
+ epic_id: "test-epic",
646
+ timestamp: "2025-01-01T00:00:00Z",
647
+ event_type: "DECISION",
648
+ decision_type: "strategy_selected",
649
+ payload: { strategy: "file-based" },
650
+ },
651
+ ],
652
+ };
653
+
654
+ const result = await reviewEfficiency({
655
+ output: JSON.stringify(session),
656
+ expected: {},
657
+ input: undefined,
658
+ });
659
+
660
+ expect(result.score).toBe(1.0);
661
+ expect(result.message).toContain("No workers spawned");
662
+ });
663
+
664
+ it("handles no reviews gracefully (0:N ratio)", async () => {
665
+ const session: CoordinatorSession = {
666
+ session_id: "test-session",
667
+ epic_id: "test-epic",
668
+ start_time: "2025-01-01T00:00:00Z",
669
+ events: [
670
+ {
671
+ session_id: "test-session",
672
+ epic_id: "test-epic",
673
+ timestamp: "2025-01-01T00:00:10Z",
674
+ event_type: "DECISION",
675
+ decision_type: "worker_spawned",
676
+ payload: { bead_id: "bd-1" },
677
+ },
678
+ {
679
+ session_id: "test-session",
680
+ epic_id: "test-epic",
681
+ timestamp: "2025-01-01T00:00:20Z",
682
+ event_type: "DECISION",
683
+ decision_type: "worker_spawned",
684
+ payload: { bead_id: "bd-2" },
685
+ },
686
+ ],
687
+ };
688
+
689
+ const result = await reviewEfficiency({
690
+ output: JSON.stringify(session),
691
+ expected: {},
692
+ input: undefined,
693
+ });
694
+
695
+ // No reviews is bad (should use reviewThoroughness for this)
696
+ // But this scorer focuses on over-reviewing, so no reviews = 1.0 (not over-reviewing)
697
+ expect(result.score).toBe(1.0);
698
+ expect(result.message).toContain("0 reviews / 2 spawns");
699
+ });
700
+ });
@@ -70,6 +70,9 @@ export const violationCount = createScorer({
70
70
  * Coordinators should delegate work, not do it themselves.
71
71
  *
72
72
  * Score: workers_spawned / subtasks_planned
73
+ *
74
+ * If no decomposition_complete event exists, falls back to counting spawns
75
+ * and returns 1.0 if any workers were spawned (better than nothing).
73
76
  */
74
77
  export const spawnEfficiency = createScorer({
75
78
  name: "Spawn Efficiency",
@@ -85,7 +88,20 @@ export const spawnEfficiency = createScorer({
85
88
  e.decision_type === "decomposition_complete"
86
89
  );
87
90
 
91
+ // Count worker_spawned events
92
+ const spawned = session.events.filter(
93
+ (e) =>
94
+ e.event_type === "DECISION" && e.decision_type === "worker_spawned"
95
+ ).length;
96
+
88
97
  if (!decomp) {
98
+ // Fallback: if workers were spawned but no decomp event, assume they're doing work
99
+ if (spawned > 0) {
100
+ return {
101
+ score: 1.0,
102
+ message: `${spawned} workers spawned (no decomposition event)`,
103
+ };
104
+ }
89
105
  return {
90
106
  score: 0,
91
107
  message: "No decomposition event found",
@@ -101,17 +117,81 @@ export const spawnEfficiency = createScorer({
101
117
  };
102
118
  }
103
119
 
120
+ const score = spawned / subtaskCount;
121
+
122
+ return {
123
+ score,
124
+ message: `${spawned}/${subtaskCount} workers spawned (${(score * 100).toFixed(0)}%)`,
125
+ };
126
+ } catch (error) {
127
+ return {
128
+ score: 0,
129
+ message: `Failed to parse CoordinatorSession: ${error}`,
130
+ };
131
+ }
132
+ },
133
+ });
134
+
135
+ /**
136
+ * Review Efficiency Scorer
137
+ *
138
+ * Measures review-to-spawn ratio to detect over-reviewing.
139
+ * Ideal ratio is 1:1 (one review per spawned worker).
140
+ * Penalizes >2:1 ratio (over-reviewing wastes context).
141
+ *
142
+ * Scoring:
143
+ * - 0:N or 1:1 ratio = 1.0 (perfect)
144
+ * - 1:1 to 2:1 ratio = linear decay from 1.0 to 0.5 (2:1 is the threshold)
145
+ * - >2:1 ratio = linear penalty from 0.5, reaching 0.0 at 4:1 and above
146
+ *
147
+ * Score: normalized to 0-1 (lower ratio is better)
148
+ */
149
+ export const reviewEfficiency = createScorer({
150
+ name: "Review Efficiency",
151
+ description: "Review-to-spawn ratio (penalize over-reviewing >2:1)",
152
+ scorer: ({ output }) => {
153
+ try {
154
+ const session = JSON.parse(String(output)) as CoordinatorSession;
155
+
104
156
  // Count worker_spawned events
105
157
  const spawned = session.events.filter(
106
158
  (e) =>
107
159
  e.event_type === "DECISION" && e.decision_type === "worker_spawned"
108
160
  ).length;
109
161
 
110
- const score = spawned / subtaskCount;
162
+ if (spawned === 0) {
163
+ return {
164
+ score: 1.0,
165
+ message: "No workers spawned",
166
+ };
167
+ }
168
+
169
+ // Count review_completed events
170
+ const reviewed = session.events.filter(
171
+ (e) =>
172
+ e.event_type === "DECISION" && e.decision_type === "review_completed"
173
+ ).length;
174
+
175
+ const ratio = reviewed / spawned;
176
+
177
+ // Scoring:
178
+ // - ratio <= 1.0: perfect (1.0)
179
+ // - ratio <= 2.0: linear decay from 1.0 to 0.5
180
+ // - ratio > 2.0: linear penalty from 0.5 toward 0.0
181
+ let score: number;
182
+ if (ratio <= 1.0) {
183
+ score = 1.0;
184
+ } else if (ratio <= 2.0) {
185
+ // Linear decay: 1.0 at ratio=1.0, 0.5 at ratio=2.0
186
+ score = 1.0 - (ratio - 1.0) * 0.5;
187
+ } else {
188
+ // Penalty for extreme over-reviewing: 0.5 at ratio=2.0, 0.0 at ratio=4.0
189
+ score = Math.max(0, 0.5 - (ratio - 2.0) * 0.25);
190
+ }
111
191
 
112
192
  return {
113
193
  score,
114
- message: `${spawned}/${subtaskCount} workers spawned (${(score * 100).toFixed(0)}%)`,
194
+ message: `${reviewed} reviews / ${spawned} spawns (${ratio.toFixed(1)}:1 ratio)`,
115
195
  };
116
196
  } catch (error) {
117
197
  return {
@@ -254,6 +334,259 @@ export const timeToFirstSpawn = createScorer({
254
334
  },
255
335
  });
256
336
 
337
+ /**
338
+ * Researcher Spawn Rate Scorer
339
+ *
340
+ * Measures whether coordinator spawns researchers for unfamiliar technology.
341
+ * Coordinators should delegate research instead of calling pdf-brain/context7 directly.
342
+ *
343
+ * Score: 1.0 if researcher_spawned events exist, 0.0 otherwise
344
+ */
345
+ export const researcherSpawnRate = createScorer({
346
+ name: "Researcher Spawn Rate",
347
+ description: "Coordinator spawned researchers for unfamiliar tech",
348
+ scorer: ({ output }) => {
349
+ try {
350
+ const session = JSON.parse(String(output)) as CoordinatorSession;
351
+
352
+ // Count researcher_spawned events
353
+ const researchers = session.events.filter(
354
+ (e) =>
355
+ e.event_type === "DECISION" && e.decision_type === "researcher_spawned"
356
+ );
357
+
358
+ const count = researchers.length;
359
+
360
+ if (count === 0) {
361
+ return {
362
+ score: 0.0,
363
+ message: "No researchers spawned (may indicate coordinator queried docs directly)",
364
+ };
365
+ }
366
+
367
+ return {
368
+ score: 1.0,
369
+ message: `${count} researcher(s) spawned`,
370
+ };
371
+ } catch (error) {
372
+ return {
373
+ score: 0,
374
+ message: `Failed to parse CoordinatorSession: ${error}`,
375
+ };
376
+ }
377
+ },
378
+ });
379
+
380
+ /**
381
+ * Skill Loading Rate Scorer
382
+ *
383
+ * Measures whether coordinator loads relevant skills via skills_use().
384
+ * Shows knowledge-seeking behavior.
385
+ *
386
+ * Score: 1.0 if skill_loaded events exist, 0.5 otherwise (not critical, but helpful)
387
+ */
388
+ export const skillLoadingRate = createScorer({
389
+ name: "Skill Loading Rate",
390
+ description: "Coordinator loaded relevant skills for domain knowledge",
391
+ scorer: ({ output }) => {
392
+ try {
393
+ const session = JSON.parse(String(output)) as CoordinatorSession;
394
+
395
+ // Count skill_loaded events
396
+ const skills = session.events.filter(
397
+ (e) =>
398
+ e.event_type === "DECISION" && e.decision_type === "skill_loaded"
399
+ );
400
+
401
+ const count = skills.length;
402
+
403
+ if (count === 0) {
404
+ return {
405
+ score: 0.5,
406
+ message: "No skills loaded (not critical, but helpful)",
407
+ };
408
+ }
409
+
410
+ return {
411
+ score: 1.0,
412
+ message: `${count} skill(s) loaded`,
413
+ };
414
+ } catch (error) {
415
+ return {
416
+ score: 0,
417
+ message: `Failed to parse CoordinatorSession: ${error}`,
418
+ };
419
+ }
420
+ },
421
+ });
422
+
423
+ /**
424
+ * Inbox Monitoring Rate Scorer
425
+ *
426
+ * Measures how frequently coordinator checks inbox for worker messages.
427
+ * Regular monitoring (every ~15min or when workers finish) shows good coordination.
428
+ *
429
+ * Score based on inbox_checked events relative to worker activity:
430
+ * - 0 checks = 0.0 (coordinator not monitoring), unless there was no worker activity at all (then 1.0)
431
+ * - 1+ checks = 1.0 (coordinator is responsive)
432
+ */
433
+ export const inboxMonitoringRate = createScorer({
434
+ name: "Inbox Monitoring Rate",
435
+ description: "Coordinator checked inbox regularly for worker messages",
436
+ scorer: ({ output }) => {
437
+ try {
438
+ const session = JSON.parse(String(output)) as CoordinatorSession;
439
+
440
+ // Count inbox_checked events
441
+ const checks = session.events.filter(
442
+ (e) =>
443
+ e.event_type === "DECISION" && e.decision_type === "inbox_checked"
444
+ );
445
+
446
+ // Count worker activity (spawns + outcomes)
447
+ const workerActivity = session.events.filter(
448
+ (e) =>
449
+ (e.event_type === "DECISION" && e.decision_type === "worker_spawned") ||
450
+ (e.event_type === "OUTCOME" &&
451
+ ["subtask_success", "subtask_failed", "blocker_detected"].includes(
452
+ e.outcome_type
453
+ ))
454
+ );
455
+
456
+ const checkCount = checks.length;
457
+ const activityCount = workerActivity.length;
458
+
459
+ if (activityCount === 0) {
460
+ return {
461
+ score: 1.0,
462
+ message: "No worker activity to monitor",
463
+ };
464
+ }
465
+
466
+ if (checkCount === 0) {
467
+ return {
468
+ score: 0.0,
469
+ message: `${activityCount} worker events, 0 inbox checks (not monitoring)`,
470
+ };
471
+ }
472
+
473
+ return {
474
+ score: 1.0,
475
+ message: `${checkCount} inbox check(s) for ${activityCount} worker events`,
476
+ };
477
+ } catch (error) {
478
+ return {
479
+ score: 0,
480
+ message: `Failed to parse CoordinatorSession: ${error}`,
481
+ };
482
+ }
483
+ },
484
+ });
485
+
486
+ /**
487
+ * Blocker Response Time Scorer
488
+ *
489
+ * Measures how quickly coordinator responds to blocked workers.
490
+ * Time between blocker_detected (OUTCOME) and blocker_resolved (DECISION).
491
+ *
492
+ * Normalization:
493
+ * - < 5min: 1.0 (excellent)
494
+ * - 5-15min: linear decay from 1.0 to 0.0
495
+ * - > 15min: 0.0 (too slow, worker is idle)
496
+ *
497
+ * Score: normalized average response time across blockers that have a matching resolution (by subtask_id)
498
+ */
499
+ export const blockerResponseTime = createScorer({
500
+ name: "Blocker Response Time",
501
+ description: "Coordinator unblocked workers quickly",
502
+ scorer: ({ output }) => {
503
+ try {
504
+ const session = JSON.parse(String(output)) as CoordinatorSession;
505
+
506
+ // Find blocker_detected events
507
+ const blockers = session.events.filter(
508
+ (e) =>
509
+ e.event_type === "OUTCOME" && e.outcome_type === "blocker_detected"
510
+ );
511
+
512
+ if (blockers.length === 0) {
513
+ return {
514
+ score: 1.0,
515
+ message: "No blockers detected",
516
+ };
517
+ }
518
+
519
+ // Find blocker_resolved events
520
+ const resolutions = session.events.filter(
521
+ (e) =>
522
+ e.event_type === "DECISION" && e.decision_type === "blocker_resolved"
523
+ );
524
+
525
+ if (resolutions.length === 0) {
526
+ return {
527
+ score: 0.0,
528
+ message: `${blockers.length} blocker(s) detected, 0 resolved (workers still blocked)`,
529
+ };
530
+ }
531
+
532
+ // Match blockers to resolutions by subtask_id and calculate response times
533
+ const responseTimes: number[] = [];
534
+ for (const blocker of blockers) {
535
+ const subtaskId = (blocker.payload as any).subtask_id;
536
+ const blockerTime = new Date(blocker.timestamp).getTime();
537
+
538
+ // Find resolution for this subtask
539
+ const resolution = resolutions.find(
540
+ (r) => (r.payload as any).subtask_id === subtaskId
541
+ );
542
+
543
+ if (resolution) {
544
+ const resolutionTime = new Date(resolution.timestamp).getTime();
545
+ const deltaMs = resolutionTime - blockerTime;
546
+ responseTimes.push(deltaMs);
547
+ }
548
+ }
549
+
550
+ if (responseTimes.length === 0) {
551
+ return {
552
+ score: 0.5,
553
+ message: `${blockers.length} blocker(s) detected, ${resolutions.length} resolution(s), but no matches by subtask_id`,
554
+ };
555
+ }
556
+
557
+ // Calculate average response time
558
+ const avgResponseMs =
559
+ responseTimes.reduce((sum, t) => sum + t, 0) / responseTimes.length;
560
+
561
+ // Normalize: < 5min = 1.0, > 15min = 0.0, linear in between
562
+ const EXCELLENT_MS = 5 * 60 * 1000; // 5 min
563
+ const POOR_MS = 15 * 60 * 1000; // 15 min
564
+
565
+ let score: number;
566
+ if (avgResponseMs < EXCELLENT_MS) {
567
+ score = 1.0;
568
+ } else if (avgResponseMs > POOR_MS) {
569
+ score = 0.0;
570
+ } else {
571
+ // Linear decay from 1.0 to 0.0
572
+ score = 1.0 - (avgResponseMs - EXCELLENT_MS) / (POOR_MS - EXCELLENT_MS);
573
+ }
574
+
575
+ const avgMinutes = Math.round(avgResponseMs / 1000 / 60);
576
+
577
+ return {
578
+ score,
579
+ message: `Avg response time: ${avgMinutes}min (${responseTimes.length}/${blockers.length} blockers resolved)`,
580
+ };
581
+ } catch (error) {
582
+ return {
583
+ score: 0,
584
+ message: `Failed to parse CoordinatorSession: ${error}`,
585
+ };
586
+ }
587
+ },
588
+ });
589
+
257
590
  /**
258
591
  * Overall Discipline Scorer
259
592
  *
@@ -270,14 +603,14 @@ export const timeToFirstSpawn = createScorer({
270
603
  export const overallDiscipline = createScorer({
271
604
  name: "Overall Coordinator Discipline",
272
605
  description: "Composite score for coordinator protocol adherence",
273
- scorer: ({ output, expected }) => {
606
+ scorer: async ({ output, expected, input }) => {
274
607
  try {
275
608
  // Run all scorers
276
609
  const scores = {
277
- violations: violationCount.scorer({ output, expected }),
278
- spawn: spawnEfficiency.scorer({ output, expected }),
279
- review: reviewThoroughness.scorer({ output, expected }),
280
- speed: timeToFirstSpawn.scorer({ output, expected }),
610
+ violations: await violationCount({ output, expected, input }),
611
+ spawn: await spawnEfficiency({ output, expected, input }),
612
+ review: await reviewThoroughness({ output, expected, input }),
613
+ speed: await timeToFirstSpawn({ output, expected, input }),
281
614
  };
282
615
 
283
616
  // Weighted average
@@ -289,16 +622,16 @@ export const overallDiscipline = createScorer({
289
622
  };
290
623
 
291
624
  const totalScore =
292
- scores.violations.score * weights.violations +
293
- scores.spawn.score * weights.spawn +
294
- scores.review.score * weights.review +
295
- scores.speed.score * weights.speed;
625
+ (scores.violations.score ?? 0) * weights.violations +
626
+ (scores.spawn.score ?? 0) * weights.spawn +
627
+ (scores.review.score ?? 0) * weights.review +
628
+ (scores.speed.score ?? 0) * weights.speed;
296
629
 
297
630
  const details = [
298
- `Violations: ${(scores.violations.score * 100).toFixed(0)}%`,
299
- `Spawn: ${(scores.spawn.score * 100).toFixed(0)}%`,
300
- `Review: ${(scores.review.score * 100).toFixed(0)}%`,
301
- `Speed: ${(scores.speed.score * 100).toFixed(0)}%`,
631
+ `Violations: ${((scores.violations.score ?? 0) * 100).toFixed(0)}%`,
632
+ `Spawn: ${((scores.spawn.score ?? 0) * 100).toFixed(0)}%`,
633
+ `Review: ${((scores.review.score ?? 0) * 100).toFixed(0)}%`,
634
+ `Speed: ${((scores.speed.score ?? 0) * 100).toFixed(0)}%`,
302
635
  ].join(", ");
303
636
 
304
637
  return {