groove-dev 0.27.115 → 0.27.117

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/TRAINING_DATA_v4.md +6 -3
  2. package/moe-training/client/domain-tagger.js +20 -0
  3. package/moe-training/client/trajectory-capture.js +36 -7
  4. package/moe-training/test/client/trajectory-capture.test.js +182 -1
  5. package/node_modules/@groove-dev/cli/package.json +1 -1
  6. package/node_modules/@groove-dev/cli/src/commands/team.js +27 -12
  7. package/node_modules/@groove-dev/daemon/package.json +1 -1
  8. package/node_modules/@groove-dev/daemon/src/api.js +3 -2
  9. package/node_modules/@groove-dev/daemon/src/process.js +283 -211
  10. package/node_modules/@groove-dev/daemon/src/teams.js +53 -24
  11. package/node_modules/@groove-dev/daemon/src/tunnel-manager.js +21 -3
  12. package/node_modules/@groove-dev/gui/dist/assets/{index-D4Q72afD.css → index-DdN9RVnC.css} +1 -1
  13. package/node_modules/@groove-dev/gui/dist/assets/{index-BKCiOUDb.js → index-fq--PD7_.js} +1724 -1724
  14. package/node_modules/@groove-dev/gui/dist/index.html +2 -2
  15. package/node_modules/@groove-dev/gui/package.json +1 -1
  16. package/node_modules/@groove-dev/gui/src/components/teams/team-removal-dialog.jsx +156 -0
  17. package/node_modules/@groove-dev/gui/src/stores/groove.js +15 -4
  18. package/node_modules/@groove-dev/gui/src/views/agents.jsx +10 -19
  19. package/node_modules/@groove-dev/gui/src/views/teams.jsx +17 -41
  20. package/node_modules/moe-training/client/domain-tagger.js +20 -0
  21. package/node_modules/moe-training/client/trajectory-capture.js +36 -7
  22. package/node_modules/moe-training/test/client/trajectory-capture.test.js +182 -1
  23. package/package.json +1 -1
  24. package/packages/cli/package.json +1 -1
  25. package/packages/cli/src/commands/team.js +27 -12
  26. package/packages/daemon/package.json +1 -1
  27. package/packages/daemon/src/api.js +3 -2
  28. package/packages/daemon/src/process.js +283 -211
  29. package/packages/daemon/src/teams.js +53 -24
  30. package/packages/daemon/src/tunnel-manager.js +21 -3
  31. package/packages/gui/dist/assets/{index-D4Q72afD.css → index-DdN9RVnC.css} +1 -1
  32. package/packages/gui/dist/assets/{index-BKCiOUDb.js → index-fq--PD7_.js} +1724 -1724
  33. package/packages/gui/dist/index.html +2 -2
  34. package/packages/gui/package.json +1 -1
  35. package/packages/gui/src/components/teams/team-removal-dialog.jsx +156 -0
  36. package/packages/gui/src/stores/groove.js +15 -4
  37. package/packages/gui/src/views/agents.jsx +10 -19
  38. package/packages/gui/src/views/teams.jsx +17 -41
@@ -329,6 +329,8 @@ export class ProcessManager {
329
329
  this._streamThrottle = new Map(); // agentId -> { timer, pending }
330
330
  this._rotatingAgents = new Set(); // agentIds currently being rotated (rotator wrote handoff)
331
331
  this._stalledAgents = new Set(); // agentIds already flagged as stalled (avoids duplicate broadcasts)
332
+ this._exitHandled = new Set();
333
+ this._resultReceived = new Set();
332
334
 
333
335
  this._stallWatchdog = setInterval(() => this._checkStalls(), STALL_CHECK_INTERVAL_MS);
334
336
  if (this._stallWatchdog.unref) this._stallWatchdog.unref();
@@ -366,6 +368,251 @@ export class ProcessManager {
366
368
  });
367
369
  console.warn(`[Groove] Agent ${agent.name} (${agentId}) silent for ${Math.round(silentMs / 1000)}s — possible stalled API stream`);
368
370
  }
371
+
372
+ // Defense in depth: detect zombie handles where PID is no longer alive
373
+ const ZOMBIE_THRESHOLD_MS = 10 * 60_000;
374
+ for (const [agentId, handle] of this.handles.entries()) {
375
+ const agent = registry.get(agentId);
376
+ if (!agent) continue;
377
+ const lastActivity = agent.lastActivity ? new Date(agent.lastActivity).getTime() : now;
378
+ if (now - lastActivity < ZOMBIE_THRESHOLD_MS) continue;
379
+ const pid = handle.proc?.pid;
380
+ if (!pid) continue;
381
+ try {
382
+ process.kill(pid, 0);
383
+ } catch {
384
+ console.warn(`[Groove] Agent ${agent.name} (${agentId}) PID ${pid} no longer alive — force-cleaning handle`);
385
+ if (handle.logStream && !handle.logStream.destroyed) {
386
+ handle.logStream.write(`[${new Date().toISOString()}] Force-cleaned: PID ${pid} no longer alive\n`);
387
+ handle.logStream.end();
388
+ }
389
+ this.handles.delete(agentId);
390
+ this._exitHandled.add(agentId);
391
+ setTimeout(() => this._exitHandled.delete(agentId), 30_000);
392
+ this._stalledAgents.delete(agentId);
393
+ this._resultReceived.delete(agentId);
394
+ const throttle = this._streamThrottle.get(agentId);
395
+ if (throttle?.timer) clearTimeout(throttle.timer);
396
+ this._streamThrottle.delete(agentId);
397
+ this.peakContextUsage.delete(agentId);
398
+ this.pendingMessages.delete(agentId);
399
+ if (this.daemon.locks) this.daemon.locks.release(agentId);
400
+ registry.update(agentId, { status: 'completed', pid: null });
401
+ this.daemon.broadcast({ type: 'agent:exit', agentId, code: 0, signal: null, status: 'completed' });
402
+ }
403
+ }
404
+ }
405
+
406
+ _handleProcessExit(agent, code, signal, logStream, stderrBuf, logPath) {
407
+ if (this._exitHandled.has(agent.id)) return;
408
+ this._exitHandled.add(agent.id);
409
+ setTimeout(() => this._exitHandled.delete(agent.id), 30_000);
410
+
411
+ const { registry } = this.daemon;
412
+
413
+ if (!logStream.destroyed) {
414
+ logStream.write(`[${new Date().toISOString()}] Process exited: code=${code} signal=${signal}\n`);
415
+ logStream.end();
416
+ }
417
+
418
+ this.handles.delete(agent.id);
419
+
420
+ const throttle = this._streamThrottle.get(agent.id);
421
+ if (throttle?.timer) clearTimeout(throttle.timer);
422
+ this._streamThrottle.delete(agent.id);
423
+
424
+ this.peakContextUsage.delete(agent.id);
425
+ this.pendingMessages.delete(agent.id);
426
+ this._stalledAgents.delete(agent.id);
427
+
428
+ if (this.daemon.locks) this.daemon.locks.release(agent.id);
429
+
430
+ const hadResult = this._resultReceived.has(agent.id);
431
+ this._resultReceived.delete(agent.id);
432
+
433
+ const finalStatus = hadResult
434
+ ? 'completed'
435
+ : signal === 'SIGTERM' || signal === 'SIGKILL'
436
+ ? 'killed'
437
+ : code === 0
438
+ ? 'completed'
439
+ : 'crashed';
440
+
441
+ const crashError = finalStatus === 'crashed' ? stderrBuf.join('').trim().slice(-500) : null;
442
+
443
+ registry.update(agent.id, { status: finalStatus, pid: null });
444
+
445
+ if (this.daemon.timeline) {
446
+ const agentData = registry.get(agent.id);
447
+ this.daemon.timeline.recordEvent(finalStatus === 'completed' ? 'complete' : finalStatus === 'crashed' ? 'crash' : 'kill', {
448
+ agentId: agent.id, agentName: agent.name, role: agent.role,
449
+ finalTokens: agentData?.tokensUsed || 0, costUsd: agentData?.costUsd || 0,
450
+ exitCode: code,
451
+ });
452
+ }
453
+
454
+ if (this.daemon.trajectoryCapture) {
455
+ try {
456
+ if (finalStatus === 'completed') {
457
+ this.daemon.trajectoryCapture.onAgentComplete(agent.id, {
458
+ status: 'SUCCESS', exit_code: code, signal,
459
+ });
460
+ } else {
461
+ this.daemon.trajectoryCapture.onAgentCrash(agent.id,
462
+ signal ? 'Killed by signal ' + signal : 'Exit code ' + code
463
+ );
464
+ }
465
+ const count = (this.daemon.state.get('training_sessions_captured') || 0) + 1;
466
+ this.daemon.state.set('training_sessions_captured', count);
467
+ } catch (e) { /* fail silent */ }
468
+ }
469
+
470
+ this.daemon.broadcast({
471
+ type: 'agent:exit',
472
+ agentId: agent.id,
473
+ code,
474
+ signal,
475
+ status: finalStatus,
476
+ error: crashError || undefined,
477
+ });
478
+
479
+ if (this.daemon.integrations) {
480
+ this.daemon.integrations.refreshMcpJson();
481
+ }
482
+
483
+ if (finalStatus === 'completed' && agent.role === 'planner') {
484
+ this._extractRecommendedTeam(agent, logPath);
485
+ }
486
+
487
+ if (finalStatus === 'completed') {
488
+ const pending = this.consumePendingMessage(agent.id);
489
+ if (pending) {
490
+ const agentData = registry.get(agent.id);
491
+ if (agentData?.sessionId) {
492
+ this.resume(agent.id, pending.message).catch((err) => {
493
+ console.error(`[Groove] Auto-resume with queued message failed for ${agent.name}: ${err.message}`);
494
+ });
495
+ return;
496
+ }
497
+ }
498
+ }
499
+
500
+ if (finalStatus === 'completed' && this.daemon.journalist) {
501
+ const a = registry.get(agent.id);
502
+ const turns = a?.turns || 0;
503
+ const tok = a?.tokensUsed || 0;
504
+ if (turns > 1 || tok >= 100) {
505
+ this.daemon.journalist.requestSynthesis('completion');
506
+ }
507
+ }
508
+
509
+ this._checkPhase2(agent.id);
510
+
511
+ if (agent.teamId) {
512
+ this._checkPreviewReady(agent.teamId);
513
+ }
514
+
515
+ if (finalStatus === 'completed') {
516
+ const files = this.daemon.journalist?.getAgentFiles(agent) || [];
517
+ if (files.length > 0) this._triggerIdleQC(agent);
518
+ this._processHandoffs(agent);
519
+ if (this._rotatingAgents.has(agent.id)) {
520
+ this._rotatingAgents.delete(agent.id);
521
+ } else {
522
+ this._writeCompletionHandoff(agent).catch(err => console.error(`[Groove] Completion handoff failed for ${agent.name}:`, err.message));
523
+ }
524
+ }
525
+
526
+ if (this.daemon.memory && (finalStatus === 'completed' || finalStatus === 'crashed')) {
527
+ try {
528
+ const events = this.daemon.classifier?.agentWindows?.[agent.id] || [];
529
+ const signals = events.length >= 6
530
+ ? this.daemon.adaptive.extractSignals(events, agent.scope)
531
+ : null;
532
+ const score = signals ? this.daemon.adaptive.scoreSession(signals) : null;
533
+ const files = this.daemon.journalist?.getAgentFiles(agent) || [];
534
+ this.daemon.memory.updateSpecialization(agent.id, {
535
+ role: agent.role,
536
+ qualityScore: score,
537
+ filesTouched: files,
538
+ signals,
539
+ threshold: this.daemon.adaptive?.getThreshold(agent.provider, agent.role),
540
+ });
541
+ } catch { /* best-effort */ }
542
+ }
543
+ }
544
+
545
+ _handleResumeProcessExit(agent, code, signal, logStream) {
546
+ if (this._exitHandled.has(agent.id)) return;
547
+ this._exitHandled.add(agent.id);
548
+ setTimeout(() => this._exitHandled.delete(agent.id), 30_000);
549
+
550
+ const { registry } = this.daemon;
551
+
552
+ if (!logStream.destroyed) {
553
+ logStream.write(`[${new Date().toISOString()}] Process exited: code=${code} signal=${signal}\n`);
554
+ logStream.end();
555
+ }
556
+
557
+ this.handles.delete(agent.id);
558
+ this._stalledAgents.delete(agent.id);
559
+
560
+ if (this.daemon.locks) this.daemon.locks.release(agent.id);
561
+
562
+ const hadResult = this._resultReceived.has(agent.id);
563
+ this._resultReceived.delete(agent.id);
564
+
565
+ const finalStatus = hadResult ? 'completed' : signal === 'SIGTERM' || signal === 'SIGKILL' ? 'killed' : code === 0 ? 'completed' : 'crashed';
566
+ registry.update(agent.id, { status: finalStatus, pid: null });
567
+
568
+ if (this.daemon.trajectoryCapture) {
569
+ try {
570
+ if (finalStatus === 'completed') {
571
+ this.daemon.trajectoryCapture.onAgentComplete(agent.id, {
572
+ status: 'SUCCESS', exit_code: code, signal,
573
+ });
574
+ } else {
575
+ this.daemon.trajectoryCapture.onAgentCrash(agent.id,
576
+ signal ? 'Killed by signal ' + signal : 'Exit code ' + code
577
+ );
578
+ }
579
+ const count = (this.daemon.state.get('training_sessions_captured') || 0) + 1;
580
+ this.daemon.state.set('training_sessions_captured', count);
581
+ } catch (e) { /* fail silent */ }
582
+ }
583
+
584
+ this.daemon.broadcast({ type: 'agent:exit', agentId: agent.id, code, signal, status: finalStatus });
585
+ if (finalStatus === 'completed' && this.daemon.journalist) {
586
+ const a = registry.get(agent.id);
587
+ const turns = a?.turns || 0;
588
+ const tok = a?.tokensUsed || 0;
589
+ if (turns > 1 || tok >= 100) this.daemon.journalist.requestSynthesis('completion');
590
+ }
591
+
592
+ if (finalStatus === 'completed' && !this._rotatingAgents.has(agent.id)) {
593
+ this._writeCompletionHandoff(agent).catch(err =>
594
+ console.error(`[Groove] Completion handoff failed for ${agent.name}:`, err.message));
595
+ }
596
+ if (this._rotatingAgents.has(agent.id)) {
597
+ this._rotatingAgents.delete(agent.id);
598
+ }
599
+ if (this.daemon.memory && (finalStatus === 'completed' || finalStatus === 'crashed')) {
600
+ try {
601
+ const events = this.daemon.classifier?.agentWindows?.[agent.id] || [];
602
+ const signals = events.length >= 6
603
+ ? this.daemon.adaptive.extractSignals(events, agent.scope)
604
+ : null;
605
+ const score = signals ? this.daemon.adaptive.scoreSession(signals) : null;
606
+ const files = this.daemon.journalist?.getAgentFiles(agent) || [];
607
+ this.daemon.memory.updateSpecialization(agent.id, {
608
+ role: agent.role,
609
+ qualityScore: score,
610
+ filesTouched: files,
611
+ signals,
612
+ threshold: this.daemon.adaptive?.getThreshold(agent.provider, agent.role),
613
+ });
614
+ } catch { /* best-effort */ }
615
+ }
369
616
  }
370
617
 
371
618
  async spawn(config) {
@@ -490,7 +737,7 @@ export class ProcessManager {
490
737
  try {
491
738
  const teamSize = registry.getAll().filter(a => a.status === 'active' || a.status === 'running' || a.status === 'starting').length;
492
739
  this.daemon.trajectoryCapture.onAgentSpawn(
493
- agent.id, providerName, config.model || null, config.role, teamSize
740
+ agent.id, providerName, config.model || null, config.role, teamSize, config.prompt
494
741
  ).catch(() => {});
495
742
  } catch (e) { /* fail silent */ }
496
743
  }
@@ -732,6 +979,8 @@ For normal file edits within your scope, proceed without review.
732
979
  logStream.write(`[${new Date().toISOString()}] Agent loop exited: status=${status}\n`);
733
980
  logStream.end();
734
981
  this.handles.delete(agent.id);
982
+ this._stalledAgents.delete(agent.id);
983
+ this._resultReceived.delete(agent.id);
735
984
 
736
985
  // Clean up stream throttle so pending timers don't fire for dead agents
737
986
  const throttle = this._streamThrottle.get(agent.id);
@@ -775,8 +1024,9 @@ For normal file edits within your scope, proceed without review.
775
1024
  this.daemon.broadcast({ type: 'agent:exit', agentId: agent.id, code: code || 0, signal, status });
776
1025
  if (this.daemon.integrations) this.daemon.integrations.refreshMcpJson();
777
1026
  if (status === 'completed' && this.daemon.journalist) {
778
- const turns = agentData?.turns || 0;
779
- const tok = agentData?.tokensUsed || 0;
1027
+ const a = registry.get(agent.id);
1028
+ const turns = a?.turns || 0;
1029
+ const tok = a?.tokensUsed || 0;
780
1030
  if (turns > 1 || tok >= 100) this.daemon.journalist.requestSynthesis('completion');
781
1031
  }
782
1032
  this._checkPhase2(agent.id);
@@ -862,6 +1112,7 @@ For normal file edits within your scope, proceed without review.
862
1112
  if (!logStream.destroyed) logStream.write(`[${new Date().toISOString()}] Spawn error: ${err.message}\n`);
863
1113
  if (!logStream.destroyed) logStream.end();
864
1114
  this.handles.delete(agent.id);
1115
+ this._exitHandled.add(agent.id);
865
1116
  registry.update(agent.id, { status: 'crashed', pid: null });
866
1117
  this.daemon.broadcast({ type: 'agent:exit', agentId: agent.id, code: null, signal: null, status: 'crashed', error: err.message });
867
1118
  });
@@ -906,154 +1157,13 @@ For normal file edits within your scope, proceed without review.
906
1157
  while (stderrBuf.join('').length > 2048) stderrBuf.shift();
907
1158
  });
908
1159
 
909
- // Handle process exit
1160
+ // Handle process exit — cleanup extracted to _handleProcessExit with dedup
910
1161
  proc.on('exit', (code, signal) => {
911
- const exitLine = `[${new Date().toISOString()}] Process exited: code=${code} signal=${signal}\n`;
912
- logStream.write(exitLine);
913
- logStream.end();
914
-
915
- this.handles.delete(agent.id);
916
-
917
- // Clean up stream throttle so pending timers don't fire for dead agents
918
- const throttle = this._streamThrottle.get(agent.id);
919
- if (throttle?.timer) clearTimeout(throttle.timer);
920
- this._streamThrottle.delete(agent.id);
921
-
922
- // Clean up per-agent maps to prevent unbounded growth in long sessions
923
- this.peakContextUsage.delete(agent.id);
924
- this.pendingMessages.delete(agent.id);
925
- this._stalledAgents.delete(agent.id);
926
-
927
- // Release file-scope locks so they don't persist after agent death
928
- if (this.daemon.locks) this.daemon.locks.release(agent.id);
929
-
930
- const finalStatus = signal === 'SIGTERM' || signal === 'SIGKILL'
931
- ? 'killed'
932
- : code === 0
933
- ? 'completed'
934
- : 'crashed';
935
-
936
- // Capture crash error from stderr for UI display
937
- const crashError = finalStatus === 'crashed' ? stderrBuf.join('').trim().slice(-500) : null;
938
-
939
- registry.update(agent.id, { status: finalStatus, pid: null });
940
-
941
- // Record lifecycle event for timeline
942
- if (this.daemon.timeline) {
943
- const agentData = registry.get(agent.id);
944
- this.daemon.timeline.recordEvent(finalStatus === 'completed' ? 'complete' : finalStatus === 'crashed' ? 'crash' : 'kill', {
945
- agentId: agent.id, agentName: agent.name, role: agent.role,
946
- finalTokens: agentData?.tokensUsed || 0, costUsd: agentData?.costUsd || 0,
947
- exitCode: code,
948
- });
949
- }
950
-
951
- if (this.daemon.trajectoryCapture) {
952
- try {
953
- if (finalStatus === 'completed') {
954
- this.daemon.trajectoryCapture.onAgentComplete(agent.id, {
955
- status: 'SUCCESS', exit_code: code, signal,
956
- });
957
- } else {
958
- this.daemon.trajectoryCapture.onAgentCrash(agent.id,
959
- signal ? 'Killed by signal ' + signal : 'Exit code ' + code
960
- );
961
- }
962
- const count = (this.daemon.state.get('training_sessions_captured') || 0) + 1;
963
- this.daemon.state.set('training_sessions_captured', count);
964
- } catch (e) { /* fail silent */ }
965
- }
966
-
967
- this.daemon.broadcast({
968
- type: 'agent:exit',
969
- agentId: agent.id,
970
- code,
971
- signal,
972
- status: finalStatus,
973
- error: crashError || undefined,
974
- });
975
-
976
- // Refresh MCP config — remove integrations no longer needed by running agents
977
- if (this.daemon.integrations) {
978
- this.daemon.integrations.refreshMcpJson();
979
- }
980
-
981
- // Extract recommended-team.json from planner text output if it wasn't written to disk.
982
- // Non-Claude providers (Codex, Gemini) may embed the JSON in text rather than using Write.
983
- if (finalStatus === 'completed' && agent.role === 'planner') {
984
- this._extractRecommendedTeam(agent, logPath);
985
- }
986
-
987
- // Auto-resume with queued message: if the user sent a message while this
988
- // CLI agent was still running, resume the session now that it's done.
989
- if (finalStatus === 'completed') {
990
- const pending = this.consumePendingMessage(agent.id);
991
- if (pending) {
992
- const agentData = registry.get(agent.id);
993
- if (agentData?.sessionId) {
994
- this.resume(agent.id, pending.message).catch((err) => {
995
- console.error(`[Groove] Auto-resume with queued message failed for ${agent.name}: ${err.message}`);
996
- });
997
- return;
998
- }
999
- }
1000
- }
1001
-
1002
- // Trigger journalist synthesis on completion (event-driven, debounced).
1003
- // Skip trivial sessions — a greeting-only completion (user never gave a task)
1004
- // has nothing worth synthesizing and wastes a $0.04+ headless claude call.
1005
- if (finalStatus === 'completed' && this.daemon.journalist) {
1006
- const a = registry.get(agent.id);
1007
- const turns = a?.turns || 0;
1008
- const tok = a?.tokensUsed || 0;
1009
- if (turns > 1 || tok >= 100) {
1010
- this.daemon.journalist.requestSynthesis('completion');
1011
- }
1012
- }
1013
-
1014
- // Phase 2 auto-spawn: check if all phase 1 agents for a team are done
1015
- this._checkPhase2(agent.id);
1016
-
1017
- // Preview launch: when every agent in this team is in a terminal state,
1018
- // kick off the one-click preview (dev server or static serve) the planner
1019
- // staged in the team plan. Fires once per team launch.
1020
- // Fire on any terminal status so crashed QC agents don't block preview
1021
- // when builders completed successfully.
1022
- if (agent.teamId) {
1023
- this._checkPreviewReady(agent.teamId);
1024
- }
1025
-
1026
- // Auto-trigger idle QC: if this agent modified files and there's an idle QC
1027
- // in the same team, activate it to verify the changes
1028
- if (finalStatus === 'completed') {
1029
- const files = this.daemon.journalist?.getAgentFiles(agent) || [];
1030
- if (files.length > 0) this._triggerIdleQC(agent);
1031
- this._processHandoffs(agent);
1032
- if (this._rotatingAgents.has(agent.id)) {
1033
- this._rotatingAgents.delete(agent.id);
1034
- } else {
1035
- this._writeCompletionHandoff(agent).catch(err => console.error(`[Groove] Completion handoff failed for ${agent.name}:`, err.message));
1036
- }
1037
- }
1162
+ this._handleProcessExit(agent, code, signal, logStream, stderrBuf, logPath);
1163
+ });
1038
1164
 
1039
- // Update Layer 7 specialization profile for this agent's session
1040
- if (this.daemon.memory && (finalStatus === 'completed' || finalStatus === 'crashed')) {
1041
- try {
1042
- const events = this.daemon.classifier?.agentWindows?.[agent.id] || [];
1043
- const signals = events.length >= 6
1044
- ? this.daemon.adaptive.extractSignals(events, agent.scope)
1045
- : null;
1046
- const score = signals ? this.daemon.adaptive.scoreSession(signals) : null;
1047
- const files = this.daemon.journalist?.getAgentFiles(agent) || [];
1048
- this.daemon.memory.updateSpecialization(agent.id, {
1049
- role: agent.role,
1050
- qualityScore: score,
1051
- filesTouched: files,
1052
- signals,
1053
- threshold: this.daemon.adaptive?.getThreshold(agent.provider, agent.role),
1054
- });
1055
- } catch { /* best-effort */ }
1056
- }
1165
+ proc.on('close', (code, signal) => {
1166
+ this._handleProcessExit(agent, code, signal, logStream, stderrBuf, logPath);
1057
1167
  });
1058
1168
 
1059
1169
  proc.on('error', (err) => {
@@ -1061,6 +1171,7 @@ For normal file edits within your scope, proceed without review.
1061
1171
  logStream.end();
1062
1172
 
1063
1173
  this.handles.delete(agent.id);
1174
+ this._exitHandled.add(agent.id);
1064
1175
  if (this.daemon.locks) this.daemon.locks.release(agent.id);
1065
1176
  registry.update(agent.id, { status: 'crashed', pid: null });
1066
1177
  this.daemon.broadcast({
@@ -1162,6 +1273,21 @@ For normal file edits within your scope, proceed without review.
1162
1273
  if (output.cost) updates.costUsd = (agent.costUsd || 0) + output.cost;
1163
1274
  if (output.duration) updates.durationMs = output.duration;
1164
1275
  if (output.turns) updates.turns = output.turns;
1276
+
1277
+ // Claude Code sometimes hangs after emitting the result event — the
1278
+ // process stays alive instead of exiting. Record that the result
1279
+ // arrived so exit handlers know this was a successful completion even
1280
+ // if we have to SIGTERM the process. After a 5s grace period, force-
1281
+ // kill any process that hasn't exited on its own.
1282
+ this._resultReceived.add(agentId);
1283
+ const handle = this.handles.get(agentId);
1284
+ if (handle?.proc && typeof handle.proc.kill === 'function') {
1285
+ setTimeout(() => {
1286
+ if (this.handles.has(agentId) && this._resultReceived.has(agentId)) {
1287
+ try { handle.proc.kill('SIGTERM'); } catch {}
1288
+ }
1289
+ }, 5_000);
1290
+ }
1165
1291
  }
1166
1292
 
1167
1293
  // Context window usage (0-1 scale) — drives rotation threshold
@@ -1745,7 +1871,7 @@ For normal file edits within your scope, proceed without review.
1745
1871
  try {
1746
1872
  const teamSize = registry.getAll().filter(a => a.status === 'active' || a.status === 'running' || a.status === 'starting').length;
1747
1873
  this.daemon.trajectoryCapture.onAgentSpawn(
1748
- newAgent.id, config.provider, config.model || null, config.role, teamSize
1874
+ newAgent.id, config.provider, config.model || null, config.role, teamSize, config.prompt
1749
1875
  ).catch(() => {});
1750
1876
  } catch (e) { /* fail silent */ }
1751
1877
  }
@@ -1763,6 +1889,7 @@ For normal file edits within your scope, proceed without review.
1763
1889
  if (!logStream.destroyed) logStream.write(`[${new Date().toISOString()}] Resume spawn error: ${err.message}\n`);
1764
1890
  if (!logStream.destroyed) logStream.end();
1765
1891
  this.handles.delete(newAgent.id);
1892
+ this._exitHandled.add(newAgent.id);
1766
1893
  registry.update(newAgent.id, { status: 'crashed', pid: null });
1767
1894
  this.daemon.broadcast({ type: 'agent:exit', agentId: newAgent.id, code: null, signal: null, status: 'crashed', error: err.message });
1768
1895
  });
@@ -1795,73 +1922,18 @@ For normal file edits within your scope, proceed without review.
1795
1922
  proc.stderr.on('data', (chunk) => { logStream.write(`[stderr] ${chunk}`); });
1796
1923
 
1797
1924
  proc.on('exit', (code, signal) => {
1798
- logStream.write(`[${new Date().toISOString()}] Process exited: code=${code} signal=${signal}\n`);
1799
- logStream.end();
1800
- this.handles.delete(newAgent.id);
1801
- this._stalledAgents.delete(newAgent.id);
1802
-
1803
- // Release file-scope locks so they don't persist after agent death
1804
- if (this.daemon.locks) this.daemon.locks.release(newAgent.id);
1805
-
1806
- const finalStatus = signal === 'SIGTERM' || signal === 'SIGKILL' ? 'killed' : code === 0 ? 'completed' : 'crashed';
1807
- registry.update(newAgent.id, { status: finalStatus, pid: null });
1808
-
1809
- if (this.daemon.trajectoryCapture) {
1810
- try {
1811
- if (finalStatus === 'completed') {
1812
- this.daemon.trajectoryCapture.onAgentComplete(newAgent.id, {
1813
- status: 'SUCCESS', exit_code: code, signal,
1814
- });
1815
- } else {
1816
- this.daemon.trajectoryCapture.onAgentCrash(newAgent.id,
1817
- signal ? 'Killed by signal ' + signal : 'Exit code ' + code
1818
- );
1819
- }
1820
- const count = (this.daemon.state.get('training_sessions_captured') || 0) + 1;
1821
- this.daemon.state.set('training_sessions_captured', count);
1822
- } catch (e) { /* fail silent */ }
1823
- }
1824
-
1825
- this.daemon.broadcast({ type: 'agent:exit', agentId: newAgent.id, code, signal, status: finalStatus });
1826
- if (finalStatus === 'completed' && this.daemon.journalist) {
1827
- const a = registry.get(newAgent.id);
1828
- const turns = a?.turns || 0;
1829
- const tok = a?.tokensUsed || 0;
1830
- if (turns > 1 || tok >= 100) this.daemon.journalist.requestSynthesis('completion');
1831
- }
1925
+ this._handleResumeProcessExit(newAgent, code, signal, logStream);
1926
+ });
1832
1927
 
1833
- // Persist Layer 7 state for resumed-session completions too, not just fresh spawns.
1834
- // Without this, every resume after the first loses its work from the handoff chain.
1835
- if (finalStatus === 'completed' && !this._rotatingAgents.has(newAgent.id)) {
1836
- this._writeCompletionHandoff(newAgent).catch(err =>
1837
- console.error(`[Groove] Completion handoff failed for ${newAgent.name}:`, err.message));
1838
- }
1839
- if (this._rotatingAgents.has(newAgent.id)) {
1840
- this._rotatingAgents.delete(newAgent.id);
1841
- }
1842
- if (this.daemon.memory && (finalStatus === 'completed' || finalStatus === 'crashed')) {
1843
- try {
1844
- const events = this.daemon.classifier?.agentWindows?.[newAgent.id] || [];
1845
- const signals = events.length >= 6
1846
- ? this.daemon.adaptive.extractSignals(events, newAgent.scope)
1847
- : null;
1848
- const score = signals ? this.daemon.adaptive.scoreSession(signals) : null;
1849
- const files = this.daemon.journalist?.getAgentFiles(newAgent) || [];
1850
- this.daemon.memory.updateSpecialization(newAgent.id, {
1851
- role: newAgent.role,
1852
- qualityScore: score,
1853
- filesTouched: files,
1854
- signals,
1855
- threshold: this.daemon.adaptive?.getThreshold(newAgent.provider, newAgent.role),
1856
- });
1857
- } catch { /* best-effort */ }
1858
- }
1928
+ proc.on('close', (code, signal) => {
1929
+ this._handleResumeProcessExit(newAgent, code, signal, logStream);
1859
1930
  });
1860
1931
 
1861
1932
  proc.on('error', (err) => {
1862
1933
  logStream.write(`[error] ${err.message}\n`);
1863
1934
  logStream.end();
1864
1935
  this.handles.delete(newAgent.id);
1936
+ this._exitHandled.add(newAgent.id);
1865
1937
  this._stalledAgents.delete(newAgent.id);
1866
1938
  registry.update(newAgent.id, { status: 'crashed', pid: null });
1867
1939
  });
@@ -1941,7 +2013,7 @@ For normal file edits within your scope, proceed without review.
1941
2013
  try {
1942
2014
  const teamSize = registry.getAll().filter(a => a.status === 'active' || a.status === 'running' || a.status === 'starting').length;
1943
2015
  this.daemon.trajectoryCapture.onAgentSpawn(
1944
- newAgent.id, config.provider, loopConfig.model || config.model || null, config.role, teamSize
2016
+ newAgent.id, config.provider, loopConfig.model || config.model || null, config.role, teamSize, config.prompt
1945
2017
  ).catch(() => {});
1946
2018
  } catch (e) { /* fail silent */ }
1947
2019
  }