teleportation-cli 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@ import { ingestTranscriptToTimeline } from './transcript-ingestion.js';
18
18
  const CLAUDE_CLI = process.env.CLAUDE_CLI_PATH || 'claude';
19
19
  const DEFAULT_TIMEOUT_MS = 600000; // 10 minutes per turn
20
20
  const MAX_TURNS = 100;
21
+ const UUID_PATTERN = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
21
22
 
22
23
  // Track running processes for stop functionality (only active processes)
23
24
  const runningProcesses = new Map();
@@ -96,21 +97,48 @@ export async function executeTaskTurn(options) {
96
97
  }
97
98
 
98
99
  // 6. Determine prompt
99
- const prompt = getNextPrompt(state, task);
100
+ let prompt = getNextPrompt(state, task);
100
101
  if (!prompt) {
101
102
  return { success: false, error: 'No prompt available' };
102
103
  }
103
104
 
104
105
  // 7. Determine which session to resume
105
- // First turn: start fresh (don't resume parent it's actively in use)
106
- // Subsequent turns: resume child session from previous turn
107
- const resumeSessionId = state.claude_session_id || null;
106
+ // Turn 1: resume parent session so Claude has full conversation context
107
+ // Turn 2+: resume child session from previous turn for continuity
108
+ // Safety: relay guarantees parent session is idle when this task was created (mobile Send
109
+ // flow only fires when the user is waiting for a response). Claude Code branches on --resume
110
+ // so the parent transcript is never mutated — only the child session ID is written back.
111
+ const validParentSessionId = UUID_PATTERN.test(task.parent_claude_session_id || '')
112
+ ? task.parent_claude_session_id
113
+ : null;
114
+ const resumeSessionId = state.claude_session_id // child session from previous turns
115
+ || validParentSessionId // parent session for turn 1 context
116
+ || null; // fallback: fresh session (tests / legacy tasks without relay validation)
108
117
 
109
118
  console.log(`[task-v2] Executing turn ${state.turn_count + 1} for task ${task_id.slice(0, 20)}...`);
110
- if (resumeSessionId) {
111
- console.log(`[task-v2] Resuming child session: ${resumeSessionId}`);
119
+ if (state.claude_session_id) {
120
+ console.log(`[task-v2] Resuming child session: ${state.claude_session_id}`);
121
+ } else if (validParentSessionId) {
122
+ console.log(`[task-v2] Turn 1: resuming parent session for context: ${validParentSessionId}`);
112
123
  } else {
113
- console.log(`[task-v2] Starting fresh session (first turn)`);
124
+ if (task.parent_claude_session_id && !validParentSessionId) {
125
+ console.warn(`[task-v2] parent_claude_session_id is not a valid UUID, starting fresh: ${task.parent_claude_session_id}`);
126
+ }
127
+ // Fresh session: inject caller context so Claude is never completely blind
128
+ const contextLines = [];
129
+ if (task.caller) contextLines.push(`Triggered by: ${task.caller}`);
130
+ if (task.cwd) contextLines.push(`Working directory: ${task.cwd}`);
131
+ if (task.project_name) contextLines.push(`Project: ${task.project_name}`);
132
+ if (task.branch) contextLines.push(`Branch: ${task.branch}`);
133
+ if (task.hostname) contextLines.push(`Host: ${task.hostname}`);
134
+
135
+ if (contextLines.length > 0) {
136
+ const preamble = `[Context]\n${contextLines.join('\n')}\n\n`;
137
+ prompt = preamble + prompt;
138
+ console.log(`[task-v2] Starting fresh session with context preamble (${contextLines.length} fields)`);
139
+ } else {
140
+ console.log(`[task-v2] Starting fresh session (no context metadata available)`);
141
+ }
114
142
  }
115
143
  console.log(`[task-v2] Prompt: ${prompt.slice(0, 100)}...`);
116
144
 
@@ -177,12 +205,13 @@ export async function executeTaskTurn(options) {
177
205
  },
178
206
  body: JSON.stringify({
179
207
  session_id,
180
- type: 'assistant_response',
208
+ type: 'task_update',
181
209
  data: {
182
210
  task_id,
183
211
  source: 'cli_interactive',
184
212
  claude_session_id: result.session_id,
185
- turn: state.turn_count + 1,
213
+ turn_number: state.turn_count + 1,
214
+ status: 'turn_complete',
186
215
  cost_usd: result.cost_usd,
187
216
  timestamp: Date.now(),
188
217
  message: extractAssistantText(result.output) || 'Turn completed',
@@ -39,10 +39,29 @@ import { fileURLToPath } from 'url';
39
39
  import { spawn, exec } from 'child_process';
40
40
  import { promisify } from 'util';
41
41
  import { homedir, tmpdir } from 'os';
42
- import { existsSync, appendFileSync } from 'fs';
43
- import { join } from 'path';
42
+ import { existsSync, appendFileSync, readFileSync, unlinkSync } from 'fs';
43
+ import { join, dirname } from 'path';
44
44
  // NOTE: PID locking is handled by agent-process at the platform level (launchd/systemd/pm2).
45
45
  // Signal handling and heartbeat management are handled inline below.
46
+
47
// File-based session registry — imported lazily so the daemon can start without it.
// REGISTRY_UNAVAILABLE is a frozen sentinel whose members are all null; it is what
// getRegistry() resolves to when the dynamic import fails. Callers guard with
// `if (!registry.readAllSessionFiles) return`, so they never call through a null member.
const REGISTRY_UNAVAILABLE = Object.freeze({ readAllSessionFiles: null, isClaudePidAlive: null });

// Cached module namespace. It is only assigned on a successful import, so after a
// failure it is still null and the next getRegistry() call retries the import.
let _registry = null;

/**
 * Lazily load the `session-file-registry.js` module located next to this file.
 *
 * The specifier is built as a file: URL relative to `import.meta.url` instead of a
 * filesystem path. Dynamic import() takes a URL-like specifier, and an absolute
 * Windows path (`C:\...\session-file-registry.js`) is rejected because the drive
 * letter is parsed as a URL scheme.
 *
 * @returns {Promise<object>} The imported module namespace, or REGISTRY_UNAVAILABLE
 *   when the import failed. The returned promise never rejects.
 */
async function getRegistry() {
  if (_registry) return _registry;
  try {
    const registryUrl = new URL('./session-file-registry.js', import.meta.url);
    _registry = await import(registryUrl.href);
  } catch (e) {
    console.warn('[daemon] session-file-registry not available (will retry next scan):', e.message);
    return REGISTRY_UNAVAILABLE;
  }
  return _registry;
}
46
65
  // The following were removed in PRD-0025 migration:
47
66
  // - pid-manager.js (replaced by agent-process platform locking)
48
67
  // - lifecycle.js (replaced by inline signal handlers)
@@ -87,6 +106,8 @@ const CLAUDE_CLI = process.env.CLAUDE_CLI_PATH || 'claude'; // Configurable Clau
87
106
  const ALLOW_ALL_COMMANDS = process.env.TELEPORTATION_DAEMON_ALLOW_ALL_COMMANDS === 'true';
88
107
  const HEARTBEAT_INTERVAL_MS = parseInt(process.env.DAEMON_HEARTBEAT_INTERVAL_MS || '30000', 10); // 30 sec default
89
108
  const HEARTBEAT_CHECK_INTERVAL_MS = parseInt(process.env.DAEMON_HEARTBEAT_CHECK_INTERVAL_MS || '60000', 10); // 1 min default
109
+ // How long a PID must be dead before the daemon marks the session stopped (ms)
110
+ const DEAD_PID_THRESHOLD_MS = parseInt(process.env.DAEMON_DEAD_PID_THRESHOLD_MS || '60000', 10); // 60s default
90
111
 
91
112
  // Message routing configuration
92
113
  // REQUIRE_COMMAND_WHITELIST: If true, use legacy shell execution with command whitelist
@@ -110,6 +131,29 @@ const ROUTER_MAX_ESCALATIONS = parseInt(process.env.TELEPORTATION_ROUTER_MAX_ESC
110
131
  // Debug logging configuration
111
132
  const DEBUG = process.env.TELEPORTATION_DEBUG === 'true';
112
133
  const LOG_DIR = process.env.TELEPORTATION_LOG_DIR || tmpdir();
134
+ const SESSION_LOG_FILE = join(homedir(), '.teleportation', 'session-events.log');
135
+
136
/**
 * Write one `register` event for a session to the session event log.
 *
 * The event is a single JSON object followed by a newline, appended synchronously
 * to SESSION_LOG_FILE. Its purpose (per the daemon's recovery design) is to let a
 * restarted daemon recover the session's metadata and re-register it.
 *
 * Failures are deliberately non-fatal: any error raised while building or writing
 * the entry is caught, and is only reported when DEBUG is enabled.
 *
 * @param {object} session - Session entry; reads session_id, claude_session_id,
 *   pid, cwd, meta and registered_at.
 * @returns {void}
 */
function appendSessionRegisterLog(session) {
  try {
    const { session_id, claude_session_id, pid, cwd, meta, registered_at } = session;
    const entry = {
      type: 'register',
      session_id,
      claude_session_id,
      // Falsy pid values are normalised to an explicit null in the log.
      pid: pid ? pid : null,
      cwd,
      meta,
      // Fall back to "now" when the entry carries no registration time.
      timestamp: registered_at ? registered_at : Date.now()
    };
    appendFileSync(SESSION_LOG_FILE, `${JSON.stringify(entry)}\n`);
  } catch (err) {
    if (!DEBUG) return;
    console.error(`[daemon] Failed to append session log: ${err.message}`);
  }
}
113
157
 
114
158
  /**
115
159
  * Cross-platform debug logging utility
@@ -243,6 +287,13 @@ const stoppedSessions = new Set();
243
287
  // Session activity tracking for cleanup
244
288
  const sessionActivity = new Map(); // sessionId -> lastActivityTimestamp
245
289
 
290
+ // PID liveness tracking (Bug 2):
291
+ // Caches the Claude PID per session to avoid re-reading the session file every poll cycle.
292
+ // lastPidCheck throttles the PID check to at most once every PID_CHECK_INTERVAL_MS per session.
293
+ const sessionPidCache = new Map(); // sessionId -> number (claude_pid)
294
+ const lastPidCheck = new Map(); // sessionId -> number (timestamp of last check)
295
+ const PID_CHECK_INTERVAL_MS = parseInt(process.env.DAEMON_PID_CHECK_INTERVAL_MS || '30000', 10); // 30s default
296
+
246
297
  // Transcript ingestion throttling: Prevents concurrent ingestion runs per session
247
298
  // Map<session_id, Promise> tracks in-progress ingestion promises
248
299
  // If ingestion takes >5 seconds, prevents stacking multiple concurrent calls
@@ -271,6 +322,8 @@ setInterval(() => {
271
322
  sessionActivity.delete(sessionId);
272
323
  heartbeatState.delete(sessionId);
273
324
  heartbeatFailureLogged.delete(sessionId);
325
+ sessionPidCache.delete(sessionId);
326
+ lastPidCheck.delete(sessionId);
274
327
  sessionCleanedCount++;
275
328
 
276
329
  if (process.env.DEBUG) {
@@ -788,7 +841,11 @@ async function handleRequest(req, res) {
788
841
  }
789
842
  }
790
843
 
791
- sessions.set(session_id, {
844
+ // Clear stale PID cache so the new session's PID is read fresh from marker file
845
+ sessionPidCache.delete(session_id);
846
+ lastPidCheck.delete(session_id);
847
+
848
+ const sessionEntry = {
792
849
  session_id,
793
850
  claude_session_id: claude_session_id || session_id, // Fallback to session_id if not provided
794
851
  cwd: cwd || process.cwd(),
@@ -797,7 +854,10 @@ async function handleRequest(req, res) {
797
854
  daemon_pid: process.pid // Add daemon PID to metadata
798
855
  },
799
856
  registered_at: Date.now()
800
- });
857
+ };
858
+ sessions.set(session_id, sessionEntry);
859
+ // Persist to log so daemon restarts can recover full meta for re-registration
860
+ appendSessionRegisterLog(sessionEntry);
801
861
 
802
862
  console.log(`[daemon] Session registered: ${session_id} (claude_id: ${claude_session_id || session_id}) (daemon_pid: ${process.pid}) (cwd: ${cwd || process.cwd()})`);
803
863
 
@@ -1528,6 +1588,100 @@ async function sendHeartbeat(session_id) {
1528
1588
  }
1529
1589
  }
1530
1590
 
1591
/**
 * Check if a Claude PID is still alive.
 * Uses signal 0 (no signal sent, just existence check).
 *
 * Only positive integers are probed. `process.kill` forwards the pid to the OS
 * kill call, where 0 and negative values address process groups rather than a
 * single process, so probing them would report "alive" for something that is not
 * the process being asked about. Non-integer values are rejected up front as well
 * instead of relying on the exception thrown by `process.kill`.
 *
 * CROSS-REFERENCE: pid-liveness.test.js is documented as holding an equivalent local
 * implementation. If this logic changes, update the test copy to match.
 * @param {number} pid - Process ID to check
 * @returns {boolean} true if a process with that PID exists, false otherwise
 */
function isPidAlive(pid) {
  if (!Number.isInteger(pid) || pid <= 0) {
    return false;
  }
  try {
    process.kill(pid, 0);
    return true;
  } catch (e) {
    // EPERM means process exists but we lack permission to signal it (different user/container)
    return e.code === 'EPERM';
  }
}
1608
+
1609
/**
 * Read Claude PID from session marker file.
 * Caches the PID per session to avoid reading the file on every poll cycle.
 *
 * The marker file is `teleportation-session-<session_id>.json` in the OS temp
 * directory and is expected to hold a JSON object with a `claude_pid` field.
 * Only positive integer PIDs are accepted and cached. A missing file, unparsable
 * JSON, or a malformed `claude_pid` (string, fraction, non-finite number, <= 0)
 * all yield null, and null results are not cached, so the file is re-read on the
 * next call.
 *
 * CROSS-REFERENCE: pid-liveness.test.js is documented as holding an equivalent local
 * implementation (without cache). If this file-reading logic changes, update the
 * test copy to match.
 * @param {string} session_id - Session ID
 * @returns {number|null} Claude PID or null if unavailable
 */
function getSessionPid(session_id) {
  // Check cache first
  if (sessionPidCache.has(session_id)) {
    return sessionPidCache.get(session_id);
  }

  // Read from marker file
  const sessionFile = join(tmpdir(), `teleportation-session-${session_id}.json`);
  try {
    const data = JSON.parse(readFileSync(sessionFile, 'utf8'));
    const pid = data.claude_pid;
    // Number.isInteger rejects fractions, NaN and Infinity, which a liveness probe
    // would otherwise misreport as a dead process.
    if (Number.isInteger(pid) && pid > 0) {
      sessionPidCache.set(session_id, pid);
      return pid;
    }
  } catch {
    // File doesn't exist or can't be read/parsed - return null (no PID available)
  }
  return null;
}
1638
+
1639
/**
 * Tear down all daemon-side state for a session whose Claude process has died.
 *
 * Steps, in order:
 *   1. Log the dead PID.
 *   2. Remove the session from every per-session tracking collection and add it to
 *      stoppedSessions.
 *   3. Delete the session's marker file from the OS temp directory.
 *   4. POST `/api/sessions/<id>/end` to the relay with reason `pid_dead`.
 *
 * Steps 3 and 4 are best-effort: their errors are swallowed on purpose so cleanup
 * always completes. The relay response is not inspected.
 *
 * @param {string} session_id - Session ID to clean up
 * @param {number|null} pid - The dead PID (used for logging only)
 * @returns {Promise<void>}
 */
async function cleanupDeadSession(session_id, pid) {
  logInfo(`[daemon] PID ${pid} for session ${session_id.slice(0, 8)}... is dead - cleaning up`);

  // Forget the session everywhere it is tracked.
  const trackers = [
    sessions,
    sessionActivity,
    heartbeatState,
    heartbeatFailureLogged,
    sessionPidCache,
    lastPidCheck,
    ingestionInProgress
  ];
  for (const tracker of trackers) {
    tracker.delete(session_id);
  }
  // Prevent re-activation from stale registration attempts
  stoppedSessions.add(session_id);

  // Remove the marker file; it may already be gone, which is fine.
  const markerPath = join(tmpdir(), `teleportation-session-${session_id}.json`);
  try {
    unlinkSync(markerPath);
  } catch {
    // File may already be deleted
  }

  // Tell the relay the session ended (best-effort, bounded by a 5s timeout).
  try {
    const endpoint = `${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/end`;
    const headers = {
      'Authorization': `Bearer ${RELAY_API_KEY}`,
      'Content-Type': 'application/json'
    };
    const body = JSON.stringify({ reason: 'pid_dead' });
    await fetch(endpoint, {
      method: 'POST',
      headers,
      body,
      signal: AbortSignal.timeout(5000)
    });
  } catch {
    // Best-effort - per the original note, the relay is expected to expire the
    // session via heartbeat timeout if this call does not get through.
  }
}
1684
+
1531
1685
  /**
1532
1686
  * Relay API Polling Loop
1533
1687
  * Polls relay API every 5 seconds for approved requests
@@ -1553,8 +1707,25 @@ async function pollRelayAPI() {
1553
1707
  // Debug: Log to file for visibility
1554
1708
  debugLog('daemon-poll-debug.log', `Polling session ${session_id}`);
1555
1709
 
1556
- // Update activity timestamp for cleanup tracking
1557
- sessionActivity.set(session_id, Date.now());
1710
+ // PID liveness check (Bug 2):
1711
+ // Instead of unconditionally refreshing sessionActivity, check if the Claude
1712
+ // process is still alive. Only update activity if PID is confirmed alive.
1713
+ // Throttled to at most once per PID_CHECK_INTERVAL_MS per session.
1714
+ const pidCheckNow = Date.now();
1715
+ const lastCheck = lastPidCheck.get(session_id) || 0;
1716
+ if (pidCheckNow - lastCheck >= PID_CHECK_INTERVAL_MS) {
1717
+ lastPidCheck.set(session_id, pidCheckNow);
1718
+ const pid = getSessionPid(session_id);
1719
+ if (pid !== null && !isPidAlive(pid)) {
1720
+ // PID is dead - clean up and skip this session
1721
+ await cleanupDeadSession(session_id, pid);
1722
+ continue;
1723
+ }
1724
+ // PID is alive (or no PID file exists - backward compat: treat as alive)
1725
+ sessionActivity.set(session_id, pidCheckNow);
1726
+ }
1727
+ // Between PID checks, do NOT refresh sessionActivity - let the cleanup
1728
+ // sweep use the last confirmed-alive timestamp.
1558
1729
 
1559
1730
  // 0) Check for stop_requested flag (mobile stop button)
1560
1731
  // Uses daemon-state endpoint (lightweight) instead of full session fetch
@@ -2820,8 +2991,6 @@ async function cleanup() {
2820
2991
  * Enables daemon to recover sessions after restart
2821
2992
  */
2822
2993
  async function discoverSessionsFromLog() {
2823
- const SESSION_LOG_FILE = join(homedir(), '.teleportation', 'session-events.log');
2824
-
2825
2994
  try {
2826
2995
  const { readFile } = await import('fs/promises');
2827
2996
  const content = await readFile(SESSION_LOG_FILE, 'utf8');
@@ -2906,8 +3075,6 @@ async function discoverSessionsFromLog() {
2906
3075
  * Keeps only active sessions to prevent unbounded growth
2907
3076
  */
2908
3077
  async function compactSessionLog(activeSessions) {
2909
- const SESSION_LOG_FILE = join(homedir(), '.teleportation', 'session-events.log');
2910
-
2911
3078
  try {
2912
3079
  const { writeFile } = await import('fs/promises');
2913
3080
 
@@ -2948,22 +3115,39 @@ async function compactSessionLog(activeSessions) {
2948
3115
  async function main() {
2949
3116
  console.log('[daemon] Main function started.');
2950
3117
 
2951
- // Load credentials from encrypted file if not in environment
2952
- if (!RELAY_API_KEY) {
3118
+ // Load credentials from encrypted file if not in environment, OR if the env var
3119
+ // looks like a relay service key (raw hex, no 'tp_' prefix) rather than a user API key.
3120
+ // This handles the case where Bun auto-loads relay/.env when the daemon cwd is /relay,
3121
+ // injecting a hex service key that causes all heartbeats to fail with 404.
3122
+ const envKeyIsServiceKey = RELAY_API_KEY && !RELAY_API_KEY.startsWith('tp_');
3123
+ if (!RELAY_API_KEY || envKeyIsServiceKey) {
2953
3124
  try {
2954
- console.log('[daemon] RELAY_API_KEY not in environment, loading from credentials file...');
3125
+ console.log(
3126
+ envKeyIsServiceKey
3127
+ ? '[daemon] RELAY_API_KEY looks like a service key (no tp_ prefix), loading user credentials from encrypted file...'
3128
+ : '[daemon] RELAY_API_KEY not in environment, loading from credentials file...'
3129
+ );
2955
3130
  const credManager = new CredentialManager();
2956
3131
  const creds = await credManager.load();
2957
3132
  if (creds && creds.apiKey) {
2958
3133
  RELAY_API_KEY = creds.apiKey;
2959
- RELAY_API_URL = creds.relayUrl || RELAY_API_URL;
3134
+ RELAY_API_URL = creds.relayApiUrl || creds.relayUrl || RELAY_API_URL;
2960
3135
  console.log('[daemon] ✅ Loaded credentials from encrypted file');
3136
+ } else if (envKeyIsServiceKey) {
3137
+ console.warn('[daemon] ⚠️ No user credentials found — refusing to use service key for heartbeats');
3138
+ RELAY_API_KEY = '';
2961
3139
  } else {
2962
3140
  console.warn('[daemon] ⚠️ No API key found in credentials file');
2963
3141
  }
2964
3142
  } catch (e) {
2965
- console.warn('[daemon] ⚠️ Failed to load credentials:', e.message);
2966
- console.warn('[daemon] Daemon will run but cannot authenticate with relay API');
3143
+ if (envKeyIsServiceKey) {
3144
+ console.warn('[daemon] ⚠️ Failed to load credentials and env key is a service key:', e.message);
3145
+ console.warn('[daemon] Clearing service key — daemon will run without relay auth');
3146
+ RELAY_API_KEY = '';
3147
+ } else {
3148
+ console.warn('[daemon] ⚠️ Failed to load credentials:', e.message);
3149
+ console.warn('[daemon] Daemon will run but cannot authenticate with relay API');
3150
+ }
2967
3151
  }
2968
3152
  } else {
2969
3153
  console.log('[daemon] Using RELAY_API_KEY from environment');
@@ -3082,6 +3266,85 @@ async function main() {
3082
3266
  }, HEARTBEAT_INTERVAL_MS);
3083
3267
  console.log(`[daemon] Session heartbeat interval started (${HEARTBEAT_INTERVAL_MS / 1000}s)`);
3084
3268
 
3269
// PID-based session file scan: discover sessions written by session_start.mjs hooks
// and check liveness via OS process table rather than waiting for hooks to re-register.
//
// For every record returned by registry.readAllSessionFiles():
//   - `ended` flag set       -> drop from `sessions` and delete the session file
//   - untracked + PID alive  -> add to `sessions`, ack the file, send a heartbeat
//   - tracked   + PID alive  -> clear any pending dead-PID grace timer
//   - tracked   + PID dead   -> start the grace timer, or mark stopped once the PID
//                               has been dead for DEAD_PID_THRESHOLD_MS
//   - untracked + PID dead   -> delete the stale session file
//
// The returned promise never rejects: this function is also used directly as a
// setInterval callback, where a rejection would surface as an unhandled rejection.
// A failure while handling one record is logged and the scan moves on to the next.
const scanSessionFiles = async () => {
  if (isShuttingDown) return;
  const registry = await getRegistry();
  if (!registry.readAllSessionFiles || !registry.isClaudePidAlive) return;

  let records;
  try { records = await registry.readAllSessionFiles(); } catch { return; }
  // Nothing to do if the registry returned something that cannot be iterated.
  if (records == null || typeof records[Symbol.iterator] !== 'function') return;

  for (const record of records) {
    try {
      const { session_id, claude_pid, cwd, meta, ended } = record;
      if (!session_id || !claude_pid) continue;

      // Fast path: session already ended
      if (ended) {
        if (sessions.has(session_id)) {
          sessions.delete(session_id);
          console.log(`[daemon] Session ${session_id.slice(0,8)} ended (file flag)`);
        }
        try { await registry.deleteSessionFile(session_id); } catch {}
        continue;
      }

      const alive = await registry.isClaudePidAlive(claude_pid);
      // Evaluated after the await so it reflects the map as it is right now.
      const tracked = sessions.has(session_id);

      if (!tracked && alive) {
        // New session discovered via file — add to Map and ack
        // claude_session_id is intentionally set to session_id here: session files
        // don't store the Anthropic-assigned claude session ID (not available at
        // hook time). The daemon only uses session_id for heartbeats/relay calls;
        // claude_session_id is only needed for --resume, which goes through the HTTP
        // registration path or agentic-executor, not file-based discovery.
        sessions.set(session_id, { session_id, claude_session_id: session_id, cwd, meta: meta || {}, claude_pid });
        try { await registry.ackSessionFile(session_id, process.pid); } catch {}
        console.log(`[daemon] Discovered session ${session_id.slice(0,8)} (PID ${claude_pid}, project: ${meta?.project_name || 'unknown'})`);
        // Send immediate heartbeat so relay shows it active right away (best-effort)
        try {
          await fetch(`${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/heartbeat`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${RELAY_API_KEY}` },
            body: JSON.stringify({ timestamp: Date.now() }),
            signal: AbortSignal.timeout(5000)
          });
        } catch {}
      } else if (tracked && alive) {
        // PID is observed alive: discard any grace timer left over from an earlier
        // "dead" observation, so a later death starts a fresh grace period instead
        // of inheriting a stale timestamp.
        const sessionData = sessions.get(session_id);
        if (sessionData && sessionData._pid_dead_since) {
          delete sessionData._pid_dead_since;
        }
      } else if (tracked && !alive) {
        // Track when PID first went dead
        const sessionData = sessions.get(session_id);
        const now = Date.now();
        if (!sessionData._pid_dead_since) {
          sessionData._pid_dead_since = now;
          console.log(`[daemon] Session ${session_id.slice(0,8)} PID ${claude_pid} no longer alive — starting ${DEAD_PID_THRESHOLD_MS / 1000}s grace period`);
        } else if (now - sessionData._pid_dead_since >= DEAD_PID_THRESHOLD_MS) {
          // Grace period expired — mark stopped
          sessions.delete(session_id);
          try { await registry.deleteSessionFile(session_id); } catch {}
          console.log(`[daemon] Session ${session_id.slice(0,8)} marked stopped (PID dead > ${DEAD_PID_THRESHOLD_MS / 1000}s)`);
          // Report the stop to the relay (best-effort)
          try {
            await fetch(`${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/daemon-state`, {
              method: 'PATCH',
              headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${RELAY_API_KEY}` },
              body: JSON.stringify({ status: 'stopped', stopped_reason: 'pid_dead' }),
              signal: AbortSignal.timeout(5000)
            });
          } catch {}
        }
      } else {
        // Stale file for a dead PID we never tracked — clean up
        try { await registry.deleteSessionFile(session_id); } catch {}
      }
    } catch (e) {
      // One malformed record or a failing liveness probe must not abort the scan.
      console.warn('[daemon] Session file scan skipped a record:', e?.message ?? e);
    }
  }
};
3341
+
3342
+ // Run scan on startup to recover sessions from a daemon restart
3343
+ scanSessionFiles().catch(e => console.warn('[daemon] Initial session file scan failed:', e.message));
3344
+ // Then scan on every heartbeat cycle
3345
+ setInterval(scanSessionFiles, HEARTBEAT_INTERVAL_MS);
3346
+ console.log(`[daemon] PID-based session file scan active (${HEARTBEAT_INTERVAL_MS / 1000}s interval)`);
3347
+
3085
3348
  // Start polling loop
3086
3349
  console.log('[daemon] Starting relay API polling...');
3087
3350
  pollRelayAPI();
@@ -3152,7 +3415,14 @@ const __test = {
3152
3415
  // Stopped sessions test helpers
3153
3416
  _getStoppedSessions: () => stoppedSessions,
3154
3417
  _addStoppedSession: (session_id) => stoppedSessions.add(session_id),
3155
- _clearStoppedSessions: () => stoppedSessions.clear()
3418
+ _clearStoppedSessions: () => stoppedSessions.clear(),
3419
+ // PID liveness test helpers (Bug 2)
3420
+ isPidAlive,
3421
+ getSessionPid,
3422
+ cleanupDeadSession,
3423
+ _getSessionPidCache: () => sessionPidCache,
3424
+ _getLastPidCheck: () => lastPidCheck,
3425
+ _getSessionActivity: () => sessionActivity,
3156
3426
  };
3157
3427
 
3158
3428
  // Test helper to register a session