teleportation-cli 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -39,10 +39,29 @@ import { fileURLToPath } from 'url';
39
39
  import { spawn, exec } from 'child_process';
40
40
  import { promisify } from 'util';
41
41
  import { homedir, tmpdir } from 'os';
42
- import { existsSync, appendFileSync } from 'fs';
43
- import { join } from 'path';
42
+ import { existsSync, appendFileSync, readFileSync, unlinkSync } from 'fs';
43
+ import { join, dirname } from 'path';
44
44
  // NOTE: PID locking is handled by agent-process at the platform level (launchd/systemd/pm2).
45
45
  // Signal handling and heartbeat management are handled inline below.
46
+
47
// File-based session registry — imported lazily so the daemon can start without it.

/**
 * No-op sentinel returned by getRegistry() when the registry module cannot be
 * imported. Both entry points are null, so callers guard with
 * `if (!registry.readAllSessionFiles) return` before using it.
 */
const REGISTRY_UNAVAILABLE = Object.freeze({ readAllSessionFiles: null, isClaudePidAlive: null });

/**
 * Cached module namespace of session-file-registry.js. Stays null after a
 * failed import so the next call to getRegistry() retries automatically.
 */
let _registry = null;

/**
 * Lazily load session-file-registry.js, which is expected to sit next to this
 * module, and cache the result.
 *
 * @returns {Promise<object>} the imported registry module, or
 *   REGISTRY_UNAVAILABLE when the import fails (the failure is logged and the
 *   import is attempted again on the next call)
 */
async function getRegistry() {
  if (_registry) return _registry;
  try {
    // Resolve against this module's URL and hand import() a file:// URL.
    // A bare absolute filesystem path is not a valid ESM specifier on Windows
    // (e.g. "C:\..."), so building the path with join()/dirname() would make
    // the registry permanently unavailable there.
    const registryUrl = new URL('./session-file-registry.js', import.meta.url).href;
    _registry = await import(registryUrl);
  } catch (e) {
    console.warn('[daemon] session-file-registry not available (will retry next scan):', e.message);
    return REGISTRY_UNAVAILABLE;
  }
  return _registry;
}
46
65
  // The following were removed in PRD-0025 migration:
47
66
  // - pid-manager.js (replaced by agent-process platform locking)
48
67
  // - lifecycle.js (replaced by inline signal handlers)
@@ -54,6 +73,7 @@ import {
54
73
  executeTaskTurn,
55
74
  stopTask,
56
75
  stopAllTasks,
76
+ stopTasksForSession,
57
77
  } from './task-executor-v2.js';
58
78
 
59
79
  // Transcript ingestion for timeline completeness
@@ -86,6 +106,8 @@ const CLAUDE_CLI = process.env.CLAUDE_CLI_PATH || 'claude'; // Configurable Clau
86
106
  const ALLOW_ALL_COMMANDS = process.env.TELEPORTATION_DAEMON_ALLOW_ALL_COMMANDS === 'true';
87
107
  const HEARTBEAT_INTERVAL_MS = parseInt(process.env.DAEMON_HEARTBEAT_INTERVAL_MS || '30000', 10); // 30 sec default
88
108
  const HEARTBEAT_CHECK_INTERVAL_MS = parseInt(process.env.DAEMON_HEARTBEAT_CHECK_INTERVAL_MS || '60000', 10); // 1 min default
109
+ // How long a PID must be dead before the daemon marks the session stopped (ms)
110
+ const DEAD_PID_THRESHOLD_MS = parseInt(process.env.DAEMON_DEAD_PID_THRESHOLD_MS || '60000', 10); // 60s default
89
111
 
90
112
  // Message routing configuration
91
113
  // REQUIRE_COMMAND_WHITELIST: If true, use legacy shell execution with command whitelist
@@ -109,6 +131,29 @@ const ROUTER_MAX_ESCALATIONS = parseInt(process.env.TELEPORTATION_ROUTER_MAX_ESC
109
131
  // Debug logging configuration
110
132
  const DEBUG = process.env.TELEPORTATION_DEBUG === 'true';
111
133
  const LOG_DIR = process.env.TELEPORTATION_LOG_DIR || tmpdir();
134
+ const SESSION_LOG_FILE = join(homedir(), '.teleportation', 'session-events.log');
135
+
136
/**
 * Write one JSON line of type 'register' for the given session to
 * SESSION_LOG_FILE, so that a restarted daemon can recover the session's full
 * metadata (hostname, branch, etc.) and re-register it correctly after a
 * Redis TTL expiry.
 *
 * Best-effort: any failure is swallowed, and only reported when DEBUG is on.
 *
 * @param {object} session - session entry (session_id, claude_session_id,
 *   pid, cwd, meta, registered_at are read)
 */
function appendSessionRegisterLog(session) {
  try {
    const { session_id, claude_session_id, pid, cwd, meta, registered_at } = session;
    const record = {
      type: 'register',
      session_id,
      claude_session_id,
      pid: pid || null,
      cwd,
      meta,
      timestamp: registered_at || Date.now()
    };
    appendFileSync(SESSION_LOG_FILE, `${JSON.stringify(record)}\n`);
  } catch (err) {
    if (!DEBUG) return;
    console.error(`[daemon] Failed to append session log: ${err.message}`);
  }
}
112
157
 
113
158
  /**
114
159
  * Cross-platform debug logging utility
@@ -242,6 +287,13 @@ const stoppedSessions = new Set();
242
287
  // Session activity tracking for cleanup
243
288
  const sessionActivity = new Map(); // sessionId -> lastActivityTimestamp
244
289
 
290
+ // PID liveness tracking (Bug 2):
291
+ // Caches the Claude PID per session to avoid re-reading the session file every poll cycle.
292
+ // lastPidCheck throttles the PID check to at most once every PID_CHECK_INTERVAL_MS per session.
293
+ const sessionPidCache = new Map(); // sessionId -> number (claude_pid)
294
+ const lastPidCheck = new Map(); // sessionId -> number (timestamp of last check)
295
+ const PID_CHECK_INTERVAL_MS = parseInt(process.env.DAEMON_PID_CHECK_INTERVAL_MS || '30000', 10); // 30s default
296
+
245
297
  // Transcript ingestion throttling: Prevents concurrent ingestion runs per session
246
298
  // Map<session_id, Promise> tracks in-progress ingestion promises
247
299
  // If ingestion takes >5 seconds, prevents stacking multiple concurrent calls
@@ -270,6 +322,8 @@ setInterval(() => {
270
322
  sessionActivity.delete(sessionId);
271
323
  heartbeatState.delete(sessionId);
272
324
  heartbeatFailureLogged.delete(sessionId);
325
+ sessionPidCache.delete(sessionId);
326
+ lastPidCheck.delete(sessionId);
273
327
  sessionCleanedCount++;
274
328
 
275
329
  if (process.env.DEBUG) {
@@ -787,7 +841,11 @@ async function handleRequest(req, res) {
787
841
  }
788
842
  }
789
843
 
790
- sessions.set(session_id, {
844
+ // Clear stale PID cache so the new session's PID is read fresh from marker file
845
+ sessionPidCache.delete(session_id);
846
+ lastPidCheck.delete(session_id);
847
+
848
+ const sessionEntry = {
791
849
  session_id,
792
850
  claude_session_id: claude_session_id || session_id, // Fallback to session_id if not provided
793
851
  cwd: cwd || process.cwd(),
@@ -796,7 +854,10 @@ async function handleRequest(req, res) {
796
854
  daemon_pid: process.pid // Add daemon PID to metadata
797
855
  },
798
856
  registered_at: Date.now()
799
- });
857
+ };
858
+ sessions.set(session_id, sessionEntry);
859
+ // Persist to log so daemon restarts can recover full meta for re-registration
860
+ appendSessionRegisterLog(sessionEntry);
800
861
 
801
862
  console.log(`[daemon] Session registered: ${session_id} (claude_id: ${claude_session_id || session_id}) (daemon_pid: ${process.pid}) (cwd: ${cwd || process.cwd()})`);
802
863
 
@@ -1144,6 +1205,67 @@ async function handleInboxMessage(session_id, message) {
1144
1205
  return;
1145
1206
  }
1146
1207
 
1208
+ // Check for paused tasks — route message to task instead of spawning new process.
1209
+ // This adds one relay round-trip per inbox message, but only fires for 'command' type
1210
+ // messages (user-initiated from mobile), not for auto-continue or approval messages.
1211
+ // Future optimization: include paused task info in the session polling response.
1212
+ try {
1213
+ const tasksResp = await fetch(
1214
+ `${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/tasks`,
1215
+ { headers: { 'Authorization': `Bearer ${RELAY_API_KEY}` }, signal: AbortSignal.timeout(5000) }
1216
+ );
1217
+ if (tasksResp.ok) {
1218
+ const tasks = await tasksResp.json();
1219
+ const pausedTasks = tasks.filter(t => t.status === 'paused' || t.status === 'waiting_input');
1220
+ if (pausedTasks.length > 1) {
1221
+ logWarn(`[daemon] ⚠️ Multiple paused tasks (${pausedTasks.length}) for session ${session_id} — routing to first`);
1222
+ }
1223
+ const pausedTask = pausedTasks[0];
1224
+ if (pausedTask) {
1225
+ logInfo(`[daemon] 📨 Routing message to paused task ${pausedTask.id.slice(0, 20)}... (status: ${pausedTask.status})`);
1226
+
1227
+ let routeResp;
1228
+ if (pausedTask.status === 'waiting_input') {
1229
+ // Task is waiting for user input — use the answer endpoint
1230
+ routeResp = await fetch(
1231
+ `${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/tasks/${encodeURIComponent(pausedTask.id)}/answer`,
1232
+ {
1233
+ method: 'POST',
1234
+ headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${RELAY_API_KEY}` },
1235
+ body: JSON.stringify({ answer: commandText })
1236
+ }
1237
+ );
1238
+ } else {
1239
+ // Task is paused — use redirect to set new instructions and resume
1240
+ routeResp = await fetch(
1241
+ `${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/tasks/${encodeURIComponent(pausedTask.id)}/redirect`,
1242
+ {
1243
+ method: 'POST',
1244
+ headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${RELAY_API_KEY}` },
1245
+ body: JSON.stringify({ instruction: commandText })
1246
+ }
1247
+ );
1248
+ }
1249
+
1250
+ if (routeResp.ok) {
1251
+ logInfo(`[daemon] ✅ Message routed to task, will resume on next poll cycle`);
1252
+ // Acknowledge the inbox message
1253
+ await fetch(`${RELAY_API_URL}/api/messages/${encodeURIComponent(message.id)}/ack`, {
1254
+ method: 'POST',
1255
+ headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${RELAY_API_KEY}` },
1256
+ body: JSON.stringify({ session_id })
1257
+ }).catch(() => {});
1258
+ return; // Don't spawn a new process
1259
+ }
1260
+ // If route failed (e.g. wrong status), fall through to normal execution
1261
+ logWarn(`[daemon] Failed to route message to task (${routeResp.status}), falling back to normal execution`);
1262
+ }
1263
+ }
1264
+ } catch (taskCheckError) {
1265
+ // Non-critical — fall through to normal execution
1266
+ logWarn(`[daemon] Task check failed: ${taskCheckError.message}`);
1267
+ }
1268
+
1147
1269
  // Invalidate pending approvals BEFORE executing new command
1148
1270
  // This prevents race conditions where stale approvals could be acted upon
1149
1271
  try {
@@ -1466,6 +1588,100 @@ async function sendHeartbeat(session_id) {
1466
1588
  }
1467
1589
  }
1468
1590
 
1591
/**
 * Check if a Claude PID is still alive.
 * Uses signal 0 (no signal sent, just existence check).
 *
 * Only a positive integer is treated as a process ID. On POSIX, kill(2)
 * interprets 0 and negative values as process-group targets, so forwarding
 * them would report "alive" for something that is not a single process.
 *
 * CROSS-REFERENCE: pid-liveness.test.js has an equivalent local implementation.
 * If this logic changes, update the test copy to match.
 * @param {number} pid - Process ID to check
 * @returns {boolean} true if process exists, false otherwise (including when
 *   pid is not a positive integer)
 */
function isPidAlive(pid) {
  try {
    // Normalize first so a numeric string PID is still accepted.
    const target = Number(pid);
    if (!Number.isInteger(target) || target <= 0) return false;
    process.kill(target, 0);
    return true;
  } catch (e) {
    // EPERM means process exists but we lack permission to signal it (different user/container)
    return e.code === 'EPERM';
  }
}
1608
+
1609
/**
 * Read Claude PID from session marker file.
 * Caches the PID per session to avoid reading the file on every poll cycle.
 *
 * The marker file is `teleportation-session-<session_id>.json` in the OS temp
 * directory; its `claude_pid` field must be a positive integer to be accepted.
 * Anything else (missing file, invalid JSON, fractional or non-numeric value)
 * yields null and is not cached, so the file is read again on the next call.
 *
 * CROSS-REFERENCE: pid-liveness.test.js has an equivalent local implementation
 * (without cache). If this file-reading logic changes, update the test copy to match.
 * @param {string} session_id - Session ID
 * @returns {number|null} Claude PID or null if unavailable
 */
function getSessionPid(session_id) {
  // Check cache first
  if (sessionPidCache.has(session_id)) {
    return sessionPidCache.get(session_id);
  }

  // Read from marker file
  const sessionFile = join(tmpdir(), `teleportation-session-${session_id}.json`);
  try {
    const { claude_pid: pid } = JSON.parse(readFileSync(sessionFile, 'utf8'));
    // Require an integer: a fractional value is not a valid PID, and caching
    // it would make the liveness check treat the session as dead.
    if (Number.isInteger(pid) && pid > 0) {
      sessionPidCache.set(session_id, pid);
      return pid;
    }
  } catch {
    // File doesn't exist or can't be read - return null (no PID available)
  }
  return null;
}
1638
+
1639
/**
 * Tear down a session whose Claude process is no longer running.
 *
 * Drops the session from the daemon's in-memory tracking collections, records
 * it in stoppedSessions, removes its marker file from the OS temp directory and
 * finally asks the relay to end the session. The file removal and the relay
 * call are best-effort: their failures are ignored.
 *
 * @param {string} session_id - Session ID to clean up
 * @param {number|null} pid - The dead PID (for logging)
 * @returns {Promise<void>}
 */
async function cleanupDeadSession(session_id, pid) {
  logInfo(`[daemon] PID ${pid} for session ${session_id.slice(0, 8)}... is dead - cleaning up`);

  // Forget the session everywhere it is tracked.
  const trackers = [
    sessions,
    sessionActivity,
    heartbeatState,
    heartbeatFailureLogged,
    sessionPidCache,
    lastPidCheck,
    ingestionInProgress
  ];
  for (const tracker of trackers) {
    tracker.delete(session_id);
  }
  // Prevent re-activation from stale registration attempts.
  stoppedSessions.add(session_id);

  // Remove the marker file; it may already have been deleted.
  const markerPath = join(tmpdir(), `teleportation-session-${session_id}.json`);
  try {
    unlinkSync(markerPath);
  } catch {
    // Nothing to do - the file is already gone or cannot be removed.
  }

  // Deregister from the relay. If this fails the relay will eventually expire
  // the session via heartbeat timeout.
  try {
    const endUrl = `${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/end`;
    const request = {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${RELAY_API_KEY}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({ reason: 'pid_dead' }),
      signal: AbortSignal.timeout(5000)
    };
    await fetch(endUrl, request);
  } catch {
    // Best-effort only.
  }
}
1684
+
1469
1685
  /**
1470
1686
  * Relay API Polling Loop
1471
1687
  * Polls relay API every 5 seconds for approved requests
@@ -1491,8 +1707,96 @@ async function pollRelayAPI() {
1491
1707
  // Debug: Log to file for visibility
1492
1708
  debugLog('daemon-poll-debug.log', `Polling session ${session_id}`);
1493
1709
 
1494
- // Update activity timestamp for cleanup tracking
1495
- sessionActivity.set(session_id, Date.now());
1710
+ // PID liveness check (Bug 2):
1711
+ // Instead of unconditionally refreshing sessionActivity, check if the Claude
1712
+ // process is still alive. Only update activity if PID is confirmed alive.
1713
+ // Throttled to at most once per PID_CHECK_INTERVAL_MS per session.
1714
+ const pidCheckNow = Date.now();
1715
+ const lastCheck = lastPidCheck.get(session_id) || 0;
1716
+ if (pidCheckNow - lastCheck >= PID_CHECK_INTERVAL_MS) {
1717
+ lastPidCheck.set(session_id, pidCheckNow);
1718
+ const pid = getSessionPid(session_id);
1719
+ if (pid !== null && !isPidAlive(pid)) {
1720
+ // PID is dead - clean up and skip this session
1721
+ await cleanupDeadSession(session_id, pid);
1722
+ continue;
1723
+ }
1724
+ // PID is alive (or no PID file exists - backward compat: treat as alive)
1725
+ sessionActivity.set(session_id, pidCheckNow);
1726
+ }
1727
+ // Between PID checks, do NOT refresh sessionActivity - let the cleanup
1728
+ // sweep use the last confirmed-alive timestamp.
1729
+
1730
+ // 0) Check for stop_requested flag (mobile stop button)
1731
+ // Uses daemon-state endpoint (lightweight) instead of full session fetch
1732
+ try {
1733
+ const stopCheckResponse = await fetch(
1734
+ `${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/daemon-state`,
1735
+ {
1736
+ headers: { 'Authorization': `Bearer ${RELAY_API_KEY}` },
1737
+ signal: AbortSignal.timeout(5000)
1738
+ }
1739
+ );
1740
+
1741
+ if (stopCheckResponse.ok) {
1742
+ const daemonState = await stopCheckResponse.json();
1743
+
1744
+ if (daemonState.stop_requested) {
1745
+ logInfo(`[daemon] 🛑 Stop requested for session ${session_id} — killing running processes`);
1746
+
1747
+ // Kill any running approval execution processes for this session
1748
+ let killedExecution = false;
1749
+ for (const [approval_id, exec] of executions) {
1750
+ if (exec.session_id === session_id && exec.status === 'executing' && exec.child_process) {
1751
+ try {
1752
+ const child = exec.child_process;
1753
+ child.kill('SIGTERM');
1754
+ // Track SIGKILL timer so it can be cancelled if process exits cleanly
1755
+ const killTimer = setTimeout(() => {
1756
+ try { child.kill('SIGKILL'); } catch {}
1757
+ }, 2000);
1758
+ child.once('exit', () => clearTimeout(killTimer));
1759
+ killedExecution = true;
1760
+ logInfo(`[daemon] Killed execution process for approval ${approval_id}`);
1761
+ } catch (killErr) {
1762
+ logWarn(`[daemon] Failed to kill execution ${approval_id}: ${killErr.message}`);
1763
+ }
1764
+ }
1765
+ }
1766
+
1767
+ // Kill any running task processes for this session
1768
+ const killedTaskCount = stopTasksForSession(session_id);
1769
+ const killedTask = killedTaskCount > 0;
1770
+ if (killedTask) {
1771
+ logInfo(`[daemon] Killed ${killedTaskCount} task process(es) for session ${session_id}`);
1772
+ }
1773
+
1774
+ // Clear stop_requested flag
1775
+ try {
1776
+ await fetch(`${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/daemon-state`, {
1777
+ method: 'PATCH',
1778
+ headers: {
1779
+ 'Content-Type': 'application/json',
1780
+ 'Authorization': `Bearer ${RELAY_API_KEY}`
1781
+ },
1782
+ body: JSON.stringify({ stop_requested: false })
1783
+ });
1784
+ logInfo(`[daemon] Cleared stop_requested flag for session ${session_id}`);
1785
+ } catch (clearErr) {
1786
+ logWarn(`[daemon] Failed to clear stop_requested: ${clearErr.message}`);
1787
+ }
1788
+
1789
+ if (!killedExecution && !killedTask) {
1790
+ logInfo(`[daemon] No running processes found to stop for session ${session_id}`);
1791
+ }
1792
+ }
1793
+ }
1794
+ } catch (stopCheckError) {
1795
+ // Don't block polling if stop check fails
1796
+ if (stopCheckError.name !== 'AbortError') {
1797
+ logWarn(`[daemon] Stop check error for ${session_id}: ${stopCheckError.message}`);
1798
+ }
1799
+ }
1496
1800
 
1497
1801
  // 1) Approvals polling (existing behavior)
1498
1802
  try {
@@ -1603,8 +1907,8 @@ async function pollRelayAPI() {
1603
1907
 
1604
1908
  // Process each task (stateless - queries timeline each time)
1605
1909
  for (const task of tasks) {
1606
- // Skip stopped/completed tasks
1607
- if (task.status === 'stopped' || task.status === 'completed') {
1910
+ // Skip non-runnable tasks (paused tasks wait for user message to resume)
1911
+ if (task.status === 'stopped' || task.status === 'completed' || task.status === 'paused') {
1608
1912
  continue;
1609
1913
  }
1610
1914
 
@@ -1643,11 +1947,21 @@ async function pollRelayAPI() {
1643
1947
  const claude_session_id = sessionData.claude_session_id || session_id;
1644
1948
  const cwd = sessionData.cwd || process.cwd();
1645
1949
 
1950
+ // 4) Heartbeat - send periodically to keep session alive
1951
+ // Must run before ingestion throttle check — ingestion `continue` must not skip heartbeats
1952
+ const now = Date.now();
1953
+ const sessionHeartbeat = heartbeatState.get(session_id);
1954
+ const lastSent = sessionHeartbeat?.lastSent || 0;
1955
+ if (now - lastSent >= SESSION_HEARTBEAT_INTERVAL_MS) {
1956
+ await sendHeartbeat(session_id);
1957
+ }
1958
+
1959
+ // 5) Transcript ingestion - backup to stop hook for timeline completeness
1646
1960
  // Throttling: Check if ingestion is already in progress for this session
1647
1961
  // Prevents concurrent ingestion runs that could cause race conditions
1648
1962
  if (ingestionInProgress.has(session_id)) {
1649
1963
  debugLog('daemon-transcript-debug.log', `Ingestion already in progress for ${session_id}, skipping`);
1650
- // Skip this polling cycle for this session
1964
+ // Skip ingestion this cycle (heartbeat already sent above)
1651
1965
  continue;
1652
1966
  }
1653
1967
 
@@ -1680,15 +1994,6 @@ async function pollRelayAPI() {
1680
1994
 
1681
1995
  // Track the promise (but don't await - fire-and-forget)
1682
1996
  ingestionInProgress.set(session_id, ingestionPromise);
1683
-
1684
- // 5) Heartbeat - send periodically to keep session alive
1685
- // Only send heartbeat if enough time has passed since last one (throttled per session)
1686
- const now = Date.now();
1687
- const sessionHeartbeat = heartbeatState.get(session_id);
1688
- const lastSent = sessionHeartbeat?.lastSent || 0;
1689
- if (now - lastSent >= SESSION_HEARTBEAT_INTERVAL_MS) {
1690
- await sendHeartbeat(session_id);
1691
- }
1692
1997
  }
1693
1998
 
1694
1999
  // Process approval queue
@@ -1823,6 +2128,7 @@ async function processQueue() {
1823
2128
  // Mark as executing (child_process will be set when spawnClaudeProcess is called)
1824
2129
  executions.set(approval_id, {
1825
2130
  approval_id,
2131
+ session_id,
1826
2132
  status: 'executing',
1827
2133
  started_at: Date.now(),
1828
2134
  completed_at: null,
@@ -2685,8 +2991,6 @@ async function cleanup() {
2685
2991
  * Enables daemon to recover sessions after restart
2686
2992
  */
2687
2993
  async function discoverSessionsFromLog() {
2688
- const SESSION_LOG_FILE = join(homedir(), '.teleportation', 'session-events.log');
2689
-
2690
2994
  try {
2691
2995
  const { readFile } = await import('fs/promises');
2692
2996
  const content = await readFile(SESSION_LOG_FILE, 'utf8');
@@ -2771,8 +3075,6 @@ async function discoverSessionsFromLog() {
2771
3075
  * Keeps only active sessions to prevent unbounded growth
2772
3076
  */
2773
3077
  async function compactSessionLog(activeSessions) {
2774
- const SESSION_LOG_FILE = join(homedir(), '.teleportation', 'session-events.log');
2775
-
2776
3078
  try {
2777
3079
  const { writeFile } = await import('fs/promises');
2778
3080
 
@@ -2813,22 +3115,39 @@ async function compactSessionLog(activeSessions) {
2813
3115
  async function main() {
2814
3116
  console.log('[daemon] Main function started.');
2815
3117
 
2816
- // Load credentials from encrypted file if not in environment
2817
- if (!RELAY_API_KEY) {
3118
+ // Load credentials from encrypted file if not in environment, OR if the env var
3119
+ // looks like a relay service key (raw hex, no 'tp_' prefix) rather than a user API key.
3120
+ // This handles the case where Bun auto-loads relay/.env when the daemon cwd is /relay,
3121
+ // injecting a hex service key that causes all heartbeats to fail with 404.
3122
+ const envKeyIsServiceKey = RELAY_API_KEY && !RELAY_API_KEY.startsWith('tp_');
3123
+ if (!RELAY_API_KEY || envKeyIsServiceKey) {
2818
3124
  try {
2819
- console.log('[daemon] RELAY_API_KEY not in environment, loading from credentials file...');
3125
+ console.log(
3126
+ envKeyIsServiceKey
3127
+ ? '[daemon] RELAY_API_KEY looks like a service key (no tp_ prefix), loading user credentials from encrypted file...'
3128
+ : '[daemon] RELAY_API_KEY not in environment, loading from credentials file...'
3129
+ );
2820
3130
  const credManager = new CredentialManager();
2821
3131
  const creds = await credManager.load();
2822
3132
  if (creds && creds.apiKey) {
2823
3133
  RELAY_API_KEY = creds.apiKey;
2824
- RELAY_API_URL = creds.relayUrl || RELAY_API_URL;
3134
+ RELAY_API_URL = creds.relayApiUrl || creds.relayUrl || RELAY_API_URL;
2825
3135
  console.log('[daemon] ✅ Loaded credentials from encrypted file');
3136
+ } else if (envKeyIsServiceKey) {
3137
+ console.warn('[daemon] ⚠️ No user credentials found — refusing to use service key for heartbeats');
3138
+ RELAY_API_KEY = '';
2826
3139
  } else {
2827
3140
  console.warn('[daemon] ⚠️ No API key found in credentials file');
2828
3141
  }
2829
3142
  } catch (e) {
2830
- console.warn('[daemon] ⚠️ Failed to load credentials:', e.message);
2831
- console.warn('[daemon] Daemon will run but cannot authenticate with relay API');
3143
+ if (envKeyIsServiceKey) {
3144
+ console.warn('[daemon] ⚠️ Failed to load credentials and env key is a service key:', e.message);
3145
+ console.warn('[daemon] Clearing service key — daemon will run without relay auth');
3146
+ RELAY_API_KEY = '';
3147
+ } else {
3148
+ console.warn('[daemon] ⚠️ Failed to load credentials:', e.message);
3149
+ console.warn('[daemon] Daemon will run but cannot authenticate with relay API');
3150
+ }
2832
3151
  }
2833
3152
  } else {
2834
3153
  console.log('[daemon] Using RELAY_API_KEY from environment');
@@ -2890,6 +3209,41 @@ async function main() {
2890
3209
  });
2891
3210
  if (!hbResponse.ok) {
2892
3211
  const errMsg = `HTTP ${hbResponse.status}`;
3212
+
3213
+ // 404 means session expired from Redis — try to re-register it.
3214
+ // The relay heartbeat endpoint also attempts recovery from mech-storage,
3215
+ // but if that fails (e.g., session never persisted), daemon-side re-registration
3216
+ // ensures the session is recreated with correct metadata.
3217
+ if (hbResponse.status === 404) {
3218
+ const sessionData = sessions.get(sessionId);
3219
+ try {
3220
+ const regResponse = await fetch(`${RELAY_API_URL}/api/sessions/register`, {
3221
+ method: 'POST',
3222
+ headers: {
3223
+ 'Content-Type': 'application/json',
3224
+ 'Authorization': `Bearer ${RELAY_API_KEY}`
3225
+ },
3226
+ body: JSON.stringify({
3227
+ session_id: sessionId,
3228
+ claude_session_id: sessionData?.claude_session_id || undefined,
3229
+ cwd: sessionData?.cwd || process.cwd(),
3230
+ meta: sessionData?.meta || {}
3231
+ }),
3232
+ signal: AbortSignal.timeout(5000)
3233
+ });
3234
+ if (regResponse.ok) {
3235
+ console.log(`[daemon] Re-registered expired session ${sessionId} after heartbeat 404`);
3236
+ // Clear failure tracking so next heartbeat is treated fresh
3237
+ heartbeatFailureLogged.delete(sessionId);
3238
+ continue; // Skip failure logging — session recovered
3239
+ } else {
3240
+ console.warn(`[daemon] Failed to re-register session ${sessionId}: HTTP ${regResponse.status}`);
3241
+ }
3242
+ } catch (regErr) {
3243
+ console.warn(`[daemon] Re-registration attempt failed for ${sessionId}: ${regErr.message}`);
3244
+ }
3245
+ }
3246
+
2893
3247
  if (!heartbeatFailureLogged.has(sessionId)) {
2894
3248
  heartbeatFailureLogged.add(sessionId);
2895
3249
  console.warn(`[daemon] Heartbeat rejected for ${sessionId}: ${errMsg} (further failures for this session suppressed unless DEBUG is set)`);
@@ -2912,6 +3266,85 @@ async function main() {
2912
3266
  }, HEARTBEAT_INTERVAL_MS);
2913
3267
  console.log(`[daemon] Session heartbeat interval started (${HEARTBEAT_INTERVAL_MS / 1000}s)`);
2914
3268
 
3269
// PID-based session file scan: discover sessions written by session_start.mjs hooks
// and check the liveness of their Claude PID through the session file registry,
// rather than waiting for hooks to re-register.
/**
 * Reconcile the in-memory `sessions` map with the session files on disk.
 *
 * For each record returned by the registry:
 *  - `ended` flag set: drop the session and delete its file.
 *  - untracked + PID alive: start tracking it, ack the file, send a heartbeat.
 *  - tracked + PID alive: cancel any grace period started by an earlier scan.
 *  - tracked + PID dead: start a grace period, and once the PID has stayed dead
 *    for DEAD_PID_THRESHOLD_MS, drop the session, delete its file and tell the
 *    relay it stopped.
 *  - untracked + PID dead: delete the stale file.
 *
 * This function never rejects: it is scheduled with setInterval, where a
 * rejected promise would go unhandled. A failure while handling one record is
 * logged and the scan moves on to the next record.
 *
 * @returns {Promise<void>}
 */
const scanSessionFiles = async () => {
  if (isShuttingDown) return;
  const registry = await getRegistry();
  if (!registry.readAllSessionFiles || !registry.isClaudePidAlive) return;

  let records;
  try { records = await registry.readAllSessionFiles(); } catch { return; }
  // Nothing to scan if the registry returned no iterable collection.
  if (!records || typeof records[Symbol.iterator] !== 'function') return;

  for (const record of records) {
    try {
      const { session_id, claude_pid, cwd, meta, ended } = record;
      if (!session_id || !claude_pid) continue;

      // Fast path: session already ended
      if (ended) {
        if (sessions.has(session_id)) {
          sessions.delete(session_id);
          console.log(`[daemon] Session ${session_id.slice(0,8)} ended (file flag)`);
        }
        try { await registry.deleteSessionFile(session_id); } catch {}
        continue;
      }

      const alive = await registry.isClaudePidAlive(claude_pid);
      const tracked = sessions.has(session_id);

      if (!tracked && alive) {
        // New session discovered via file — add to Map and ack
        // claude_session_id is intentionally set to session_id here: session files
        // don't store the Anthropic-assigned claude session ID (not available at
        // hook time). The daemon only uses session_id for heartbeats/relay calls;
        // claude_session_id is only needed for --resume, which goes through the HTTP
        // registration path or agentic-executor, not file-based discovery.
        sessions.set(session_id, { session_id, claude_session_id: session_id, cwd, meta: meta || {}, claude_pid });
        try { await registry.ackSessionFile(session_id, process.pid); } catch {}
        console.log(`[daemon] Discovered session ${session_id.slice(0,8)} (PID ${claude_pid}, project: ${meta?.project_name || 'unknown'})`);
        // Send immediate heartbeat so relay shows it active right away
        try {
          await fetch(`${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/heartbeat`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${RELAY_API_KEY}` },
            body: JSON.stringify({ timestamp: Date.now() }),
            signal: AbortSignal.timeout(5000)
          });
        } catch {}
      } else if (tracked && alive) {
        // PID is alive again: cancel any grace period started by an earlier
        // scan, so a later death gets a full DEAD_PID_THRESHOLD_MS window
        // instead of being measured from the old timestamp.
        const sessionData = sessions.get(session_id);
        if (sessionData._pid_dead_since) {
          delete sessionData._pid_dead_since;
          console.log(`[daemon] Session ${session_id.slice(0,8)} PID ${claude_pid} is alive again — grace period cancelled`);
        }
      } else if (tracked && !alive) {
        // Track when PID first went dead
        const sessionData = sessions.get(session_id);
        const now = Date.now();
        if (!sessionData._pid_dead_since) {
          sessionData._pid_dead_since = now;
          console.log(`[daemon] Session ${session_id.slice(0,8)} PID ${claude_pid} no longer alive — starting ${DEAD_PID_THRESHOLD_MS / 1000}s grace period`);
        } else if (now - sessionData._pid_dead_since >= DEAD_PID_THRESHOLD_MS) {
          // Grace period expired — mark stopped
          sessions.delete(session_id);
          try { await registry.deleteSessionFile(session_id); } catch {}
          console.log(`[daemon] Session ${session_id.slice(0,8)} marked stopped (PID dead > ${DEAD_PID_THRESHOLD_MS / 1000}s)`);
          try {
            await fetch(`${RELAY_API_URL}/api/sessions/${encodeURIComponent(session_id)}/daemon-state`, {
              method: 'PATCH',
              headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${RELAY_API_KEY}` },
              body: JSON.stringify({ status: 'stopped', stopped_reason: 'pid_dead' }),
              signal: AbortSignal.timeout(5000)
            });
          } catch {}
        }
      } else {
        // Stale file for a dead PID we never tracked — clean up
        try { await registry.deleteSessionFile(session_id); } catch {}
      }
    } catch (recordErr) {
      // One malformed record or failing liveness probe must not abort the
      // whole scan or surface as an unhandled rejection.
      console.warn('[daemon] Session file scan failed for a record:', recordErr.message);
    }
  }
};
3341
+
3342
+ // Run scan on startup to recover sessions from a daemon restart
3343
+ scanSessionFiles().catch(e => console.warn('[daemon] Initial session file scan failed:', e.message));
3344
+ // Then scan on every heartbeat cycle
3345
+ setInterval(scanSessionFiles, HEARTBEAT_INTERVAL_MS);
3346
+ console.log(`[daemon] PID-based session file scan active (${HEARTBEAT_INTERVAL_MS / 1000}s interval)`);
3347
+
2915
3348
  // Start polling loop
2916
3349
  console.log('[daemon] Starting relay API polling...');
2917
3350
  pollRelayAPI();
@@ -2982,7 +3415,14 @@ const __test = {
2982
3415
  // Stopped sessions test helpers
2983
3416
  _getStoppedSessions: () => stoppedSessions,
2984
3417
  _addStoppedSession: (session_id) => stoppedSessions.add(session_id),
2985
- _clearStoppedSessions: () => stoppedSessions.clear()
3418
+ _clearStoppedSessions: () => stoppedSessions.clear(),
3419
+ // PID liveness test helpers (Bug 2)
3420
+ isPidAlive,
3421
+ getSessionPid,
3422
+ cleanupDeadSession,
3423
+ _getSessionPidCache: () => sessionPidCache,
3424
+ _getLastPidCheck: () => lastPidCheck,
3425
+ _getSessionActivity: () => sessionActivity,
2986
3426
  };
2987
3427
 
2988
3428
  // Test helper to register a session