@dmsdc-ai/aigentry-telepty 0.1.96 → 0.1.98
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +16 -8
- package/daemon.js +68 -25
- package/package.json +4 -4
- package/session-state.js +154 -51
- package/skills/telepty/SKILL.md +39 -118
- package/skills/telepty-allow/SKILL.md +78 -0
- package/skills/telepty-attach/SKILL.md +52 -0
- package/skills/telepty-broadcast/SKILL.md +63 -0
- package/skills/telepty-daemon/SKILL.md +94 -0
- package/skills/telepty-inject/SKILL.md +93 -0
- package/skills/telepty-list/SKILL.md +81 -0
- package/skills/telepty-listen/SKILL.md +86 -0
- package/skills/telepty-rename/SKILL.md +63 -0
- package/skills/telepty-session/SKILL.md +83 -0
- package/specs/codex-inject-spec.md +201 -0
- package/src/mailbox/config.js +4 -0
- package/src/mailbox/delivery.js +93 -32
- package/src/mailbox/index.js +11 -0
- package/src/mailbox/storage.js +84 -5
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: telepty-listen
|
|
3
|
+
description: Monitor telepty events and read session screen output. Covers listen (event bus) and read-screen commands.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# telepty-listen — Event Monitoring and Screen Reading
|
|
7
|
+
|
|
8
|
+
## listen — Subscribe to event bus
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
telepty listen
|
|
12
|
+
telepty listen --json
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Natural-language: "이벤트 보여줘", "listen to the bus", "monitor session events"
|
|
16
|
+
|
|
17
|
+
Connects to the daemon's WebSocket event bus and streams all events in real-time.
|
|
18
|
+
|
|
19
|
+
### Event types
|
|
20
|
+
|
|
21
|
+
| Event | Description |
|
|
22
|
+
|-------|-------------|
|
|
23
|
+
| `session_health` | Periodic health status for all sessions |
|
|
24
|
+
| `inject_written` | Message delivered to a session |
|
|
25
|
+
| `inject_failed` | Delivery failure with error code |
|
|
26
|
+
| `session_register` | New session registered |
|
|
27
|
+
| `session_rename` | Session ID changed |
|
|
28
|
+
| `session_stale` | Session disconnected beyond stale threshold |
|
|
29
|
+
| `session_cleanup` | Stale session auto-removed |
|
|
30
|
+
| `submit` | Enter keystroke sent |
|
|
31
|
+
| `mailbox_delivered` | Mailbox message successfully delivered |
|
|
32
|
+
| `mailbox_delivery_failed` | Mailbox delivery failed, will retry |
|
|
33
|
+
|
|
34
|
+
### Examples
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
# Human-readable event stream
|
|
38
|
+
telepty listen
|
|
39
|
+
|
|
40
|
+
# JSON format for scripting
|
|
41
|
+
telepty listen --json
|
|
42
|
+
|
|
43
|
+
# Filter specific events with jq
|
|
44
|
+
telepty listen --json | jq 'select(.type == "inject_written")'
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## read-screen — Read session screen buffer
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
telepty read-screen <session_id> [--lines N] [--raw]
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Natural-language: "세션 화면 읽어줘", "what's on the analyst's screen", "read screen output"
|
|
54
|
+
|
|
55
|
+
Reads the last N lines of a session's PTY output buffer (default: 50 lines).
|
|
56
|
+
|
|
57
|
+
### Options
|
|
58
|
+
|
|
59
|
+
| Flag | Description |
|
|
60
|
+
|------|-------------|
|
|
61
|
+
| `--lines N` | Number of lines to read (default: 50) |
|
|
62
|
+
| `--raw` | Return raw output with ANSI escape sequences |
|
|
63
|
+
|
|
64
|
+
### Examples
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Read last 50 lines (cleaned)
|
|
68
|
+
telepty read-screen my-claude
|
|
69
|
+
|
|
70
|
+
# Read last 100 lines
|
|
71
|
+
telepty read-screen my-claude --lines 100
|
|
72
|
+
|
|
73
|
+
# Raw output with escape sequences
|
|
74
|
+
telepty read-screen my-claude --raw
|
|
75
|
+
|
|
76
|
+
# Use in scripts
|
|
77
|
+
SCREEN=$(telepty read-screen my-claude --lines 10)
|
|
78
|
+
echo "$SCREEN" | grep "error"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Common Errors
|
|
82
|
+
|
|
83
|
+
| Error | Cause | Fix |
|
|
84
|
+
|-------|-------|-----|
|
|
85
|
+
| `Session not found` | Session doesn't exist | Check `telepty list` |
|
|
86
|
+
| `(empty screen)` | No output captured yet | Wait for session to produce output |
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: telepty-rename
|
|
3
|
+
description: Rename, delete, and clean up telepty sessions. Session lifecycle management.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# telepty-rename — Session Lifecycle Management
|
|
7
|
+
|
|
8
|
+
## rename — Change a session's ID
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
telepty rename <old_id> <new_id>
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Natural-language: "세션 이름 바꿔줘", "rename the session"
|
|
15
|
+
|
|
16
|
+
Renames a session while preserving all state, connections, and attached clients. Publishes a `session_rename` event on the bus.
|
|
17
|
+
|
|
18
|
+
### Examples
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
telepty rename temp-session analyst-claude
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## delete — Remove a session
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
telepty delete <session_id>
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Natural-language: "세션 삭제해줘", "kill that session", "remove the dead session"
|
|
31
|
+
|
|
32
|
+
Forcefully closes the session's PTY process, disconnects all clients, and removes it from the daemon registry.
|
|
33
|
+
|
|
34
|
+
### Examples
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
telepty delete stale-session
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## clean — Remove ghost sessions
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
telepty clean
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Natural-language: "고스트 세션 정리해줘", "clean up stale sessions"
|
|
47
|
+
|
|
48
|
+
Scans all sessions and removes those with `STALE` or `DISCONNECTED` health status. Safe to run periodically.
|
|
49
|
+
|
|
50
|
+
### Example output
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
🗑 Removed ghost: old-brain-claude (STALE)
|
|
54
|
+
🗑 Removed ghost: temp-test (DISCONNECTED)
|
|
55
|
+
✅ Cleaned 2 ghost session(s).
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Common Errors
|
|
59
|
+
|
|
60
|
+
| Error | Cause | Fix |
|
|
61
|
+
|-------|-------|-----|
|
|
62
|
+
| `Session not found` | Session doesn't exist or already removed | Check `telepty list` |
|
|
63
|
+
| `Session ID already active` | New name conflicts with existing session | Choose a different name |
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: telepty-session
|
|
3
|
+
description: Multi-session orchestration — start multiple sessions at once, arrange terminal layouts, inspect session metadata, and start deliberations. Covers session start, layout, session info, and deliberate commands.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# telepty-session — Multi-Session Orchestration
|
|
7
|
+
|
|
8
|
+
## session start — Launch multiple sessions
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
telepty session start [--launch]
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Natural-language: "세션 여러 개 시작해줘", "start all sessions", "launch the ecosystem"
|
|
15
|
+
|
|
16
|
+
Starts pre-configured sessions (from aigentry ecosystem or custom config). With `--launch`, opens each session in a new terminal tab/window.
|
|
17
|
+
|
|
18
|
+
### Options
|
|
19
|
+
|
|
20
|
+
| Flag | Description |
|
|
21
|
+
|------|-------------|
|
|
22
|
+
| `--launch` | Open each session in a new kitty/ghostty tab |
|
|
23
|
+
|
|
24
|
+
### Examples
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Start sessions interactively
|
|
28
|
+
telepty session start
|
|
29
|
+
|
|
30
|
+
# Start and launch in terminal tabs
|
|
31
|
+
telepty session start --launch
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## layout — Arrange terminal windows in a grid
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
telepty layout [columns]
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Natural-language: "터미널 배치해줘", "arrange the windows", "layout the sessions"
|
|
41
|
+
|
|
42
|
+
Arranges all terminal windows in a grid layout on the screen. Defaults to auto-calculated columns based on session count.
|
|
43
|
+
|
|
44
|
+
### Examples
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# Auto-layout
|
|
48
|
+
telepty layout
|
|
49
|
+
|
|
50
|
+
# Force 3-column grid
|
|
51
|
+
telepty layout 3
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## session info — Detailed session metadata
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
telepty session info <session_id>
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Shows comprehensive session details including:
|
|
61
|
+
- Session type, command, CWD
|
|
62
|
+
- Terminal detection (ghostty, kitty, aterm)
|
|
63
|
+
- Health status and reason
|
|
64
|
+
- Transport block (delivery endpoint, backend)
|
|
65
|
+
- Semantic state (phase, current task, blocker)
|
|
66
|
+
- Mailbox stats (pending, dead-letter count)
|
|
67
|
+
|
|
68
|
+
## deliberate — Start multi-session deliberation
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
telepty deliberate "<topic>"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Natural-language: "토론 시작해줘", "start a deliberation"
|
|
75
|
+
|
|
76
|
+
Initiates a structured multi-session deliberation thread on the given topic.
|
|
77
|
+
|
|
78
|
+
## Common Errors
|
|
79
|
+
|
|
80
|
+
| Error | Cause | Fix |
|
|
81
|
+
|-------|-------|-----|
|
|
82
|
+
| `No sessions configured` | No aigentry session config found | Configure sessions first |
|
|
83
|
+
| `Terminal not supported` | Layout requires kitty or ghostty | Use a supported terminal |
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# SPEC: Codex inject reliability — 4 issues
|
|
2
|
+
|
|
3
|
+
**Bug source:** orchestrator inject e9f41301...
|
|
4
|
+
**Session:** aigentry-telepty
|
|
5
|
+
**Status:** SPEC — awaiting orchestrator approval
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Goal
|
|
10
|
+
|
|
11
|
+
Make `telepty inject` work reliably with codex sessions. Currently 4 failure
|
|
12
|
+
modes: Enter not pressed, active work overwrite, REPORT not sent, multi-task
|
|
13
|
+
partial processing.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Root Cause Analysis
|
|
18
|
+
|
|
19
|
+
### Issue 1: inject succeeds but Enter NOT pressed
|
|
20
|
+
|
|
21
|
+
**Flow:** daemon `deliverInjectionToSession()` → mailbox → `tick()` →
|
|
22
|
+
`writeDataToSession()` sends text via WS → allow-bridge → `child.write(text)`.
|
|
23
|
+
Then 500ms later, `writeDataToSession(id, session, '\r')` → WS → allow-bridge →
|
|
24
|
+
`child.write('\r')`.
|
|
25
|
+
|
|
26
|
+
**Root cause:** codex CLI puts terminal in raw mode with custom input handling.
|
|
27
|
+
PTY-level `\r` via `child.write('\r')` is NOT equivalent to pressing Enter in
|
|
28
|
+
codex's input model. codex reads PTY input character by character in raw mode
|
|
29
|
+
and interprets `\r` differently than a keyboard Enter event.
|
|
30
|
+
|
|
31
|
+
**Evidence:** Project memory: "PTY `\r` 직접 의존 금지" — don't depend on PTY
|
|
32
|
+
`\r` directly. "inject submit은 항상 osascript/kitty terminal-level submit 우선" — inject submit should always prefer osascript/kitty terminal-level submit.
|
|
33
|
+
|
|
34
|
+
The `--submit` flag exists in CLI but POST /submit also uses `submitViaPty()` →
|
|
35
|
+
same `\r` via WS. It does NOT use terminal-level submit (kitty/cmux).
|
|
36
|
+
|
|
37
|
+
### Issue 2: New inject overwrites active work
|
|
38
|
+
|
|
39
|
+
**Flow:** `deliverInjectionToSession()` enqueues to mailbox and calls
|
|
40
|
+
`mailboxDelivery.tick()` immediately. Text goes via WS → allow-bridge.
|
|
41
|
+
|
|
42
|
+
Allow-bridge has queuing: if `isIdle()` is false, text goes to
|
|
43
|
+
`enqueueBridgeMessage()`. The safety timer flushes after 5s regardless. But the
|
|
44
|
+
daemon doesn't check session state — it pushes immediately.
|
|
45
|
+
|
|
46
|
+
**Root cause:** Two layers of the problem:
|
|
47
|
+
1. Daemon sends inject regardless of session state (working/thinking/idle)
|
|
48
|
+
2. Allow-bridge 5s safety flush writes queued text to PTY even if session is
|
|
49
|
+
still working, which interrupts codex's current task
|
|
50
|
+
|
|
51
|
+
### Issue 3: REPORT not sent after completion
|
|
52
|
+
|
|
53
|
+
**Flow:** Auto-report mechanism (`pendingReports`) triggers when allow-bridge
|
|
54
|
+
sends `{ type: 'ready' }` WS message. The `ready` signal fires when
|
|
55
|
+
`promptPattern.test(data)` matches in the PTY output.
|
|
56
|
+
|
|
57
|
+
**Root cause:** codex prompt pattern `codex: /[❯>]\s*$/` doesn't reliably match
|
|
58
|
+
codex's actual prompt output. If prompt is never detected → `ready` never sent →
|
|
59
|
+
`pendingReports` never cleared → auto-report never fires.
|
|
60
|
+
|
|
61
|
+
The new session state machine (#185) detects `idle` via OSC 133 + silence
|
|
62
|
+
timeout, but auto-report still uses the legacy `ready` WS signal (daemon.js
|
|
63
|
+
lines 2290-2315), not the `session_auto_state` transitions.
|
|
64
|
+
|
|
65
|
+
### Issue 4: Multiple tasks in one inject — partial processing
|
|
66
|
+
|
|
67
|
+
**Root cause:** AI behavior, not telepty bug. When a --ref file contains Task A
|
|
68
|
+
and Task B, codex processes Task A and returns to prompt. This is standard LLM
|
|
69
|
+
behavior — no telepty fix needed.
|
|
70
|
+
|
|
71
|
+
**Mitigation:** Orchestrator should split multi-task injects into separate
|
|
72
|
+
sequential calls with idle-gating between them (orchestrator-side logic).
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## Scope
|
|
77
|
+
|
|
78
|
+
**Phase 1 (this spec):** Fix Issues 1 and 3 (guaranteed Enter + guaranteed
|
|
79
|
+
REPORT). These are telepty-side fixes.
|
|
80
|
+
|
|
81
|
+
**Phase 2 (separate task):** Fix Issue 2 (inject queuing during active work).
|
|
82
|
+
Requires daemon-side session state awareness.
|
|
83
|
+
|
|
84
|
+
**Out of scope:** Issue 4 (orchestrator-level task splitting).
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Files to Modify
|
|
89
|
+
|
|
90
|
+
| File | Change |
|
|
91
|
+
|---|---|
|
|
92
|
+
| `daemon.js` | Fix 1: `deliverInjectionToSession()` — use `sendViaKitty()` for CR instead of PTY `\r`. Fix 3: Wire auto-report to session state `idle` transition instead of legacy `ready` signal. |
|
|
93
|
+
| `daemon.js` | Fix 1: POST `/submit` endpoint — use kitty send-text with cmux fallback instead of `submitViaPty()`. |
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Approach
|
|
98
|
+
|
|
99
|
+
### Fix 1: Terminal-level submit for wrapped sessions
|
|
100
|
+
|
|
101
|
+
Replace PTY `\r` with `sendViaKitty()` in `deliverInjectionToSession()`:
|
|
102
|
+
|
|
103
|
+
```js
|
|
104
|
+
// BEFORE (daemon.js ~line 590):
|
|
105
|
+
if (!options.noEnter && session.type !== 'aterm') {
|
|
106
|
+
const submitDelay = session.type === 'wrapped' ? 500 : 300;
|
|
107
|
+
setTimeout(async () => {
|
|
108
|
+
const submitResult = await writeDataToSession(id, session, '\r');
|
|
109
|
+
// ...
|
|
110
|
+
}, submitDelay);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
// AFTER:
|
|
114
|
+
if (!options.noEnter && session.type !== 'aterm') {
|
|
115
|
+
const submitDelay = session.type === 'wrapped' ? 500 : 300;
|
|
116
|
+
setTimeout(async () => {
|
|
117
|
+
let submitted = false;
|
|
118
|
+
// Priority 1: kitty send-text (terminal-level, bypasses PTY quirks)
|
|
119
|
+
if (session.type === 'wrapped') {
|
|
120
|
+
submitted = sendViaKitty(id, '\r');
|
|
121
|
+
}
|
|
122
|
+
// Priority 2: cmux send-key (for cmux-managed sessions)
|
|
123
|
+
if (!submitted && session.backend === 'cmux' && session.cmuxWorkspaceId) {
|
|
124
|
+
submitted = submitViaCmux(id);
|
|
125
|
+
}
|
|
126
|
+
// Priority 3: PTY fallback (spawned sessions without kitty)
|
|
127
|
+
if (!submitted) {
|
|
128
|
+
const submitResult = await writeDataToSession(id, session, '\r');
|
|
129
|
+
if (!submitResult.success) {
|
|
130
|
+
emitInjectFailureEvent(id, submitResult.code, submitResult.error, {
|
|
131
|
+
phase: 'submit', source: options.source || 'inject'
|
|
132
|
+
}, session);
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
}, submitDelay);
|
|
136
|
+
}
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Also update POST `/submit` endpoint to use same priority chain instead of
|
|
140
|
+
always calling `submitViaPty()`.
|
|
141
|
+
|
|
142
|
+
### Fix 3: Auto-report via session state machine
|
|
143
|
+
|
|
144
|
+
Wire auto-report to the `session_auto_state` transition event (already emitted
|
|
145
|
+
by `sessionStateManager.onTransition()`). When a session transitions to `idle`
|
|
146
|
+
and has a pending report, fire the auto-report.
|
|
147
|
+
|
|
148
|
+
```js
|
|
149
|
+
// In the existing sessionStateManager.onTransition callback (daemon.js ~line 37):
|
|
150
|
+
sessionStateManager.onTransition((sessionId, from, to, detail) => {
|
|
151
|
+
const session = sessions[sessionId];
|
|
152
|
+
if (!session) return;
|
|
153
|
+
broadcastSessionEvent('session_auto_state', sessionId, session, {
|
|
154
|
+
extra: { auto_state: to, auto_state_from: from, auto_detail: detail }
|
|
155
|
+
});
|
|
156
|
+
|
|
157
|
+
// Auto-report: fire when session transitions to idle after inject
|
|
158
|
+
if (to === 'idle' && pendingReports[sessionId]) {
|
|
159
|
+
const pendingReport = pendingReports[sessionId];
|
|
160
|
+
delete pendingReports[sessionId];
|
|
161
|
+
const elapsed = ((Date.now() - new Date(pendingReport.injectedAt).getTime()) / 1000).toFixed(1);
|
|
162
|
+
const reportMsg = `TASK_COMPLETE: ${sessionId} is now idle after processing inject (${elapsed}s)`;
|
|
163
|
+
const srcId = resolveSessionAlias(pendingReport.source) || pendingReport.source;
|
|
164
|
+
const srcSession = sessions[srcId];
|
|
165
|
+
if (srcSession) {
|
|
166
|
+
deliverInjectionToSession(srcId, srcSession, reportMsg, { noEnter: false, source: 'auto_report' });
|
|
167
|
+
console.log(`[AUTO-REPORT] ${sessionId} → ${srcId}: idle after ${elapsed}s`);
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
});
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Keep the legacy `ready`-based auto-report as fallback (don't remove it).
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Verification
|
|
178
|
+
|
|
179
|
+
1. **Test:** `telepty inject xtem-rtm "echo hello"` → codex processes it
|
|
180
|
+
(Enter pressed via kitty send-text)
|
|
181
|
+
2. **Test:** `telepty inject --ref --from orchestrator xtem-rtm 'task'` → after
|
|
182
|
+
codex completes → auto-report fires via idle state transition
|
|
183
|
+
3. **Test:** Sessions without kitty (spawned) → PTY `\r` fallback still works
|
|
184
|
+
4. **Test:** Existing 131 tests still pass
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## Risks
|
|
189
|
+
|
|
190
|
+
1. **kitty not available.** Mitigated: 3-tier fallback (kitty → cmux → PTY).
|
|
191
|
+
PTY path preserved as last resort.
|
|
192
|
+
2. **`sendViaKitty()` needs kitty socket + window ID match.** Already
|
|
193
|
+
implemented and working for other features. If kitty window not found,
|
|
194
|
+
falls through to cmux (when the session is cmux-managed) and then to PTY.
|
|
195
|
+
3. **Auto-report via state machine may fire too early.** The idle detection
|
|
196
|
+
uses 5s silence timeout. If codex pauses >5s mid-task, it may fire
|
|
197
|
+
prematurely. Mitigated: auto-report has `AUTO_REPORT_IDLE_SECONDS` (10s)
|
|
198
|
+
threshold. Can add a minimum elapsed time guard.
|
|
199
|
+
4. **Dual auto-report paths (state machine + legacy ready).** Could fire
|
|
200
|
+
twice. Mitigated: `delete pendingReports[sessionId]` in both paths —
|
|
201
|
+
whichever fires first consumes the pending report.
|
package/src/mailbox/config.js
CHANGED
|
@@ -27,6 +27,10 @@ const DEFAULTS = {
|
|
|
27
27
|
deliveryPollMs: 200,
|
|
28
28
|
/** Notification coalesce window in ms. */
|
|
29
29
|
notifyCoalesceMs: 25,
|
|
30
|
+
/** Lock age threshold in seconds — break locks older than this (handles PID recycling). */
|
|
31
|
+
staleLockAgeSecs: 60,
|
|
32
|
+
/** Force-break lock after this many consecutive lock timeout failures per session. */
|
|
33
|
+
lockBreakAfterFailures: 3,
|
|
30
34
|
};
|
|
31
35
|
|
|
32
36
|
function createConfig(overrides = {}) {
|
package/src/mailbox/delivery.js
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const path = require('path');
|
|
5
|
+
|
|
3
6
|
/**
|
|
4
7
|
* DeliveryEngine — polls mailbox for pending messages and delivers them.
|
|
5
8
|
*
|
|
@@ -9,6 +12,7 @@
|
|
|
9
12
|
* Also handles:
|
|
10
13
|
* - In-flight timeout recovery (auto-nack stuck messages)
|
|
11
14
|
* - TTL expiry (expire stale pending messages)
|
|
15
|
+
* - Stale lock detection: consecutive failure tracking + exponential backoff
|
|
12
16
|
*/
|
|
13
17
|
class DeliveryEngine {
|
|
14
18
|
/**
|
|
@@ -28,6 +32,10 @@ class DeliveryEngine {
|
|
|
28
32
|
this._timer = null;
|
|
29
33
|
this._running = false;
|
|
30
34
|
this._tickInProgress = false;
|
|
35
|
+
// Fix 4: Per-session consecutive lock failure count
|
|
36
|
+
this._lockFailures = new Map(); // sessionId → count
|
|
37
|
+
// Fix 5: Per-session skip-until timestamp for backoff
|
|
38
|
+
this._skipUntil = new Map(); // sessionId → timestamp
|
|
31
39
|
}
|
|
32
40
|
|
|
33
41
|
/**
|
|
@@ -66,8 +74,15 @@ class DeliveryEngine {
|
|
|
66
74
|
|
|
67
75
|
try {
|
|
68
76
|
const sessionIds = this.sessionResolver();
|
|
77
|
+
const lockBreakThreshold = this.mailbox.config.lockBreakAfterFailures || 3;
|
|
69
78
|
|
|
70
79
|
for (const sessionId of sessionIds) {
|
|
80
|
+
// Fix 5: Skip sessions in backoff
|
|
81
|
+
const skipUntil = this._skipUntil.get(sessionId) || 0;
|
|
82
|
+
if (Date.now() < skipUntil) continue;
|
|
83
|
+
|
|
84
|
+
let lockFailed = false;
|
|
85
|
+
|
|
71
86
|
// 1. Recover in-flight timeouts
|
|
72
87
|
try {
|
|
73
88
|
const recovered = this.mailbox.recoverInflight(sessionId);
|
|
@@ -75,52 +90,98 @@ class DeliveryEngine {
|
|
|
75
90
|
console.log(`[MAILBOX] Recovered ${recovered} in-flight message(s) for ${sessionId}`);
|
|
76
91
|
}
|
|
77
92
|
} catch (err) {
|
|
78
|
-
|
|
93
|
+
if (err.message.includes('lock timeout')) {
|
|
94
|
+
lockFailed = true;
|
|
95
|
+
} else {
|
|
96
|
+
console.error(`[MAILBOX] recoverInflight error for ${sessionId}: ${err.message}`);
|
|
97
|
+
}
|
|
79
98
|
}
|
|
80
99
|
|
|
81
100
|
// 2. Expire stale messages
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
101
|
+
if (!lockFailed) {
|
|
102
|
+
try {
|
|
103
|
+
const expired = this.mailbox.expireStale(sessionId);
|
|
104
|
+
if (expired > 0) {
|
|
105
|
+
console.log(`[MAILBOX] Expired ${expired} stale message(s) for ${sessionId}`);
|
|
106
|
+
}
|
|
107
|
+
} catch (err) {
|
|
108
|
+
if (err.message.includes('lock timeout')) {
|
|
109
|
+
lockFailed = true;
|
|
110
|
+
} else {
|
|
111
|
+
console.error(`[MAILBOX] expireStale error for ${sessionId}: ${err.message}`);
|
|
112
|
+
}
|
|
86
113
|
}
|
|
87
|
-
} catch (err) {
|
|
88
|
-
console.error(`[MAILBOX] expireStale error for ${sessionId}: ${err.message}`);
|
|
89
114
|
}
|
|
90
115
|
|
|
91
116
|
// 3. Dequeue and deliver
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
117
|
+
if (!lockFailed) {
|
|
118
|
+
try {
|
|
119
|
+
const msg = this.mailbox.dequeue(sessionId);
|
|
120
|
+
if (!msg) {
|
|
121
|
+
// Success path (no message but lock acquired OK)
|
|
122
|
+
this._lockFailures.delete(sessionId);
|
|
123
|
+
this._skipUntil.delete(sessionId);
|
|
124
|
+
continue;
|
|
125
|
+
}
|
|
95
126
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
127
|
+
if (!this.deliverFn) {
|
|
128
|
+
// No delivery function — auto-ack (testing mode)
|
|
129
|
+
this.mailbox.ack(sessionId, msg.msg_id);
|
|
130
|
+
this._lockFailures.delete(sessionId);
|
|
131
|
+
this._skipUntil.delete(sessionId);
|
|
132
|
+
continue;
|
|
133
|
+
}
|
|
101
134
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
135
|
+
let result;
|
|
136
|
+
try {
|
|
137
|
+
result = await this.deliverFn(sessionId, msg);
|
|
138
|
+
} catch (err) {
|
|
139
|
+
result = { success: false, error: err.message };
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
if (result && result.success) {
|
|
143
|
+
this.mailbox.ack(sessionId, msg.msg_id);
|
|
144
|
+
if (this.onDelivery) {
|
|
145
|
+
this.onDelivery(sessionId, msg.msg_id, { success: true });
|
|
146
|
+
}
|
|
147
|
+
} else {
|
|
148
|
+
const reason = (result && result.error) || 'delivery failed';
|
|
149
|
+
this.mailbox.nack(sessionId, msg.msg_id, reason);
|
|
150
|
+
console.log(`[MAILBOX] Delivery failed for ${sessionId}/${msg.msg_id}: ${reason} (attempt ${msg.attempt})`);
|
|
151
|
+
if (this.onDelivery) {
|
|
152
|
+
this.onDelivery(sessionId, msg.msg_id, { success: false, error: reason });
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
// Success path (lock was acquired)
|
|
157
|
+
this._lockFailures.delete(sessionId);
|
|
158
|
+
this._skipUntil.delete(sessionId);
|
|
105
159
|
} catch (err) {
|
|
106
|
-
|
|
160
|
+
if (err.message.includes('lock timeout')) {
|
|
161
|
+
lockFailed = true;
|
|
162
|
+
} else {
|
|
163
|
+
console.error(`[MAILBOX] Delivery loop error for ${sessionId}: ${err.message}`);
|
|
164
|
+
}
|
|
107
165
|
}
|
|
166
|
+
}
|
|
108
167
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
168
|
+
// Fix 4 & 5: Handle lock failure — track, force-break, backoff
|
|
169
|
+
if (lockFailed) {
|
|
170
|
+
const failCount = (this._lockFailures.get(sessionId) || 0) + 1;
|
|
171
|
+
this._lockFailures.set(sessionId, failCount);
|
|
172
|
+
|
|
173
|
+
// Fix 4: Force-break after N consecutive failures
|
|
174
|
+
if (failCount >= lockBreakThreshold) {
|
|
175
|
+
const lockPath = path.join(this.mailbox._sessionDir(sessionId), '.lock');
|
|
176
|
+
try { fs.unlinkSync(lockPath); } catch {}
|
|
177
|
+
console.warn(`[MAILBOX] Force-broke stale lock for ${sessionId} after ${failCount} consecutive failures`);
|
|
178
|
+
this._lockFailures.delete(sessionId);
|
|
179
|
+
this._skipUntil.delete(sessionId);
|
|
114
180
|
} else {
|
|
115
|
-
|
|
116
|
-
this.
|
|
117
|
-
|
|
118
|
-
if (this.onDelivery) {
|
|
119
|
-
this.onDelivery(sessionId, msg.msg_id, { success: false, error: reason });
|
|
120
|
-
}
|
|
181
|
+
// Fix 5: Exponential backoff — skip this session for increasing intervals
|
|
182
|
+
const backoffMs = Math.min(this.pollMs * (1 << failCount), 30000);
|
|
183
|
+
this._skipUntil.set(sessionId, Date.now() + backoffMs);
|
|
121
184
|
}
|
|
122
|
-
} catch (err) {
|
|
123
|
-
console.error(`[MAILBOX] Delivery loop error for ${sessionId}: ${err.message}`);
|
|
124
185
|
}
|
|
125
186
|
}
|
|
126
187
|
} finally {
|
package/src/mailbox/index.js
CHANGED
|
@@ -4,6 +4,7 @@ const path = require('path');
|
|
|
4
4
|
const fs = require('fs');
|
|
5
5
|
const {
|
|
6
6
|
acquireLock,
|
|
7
|
+
breakStaleLocks,
|
|
7
8
|
ensureSessionDir,
|
|
8
9
|
loadStates,
|
|
9
10
|
appendState,
|
|
@@ -278,6 +279,16 @@ class FileMailbox {
|
|
|
278
279
|
try { fs.writeFileSync(p, ''); } catch {}
|
|
279
280
|
}
|
|
280
281
|
|
|
282
|
+
/**
|
|
283
|
+
* Break stale lock files across all session directories.
|
|
284
|
+
* Call at daemon startup before DeliveryEngine.start().
|
|
285
|
+
* Returns count of broken locks.
|
|
286
|
+
*/
|
|
287
|
+
breakStaleLocks() {
|
|
288
|
+
const staleLockAgeMs = (this.config.staleLockAgeSecs || 60) * 1000;
|
|
289
|
+
return breakStaleLocks(this.config.root, { staleLockAgeMs });
|
|
290
|
+
}
|
|
291
|
+
|
|
281
292
|
/**
|
|
282
293
|
* List all session IDs that have a mailbox directory.
|
|
283
294
|
*/
|