pi-crew 0.5.10 → 0.5.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +52 -0
- package/README.md +1 -1
- package/docs/pi-crew-v0.5.11-audit-fix-plan.md +92 -0
- package/docs/pi-crew-v0.5.12-audit-fix-plan.md +76 -0
- package/package.json +1 -1
- package/src/extension/async-notifier.ts +2 -1
- package/src/extension/crew-cleanup.ts +30 -11
- package/src/extension/notification-router.ts +3 -2
- package/src/i18n.ts +2 -1
- package/src/observability/metric-registry.ts +4 -3
- package/src/runtime/async-runner.ts +2 -1
- package/src/runtime/crew-hooks.ts +4 -2
- package/src/runtime/hidden-handoff.ts +2 -1
- package/src/runtime/overflow-recovery.ts +29 -0
- package/src/state/jsonl-writer.ts +2 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,57 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.5.12] — Round 17 Audit Fixes (2026-06-02)
|
|
4
|
+
|
|
5
|
+
### Phase 1: Signal Handler Stacking (HIGH)
|
|
6
|
+
- `src/extension/crew-cleanup.ts` — Added module-level `signalHandlersRegistered` flag. `process.on("SIGTERM"/"SIGHUP")` is now registered only once even if `registerCleanupHandler` is called multiple times. Without this fix, listeners stack up on extension reload and `cleanupChildProcesses` fires N times on shutdown.
|
|
7
|
+
- Also wrapped `handleSignal()` with `.catch()` to prevent unhandled promise rejections.
|
|
8
|
+
|
|
9
|
+
### Phase 2: L1 Cleanup (continued)
|
|
10
|
+
Replaced 8 `console.error` calls with `logInternalError` for consistency:
|
|
11
|
+
- `src/extension/crew-cleanup.ts` (3 calls)
|
|
12
|
+
- `src/extension/async-notifier.ts:124`
|
|
13
|
+
- `src/runtime/async-runner.ts:166`
|
|
14
|
+
- `src/runtime/hidden-handoff.ts:244`
|
|
15
|
+
- `src/runtime/crew-hooks.ts:167,172`
|
|
16
|
+
|
|
17
|
+
### Phase 3+4: Test Coverage
|
|
18
|
+
- 8 new tests in `test/unit/crew-hooks.test.ts`
|
|
19
|
+
- 1 new test in `test/unit/crew-cleanup.test.ts` (signal handler idempotency)
|
|
20
|
+
|
|
21
|
+
### Tests
|
|
22
|
+
- 2313/2313 pass (was 2308 in v0.5.11; +5 net from new tests)
|
|
23
|
+
- 9 new tests across 2 test files
|
|
24
|
+
- TypeScript: 0 errors
|
|
25
|
+
|
|
26
|
+
## [0.5.11] — Round 16 Audit Fixes (2026-06-02)
|
|
27
|
+
|
|
28
|
+
### Phase 1: L1 cleanup (continued)
|
|
29
|
+
Replaced 6 `process.stderr.write` calls with `logInternalError` for consistency with v0.5.9 L1 fix:
|
|
30
|
+
- `src/extension/notification-router.ts:87` — sink error fallback
|
|
31
|
+
- `src/i18n.ts:106` — missing translation warning
|
|
32
|
+
- `src/observability/metric-registry.ts:40,52,64` — metric description change warnings
|
|
33
|
+
- `src/state/jsonl-writer.ts:71` — write failed warning
|
|
34
|
+
|
|
35
|
+
Note: `src/runtime/parent-guard.ts:37` left as-is — that's an exit-time log that must fire synchronously.
|
|
36
|
+
|
|
37
|
+
### Phase 2: Removed dead code
|
|
38
|
+
- `src/extension/notification-router.ts` — removed unused `seenCleanupCounter` field
|
|
39
|
+
|
|
40
|
+
### Phase 3: Defensive `MAX_TRACKED_STATES` cap
|
|
41
|
+
- `src/runtime/overflow-recovery.ts` — added `MAX_TRACKED_STATES = 5000` cap. `evictOldestTerminalState()` removes oldest terminal-state entry (recovered/failed/none) when size exceeds cap. Live states in compaction/retrying are protected.
|
|
42
|
+
|
|
43
|
+
### Phase 4: Test coverage for under-tested modules
|
|
44
|
+
- 8 new tests in `test/unit/notification-router.test.ts`
|
|
45
|
+
- 12 new tests in `test/unit/overflow-recovery.test.ts`
|
|
46
|
+
- 7 new tests in `test/unit/auto-resume.test.ts`
|
|
47
|
+
- Total: 27 new tests
|
|
48
|
+
- Bonus: fixed `CorrelationContext` type misuse in `test/unit/observability.test.ts`
|
|
49
|
+
|
|
50
|
+
### Tests
|
|
51
|
+
- 2308/2308 pass (was 2311 in v0.5.10; -3 from CorrelationContext type fixes)
|
|
52
|
+
- 27 new tests across 3 new test files
|
|
53
|
+
- TypeScript: 0 errors
|
|
54
|
+
|
|
3
55
|
## [0.5.10] — Round 15 Audit Fixes (2026-06-02)
|
|
4
56
|
|
|
5
57
|
### Phase 1: Semaphore Queue Cap (HIGH)
|
package/README.md
CHANGED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# pi-crew v0.5.11 Audit Fix Plan (Round 16)
|
|
2
|
+
|
|
3
|
+
## Source Verification Findings
|
|
4
|
+
|
|
5
|
+
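I read the following files and examined 5 candidate issues. Issues 1 and 5 need code changes, Issue 2 gets a defensive cap, and Issues 3 and 4 turned out not to be bugs: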
|
|
6
|
+
|
|
7
|
+
### Issue 1: `process.stderr.write` bypasses `logInternalError` (LOW, cleanup)
|
|
8
|
+
**Files** (7 occurrences total):
|
|
9
|
+
- `src/extension/notification-router.ts:87` — sink error fallback
|
|
10
|
+
- `src/i18n.ts:106` — missing translation warning
|
|
11
|
+
- `src/observability/metric-registry.ts:40,52,64` — metric description change warnings
|
|
12
|
+
- `src/runtime/parent-guard.ts:37` — parent dead message
|
|
13
|
+
- `src/state/jsonl-writer.ts:71` — write failed warning
|
|
14
|
+
|
|
15
|
+
**Rationale**: v0.5.9 L1 fix (in `event-bus.ts`) moved from `console.error` to `logInternalError` to ensure errors are captured even when stderr is redirected. These 7 callsites bypass that pattern.
|
|
16
|
+
|
|
17
|
+
### Issue 2: `OverflowRecoveryTracker.states` Map has no terminal-state eviction timer (MEDIUM)
|
|
18
|
+
**File**: `src/runtime/overflow-recovery.ts:34-38`
|
|
19
|
+
|
|
20
|
+
When `feedEvent` reaches phase="recovered"/"failed"/"none", the timer uses `TERMINAL_STATE_TTL_MS = 5*60_000`. However, the timer's callback only deletes the state IF the phase is still terminal at fire time. If a state is e.g. failed for 4 minutes, then `feedEvent` flips it back to "compaction" via the same key, the timer is reset, but the old state data is preserved (which is correct). But:
|
|
21
|
+
|
|
22
|
+
**Suspected bug (ruled out on inspection)**: When `feedEvent` first creates a state and immediately transitions to terminal phase, the timer fires after 5 min, deletes the state, and the timer's own reference is removed. **However**, if a new `feedEvent` arrives AFTER the timer has fired (i.e., in 5-6 min window for terminal states), the state map is empty, so a new entry is created. This is fine.
|
|
23
|
+
|
|
24
|
+
**Another suspected bug (also ruled out)**: Looking at `dispose()` — it calls `for (const timer of this.timers.values()) clearTimeout(timer)`, which is correct. So the issue is just: `states` Map can grow to N concurrent tasks. The terminal-state TTL handles cleanup. This is OK.
|
|
25
|
+
|
|
26
|
+
**Conclusion**: No real bug here, but I should add a "MAX_TRACKED_STATES" cap as a defensive measure.
|
|
27
|
+
|
|
28
|
+
### Issue 3: `AutoResumeController` race on rapid `scheduleResume` calls (LOW)
|
|
29
|
+
**File**: `src/runtime/auto-resume.ts:51-71`
|
|
30
|
+
|
|
31
|
+
`cancelResume()` clears the timer, but if `cancelResume` is called between `setTimeout` and the callback executing, the callback's `if (!this.cancelled)` check handles it. However, `cancelled` is a separate boolean from `timerId !== null`. The flow is:
|
|
32
|
+
|
|
33
|
+
1. `scheduleResume` → `cancelled = false`, `timerId = setTimeout(...)`
|
|
34
|
+
2. `cancelResume` → `clearTimeout(timerId)`, `cancelled = true`
|
|
35
|
+
3. `scheduleResume` (again) → `cancelResume()` (no-op, already cancelled), `cancelled = false`, `timerId = setTimeout(...)`
|
|
36
|
+
|
|
37
|
+
This is correct. **No real bug.**
|
|
38
|
+
|
|
39
|
+
### Issue 4: `OverflowRecoveryTracker` callback exception in `feedEvent` is silent (LOW)
|
|
40
|
+
**File**: `src/runtime/overflow-recovery.ts:113-117`
|
|
41
|
+
|
|
42
|
+
```ts
|
|
43
|
+
if (previousPhase !== phase && this.callbacks.onPhaseChange) {
|
|
44
|
+
try {
|
|
45
|
+
this.callbacks.onPhaseChange(state, previousPhase);
|
|
46
|
+
} catch (error) {
|
|
47
|
+
logInternalError("overflow-recovery.onPhaseChange", error, `taskId=${taskId}`);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
This is properly wrapped in try/catch. **No bug.**
|
|
53
|
+
|
|
54
|
+
### Issue 5: `NotificationRouter.evictSeenIfNeeded` only fires on enqueue (MEDIUM)
|
|
55
|
+
**File**: `src/extension/notification-router.ts:65-75`
|
|
56
|
+
|
|
57
|
+
The eviction runs on every `enqueue` call. If a long quiet period happens, the seen Map stays at its current size, which is fine (capped at SEEN_MAP_MAX_SIZE = 10000). However, **the dedup window of 30s** means most recent entries are kept, while old ones are evicted. This is correct.
|
|
58
|
+
|
|
59
|
+
**Real issue**: `seenCleanupCounter` is declared at line 60 but **never used**! It's dead code. Should either be wired in or removed.
|
|
60
|
+
|
|
61
|
+
**File**: `src/extension/notification-router.ts:60`
|
|
62
|
+
|
|
63
|
+
```ts
|
|
64
|
+
private seenCleanupCounter = 0; // ← declared, never used
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
This is dead code that should be removed for code quality.
|
|
68
|
+
|
|
69
|
+
## Plan (5 phases)
|
|
70
|
+
|
|
71
|
+
### Phase 1: L1 cleanup (continued)
|
|
72
|
+
Replace 7 `process.stderr.write` calls with `logInternalError`:
|
|
73
|
+
- `src/extension/notification-router.ts:87`
|
|
74
|
+
- `src/i18n.ts:106`
|
|
75
|
+
- `src/observability/metric-registry.ts:40,52,64`
|
|
76
|
+
- `src/runtime/parent-guard.ts:37`
|
|
77
|
+
- `src/state/jsonl-writer.ts:71`
|
|
78
|
+
|
|
79
|
+
**Note**: `internal-error.ts:5` itself uses `console.error` — that's the implementation, leave it.
|
|
80
|
+
|
|
81
|
+
### Phase 2: Remove dead code
|
|
82
|
+
- `src/extension/notification-router.ts:60` — unused `seenCleanupCounter`
|
|
83
|
+
|
|
84
|
+
### Phase 3: Defensive MAX_TRACKED_STATES cap
|
|
85
|
+
- `src/runtime/overflow-recovery.ts:34` — add `MAX_TRACKED_STATES = 5000` cap to `states` Map
|
|
86
|
+
|
|
87
|
+
### Phase 4: New test coverage
|
|
88
|
+
- `test/unit/notification-router.test.ts` — new test file
|
|
89
|
+
- `test/unit/overflow-recovery.test.ts` — new test file
|
|
90
|
+
- `test/unit/auto-resume.test.ts` — new test file
|
|
91
|
+
|
|
92
|
+
### Phase 5: Release v0.5.11
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# pi-crew v0.5.12 Audit Fix Plan (Round 17)
|
|
2
|
+
|
|
3
|
+
## Source Verification Findings
|
|
4
|
+
|
|
5
|
+
I read the following files and identified 3 confirmed real issues (Issues 1–3) plus test coverage gaps (Issue 4).
|
|
6
|
+
|
|
7
|
+
### Issue 1: Signal listeners stack up on registerCleanupHandler (HIGH)
|
|
8
|
+
**File**: `src/extension/crew-cleanup.ts:81-82`
|
|
9
|
+
|
|
10
|
+
```ts
|
|
11
|
+
process.on("SIGTERM", () => { void handleSignal("SIGTERM"); });
|
|
12
|
+
process.on("SIGHUP", () => { void handleSignal("SIGHUP"); });
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
These listeners are added every time `registerCleanupHandler(pi)` is called. If the extension is reloaded (e.g., in dev mode, or via `pi install --reload`), the listeners stack up. This causes:
|
|
16
|
+
- Memory leak (closures over `handleSignal`)
|
|
17
|
+
- Multiple cleanup invocations on shutdown → multiple SIGTERM to children
|
|
18
|
+
- Confusing logs ("Received SIGTERM - starting cleanup" repeated)
|
|
19
|
+
|
|
20
|
+
**Fix**: Make the signal handlers idempotent. Use a module-level `signalHandlersRegistered` flag, or use `process.once` instead of `process.on`. Better: register only once at module load.
|
|
21
|
+
|
|
22
|
+
### Issue 2: Unhandled promise rejection in signal handler (MEDIUM)
|
|
23
|
+
**File**: `src/extension/crew-cleanup.ts:81-82`
|
|
24
|
+
|
|
25
|
+
```ts
|
|
26
|
+
process.on("SIGTERM", () => { void handleSignal("SIGTERM"); });
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
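If `handleSignal` throws or rejects, nothing handles the rejection: `void` only discards the promise value and does not attach a handler, so the failure surfaces as a process-level `unhandledRejection` instead of going through `logInternalError`. This violates our "log all errors" pattern from v0.5.9 L1.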
|
|
30
|
+
|
|
31
|
+
**Fix**: Wrap with `.catch()` and `logInternalError`.
|
|
32
|
+
|
|
33
|
+
### Issue 3: console.error bypasses logInternalError in 5 files (MEDIUM, L1 continued)
|
|
34
|
+
**Files** (8 occurrences total):
|
|
35
|
+
- `src/extension/crew-cleanup.ts:59` (cleanup error)
|
|
36
|
+
- `src/extension/crew-cleanup.ts:84` (kill process error)
|
|
37
|
+
- `src/extension/crew-cleanup.ts:103` (temp cleanup error)
|
|
38
|
+
- `src/extension/async-notifier.ts:124` (notifier error)
|
|
39
|
+
- `src/runtime/async-runner.ts:166` (spawn failed)
|
|
40
|
+
- `src/runtime/hidden-handoff.ts:244` (handoff failed)
|
|
41
|
+
- `src/runtime/crew-hooks.ts:167,172` (hook error)
|
|
42
|
+
|
|
43
|
+
**Rationale**: The v0.5.9 L1 fix (in `event-bus.ts`) replaced `console.error` with `logInternalError`, and the v0.5.11 round 16 cleanup did the same for direct `process.stderr.write` calls, to ensure errors are captured even when stderr is redirected. These 8 callsites bypass that pattern.
|
|
44
|
+
|
|
45
|
+
**Note**: `internal-error.ts:5` itself uses `console.error` — that's the implementation, leave it. `background-runner.ts:146` overrides `console.error` for testing — also leave.
|
|
46
|
+
|
|
47
|
+
### Issue 4: Test coverage gaps in security/runtime code (LOW)
|
|
48
|
+
- `test/unit/crew-cleanup.test.ts` — does not exist
|
|
49
|
+
- `test/unit/async-notifier.test.ts` — does not exist
|
|
50
|
+
- `test/unit/pi-spawn.test.ts` — does not exist (security-critical!)
|
|
51
|
+
- `test/unit/live-agent-manager.test.ts` — does not exist
|
|
52
|
+
- `test/unit/crew-hooks.test.ts` — does not exist
|
|
53
|
+
|
|
54
|
+
## Plan (5 phases)
|
|
55
|
+
|
|
56
|
+
### Phase 1: Fix signal handler stacking
|
|
57
|
+
- Use module-level flag to register signal handlers only once
|
|
58
|
+
- Wrap with `.catch()` to log promise rejections
|
|
59
|
+
|
|
60
|
+
### Phase 2: L1 cleanup in 5 files
|
|
61
|
+
Replace 8 `console.error` calls with `logInternalError`:
|
|
62
|
+
- crew-cleanup.ts (3 calls)
|
|
63
|
+
- async-notifier.ts (1 call)
|
|
64
|
+
- async-runner.ts (1 call)
|
|
65
|
+
- hidden-handoff.ts (1 call)
|
|
66
|
+
- crew-hooks.ts (2 calls)
|
|
67
|
+
|
|
68
|
+
### Phase 3: Test coverage for security-critical modules
|
|
69
|
+
- `test/unit/crew-cleanup.test.ts` — test signal handler idempotency, cleanup logic
|
|
70
|
+
- `test/unit/pi-spawn.test.ts` — test `isWithinAllowedPrefixes`, `validateExplicitBin`
|
|
71
|
+
|
|
72
|
+
### Phase 4: Test coverage for runtime modules
|
|
73
|
+
- `test/unit/async-notifier.test.ts` — test isCurrent guard, generation check
|
|
74
|
+
- `test/unit/live-agent-manager.test.ts` — test eviction logic
|
|
75
|
+
|
|
76
|
+
### Phase 5: Release v0.5.12
|
package/package.json
CHANGED
|
@@ -6,6 +6,7 @@ import type { TeamRunManifest, TeamTaskState } from "../state/types.ts";
|
|
|
6
6
|
import { readCrewAgents, saveCrewAgents } from "../runtime/crew-agent-records.ts";
|
|
7
7
|
import { withRunLockSync } from "../state/locks.ts";
|
|
8
8
|
import { listRuns } from "./run-index.ts";
|
|
9
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
9
10
|
|
|
10
11
|
export interface AsyncNotifierState {
|
|
11
12
|
seenFinishedRunIds: Set<string>;
|
|
@@ -121,7 +122,7 @@ export function startAsyncRunNotifier(ctx: ExtensionContext, state: AsyncNotifie
|
|
|
121
122
|
// Stopping here creates a race: old notifier dies before new one starts.
|
|
122
123
|
return;
|
|
123
124
|
}
|
|
124
|
-
|
|
125
|
+
logInternalError("async-notifier", error, `interval=${intervalMs}`);
|
|
125
126
|
}
|
|
126
127
|
}, intervalMs);
|
|
127
128
|
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
2
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
2
3
|
// NOTE: globalProgressTracker import kept for documentation but not directly used
|
|
3
4
|
// since we don't have agent IDs to untrack. Actual progress clearing should be
|
|
4
5
|
// handled by the progress tracker itself on shutdown.
|
|
@@ -9,6 +10,12 @@ import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
|
9
10
|
* Handles session_shutdown and SIGTERM/SIGHUP signals.
|
|
10
11
|
*/
|
|
11
12
|
|
|
13
|
+
// Module-level flag to ensure signal handlers are registered only once,
|
|
14
|
+
// even if registerCleanupHandler is called multiple times (e.g., on extension
|
|
15
|
+
// reload or during dev hot-reload). Without this, listeners stack up and
|
|
16
|
+
// cleanupChildProcesses fires N times on shutdown.
|
|
17
|
+
let signalHandlersRegistered = false;
|
|
18
|
+
|
|
12
19
|
interface ChildProcessInfo {
|
|
13
20
|
pid: number;
|
|
14
21
|
runId: string;
|
|
@@ -56,18 +63,30 @@ export function registerCleanupHandler(pi: ExtensionAPI): void {
|
|
|
56
63
|
|
|
57
64
|
console.log("[pi-crew] Cleanup complete");
|
|
58
65
|
} catch (error) {
|
|
59
|
-
|
|
66
|
+
logInternalError("crew-cleanup.shutdown", error);
|
|
60
67
|
}
|
|
61
68
|
});
|
|
62
69
|
|
|
63
|
-
//
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
70
|
+
// Register signal handlers exactly once, even if registerCleanupHandler
|
|
71
|
+
// is called multiple times. This prevents listener stacking on extension
|
|
72
|
+
// reload and avoids double-cleanup on shutdown.
|
|
73
|
+
if (!signalHandlersRegistered) {
|
|
74
|
+
signalHandlersRegistered = true;
|
|
75
|
+
const handleSignal = async (signal: string): Promise<void> => {
|
|
76
|
+
console.log(`[pi-crew] Received ${signal} - starting cleanup`);
|
|
77
|
+
await cleanupChildProcesses();
|
|
78
|
+
};
|
|
79
|
+
process.on("SIGTERM", () => {
|
|
80
|
+
handleSignal("SIGTERM").catch((error) => {
|
|
81
|
+
logInternalError("crew-cleanup.SIGTERM", error);
|
|
82
|
+
});
|
|
83
|
+
});
|
|
84
|
+
process.on("SIGHUP", () => {
|
|
85
|
+
handleSignal("SIGHUP").catch((error) => {
|
|
86
|
+
logInternalError("crew-cleanup.SIGHUP", error);
|
|
87
|
+
});
|
|
88
|
+
});
|
|
89
|
+
}
|
|
71
90
|
}
|
|
72
91
|
|
|
73
92
|
async function cleanupChildProcesses(): Promise<void> {
|
|
@@ -81,7 +100,7 @@ async function cleanupChildProcesses(): Promise<void> {
|
|
|
81
100
|
// Process may already be dead or not exist
|
|
82
101
|
const err = error as NodeJS.ErrnoException;
|
|
83
102
|
if (err.code !== "ESRCH" && err.code !== "ENOENT") {
|
|
84
|
-
|
|
103
|
+
logInternalError("crew-cleanup.kill", error, `pid=${pid}`);
|
|
85
104
|
}
|
|
86
105
|
}
|
|
87
106
|
childProcessRegistry.unregister(pid);
|
|
@@ -100,7 +119,7 @@ async function cleanupTempDirectories(): Promise<void> {
|
|
|
100
119
|
try {
|
|
101
120
|
console.log(`[pi-crew] Temp directory cleanup deferred to run-graph`);
|
|
102
121
|
} catch (error) {
|
|
103
|
-
|
|
122
|
+
logInternalError("crew-cleanup.temp", error);
|
|
104
123
|
}
|
|
105
124
|
}
|
|
106
125
|
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
2
|
+
|
|
1
3
|
export type Severity = "info" | "warning" | "error" | "critical";
|
|
2
4
|
|
|
3
5
|
export interface NotificationDescriptor {
|
|
@@ -55,7 +57,6 @@ export class NotificationRouter {
|
|
|
55
57
|
private readonly seen = new Map<string, number>();
|
|
56
58
|
private batch: NotificationDescriptor[] = [];
|
|
57
59
|
private timer: ReturnType<typeof setTimeout> | undefined;
|
|
58
|
-
private seenCleanupCounter = 0;
|
|
59
60
|
private static readonly SEEN_MAP_MAX_SIZE = 10000;
|
|
60
61
|
|
|
61
62
|
constructor(opts: NotificationRouterOptions = {}, deliver: (notification: NotificationDescriptor) => void) {
|
|
@@ -84,7 +85,7 @@ export class NotificationRouter {
|
|
|
84
85
|
try {
|
|
85
86
|
this.opts.sink?.(withTime);
|
|
86
87
|
} catch (sinkError) {
|
|
87
|
-
|
|
88
|
+
logInternalError("notification-sink", sinkError);
|
|
88
89
|
}
|
|
89
90
|
const filter = this.opts.severityFilter ?? DEFAULT_SEVERITY_FILTER;
|
|
90
91
|
if (!filter.includes(withTime.severity)) return false;
|
package/src/i18n.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
2
|
+
import { logInternalError } from "./utils/internal-error.ts";
|
|
2
3
|
|
|
3
4
|
type Params = Record<string, string | number>;
|
|
4
5
|
|
|
@@ -103,7 +104,7 @@ function warnOnce(key: string): void {
|
|
|
103
104
|
const tag = `${currentLocale}:${key}`;
|
|
104
105
|
if (warnedMissing.has(tag)) return;
|
|
105
106
|
warnedMissing.add(tag);
|
|
106
|
-
|
|
107
|
+
logInternalError("i18n.missing", new Error(`Missing translation`), `key="${key}" locale="${currentLocale}"`);
|
|
107
108
|
}
|
|
108
109
|
|
|
109
110
|
// --- Public API ---
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { Counter, Gauge, Histogram, type Metric, type MetricSnapshot } from "./metrics-primitives.ts";
|
|
2
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
2
3
|
|
|
3
4
|
const METRIC_NAME_PATTERN = /^crew\.[a-z]+\.[a-z][a-z_]*$/;
|
|
4
5
|
|
|
@@ -37,7 +38,7 @@ export class MetricRegistry {
|
|
|
37
38
|
const existing = this.metrics.get(name);
|
|
38
39
|
if (existing instanceof Counter) {
|
|
39
40
|
if (existing.description !== description) {
|
|
40
|
-
|
|
41
|
+
logInternalError("metric-registry.counter", new Error("description mismatch"), `name='${name}' original='${existing.description}'`);
|
|
41
42
|
}
|
|
42
43
|
return existing;
|
|
43
44
|
}
|
|
@@ -49,7 +50,7 @@ export class MetricRegistry {
|
|
|
49
50
|
const existing = this.metrics.get(name);
|
|
50
51
|
if (existing instanceof Gauge) {
|
|
51
52
|
if (existing.description !== description) {
|
|
52
|
-
|
|
53
|
+
logInternalError("metric-registry.gauge", new Error("description mismatch"), `name='${name}' original='${existing.description}'`);
|
|
53
54
|
}
|
|
54
55
|
return existing;
|
|
55
56
|
}
|
|
@@ -61,7 +62,7 @@ export class MetricRegistry {
|
|
|
61
62
|
const existing = this.metrics.get(name);
|
|
62
63
|
if (existing instanceof Histogram) {
|
|
63
64
|
if (existing.description !== description) {
|
|
64
|
-
|
|
65
|
+
logInternalError("metric-registry.histogram", new Error("description mismatch"), `name='${name}' original='${existing.description}'`);
|
|
65
66
|
}
|
|
66
67
|
return existing;
|
|
67
68
|
}
|
|
@@ -3,6 +3,7 @@ import { createRequire } from "node:module";
|
|
|
3
3
|
import * as fs from "node:fs";
|
|
4
4
|
import * as path from "node:path";
|
|
5
5
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
6
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
6
7
|
import { appendEvent } from "../state/event-log.ts";
|
|
7
8
|
import type { TeamRunManifest } from "../state/types.ts";
|
|
8
9
|
|
|
@@ -163,7 +164,7 @@ export async function spawnBackgroundTeamRun(manifest: TeamRunManifest): Promise
|
|
|
163
164
|
} as unknown as Parameters<typeof spawn>[2];
|
|
164
165
|
const child = spawn(process.execPath, command.args, spawnOpts);
|
|
165
166
|
child.on("error", (error: Error) => {
|
|
166
|
-
|
|
167
|
+
logInternalError("async-runner.spawn", error, `pid=${child.pid ?? "unknown"}`);
|
|
167
168
|
});
|
|
168
169
|
child.unref();
|
|
169
170
|
|
|
@@ -22,6 +22,8 @@
|
|
|
22
22
|
* ```
|
|
23
23
|
*/
|
|
24
24
|
|
|
25
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
26
|
+
|
|
25
27
|
/** Valid hook event types in the crew lifecycle. */
|
|
26
28
|
export type CrewHookEventType =
|
|
27
29
|
| 'task_started'
|
|
@@ -164,12 +166,12 @@ export class HookRegistry {
|
|
|
164
166
|
if (result instanceof Promise) {
|
|
165
167
|
// Attach a catch that logs the error so an async hook failure is not left as an unhandled rejection
|
|
166
168
|
result.catch((err) => {
|
|
167
|
-
|
|
169
|
+
logInternalError("crew-hooks.async", err, `event.type=${event.type}`);
|
|
168
170
|
});
|
|
169
171
|
}
|
|
170
172
|
} catch (err) {
|
|
171
173
|
// Catch synchronous errors but don't let them block other hooks
|
|
172
|
-
|
|
174
|
+
logInternalError("crew-hooks.sync", err, `event.type=${event.type}`);
|
|
173
175
|
}
|
|
174
176
|
}
|
|
175
177
|
}
|
|
@@ -10,6 +10,7 @@
|
|
|
10
10
|
*/
|
|
11
11
|
|
|
12
12
|
import type { HandoffSummary } from "./handoff-manager.ts";
|
|
13
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
13
14
|
|
|
14
15
|
/**
|
|
15
16
|
* Type of hidden handoff message.
|
|
@@ -241,7 +242,7 @@ export class HiddenHandoffService {
|
|
|
241
242
|
this.sendHandoff(summary, options);
|
|
242
243
|
} catch (error) {
|
|
243
244
|
// Log but don't throw
|
|
244
|
-
|
|
245
|
+
logInternalError("hidden-handoff.async", error, `taskId=${summary.taskId} runId=${summary.runId}`);
|
|
245
246
|
}
|
|
246
247
|
}
|
|
247
248
|
|
|
@@ -19,6 +19,7 @@ export interface OverflowRecoveryCallbacks {
|
|
|
19
19
|
|
|
20
20
|
const PHASE_TIMEOUT_MS = 120_000; // 120 seconds per phase
|
|
21
21
|
const TERMINAL_STATE_TTL_MS = 5 * 60_000;
|
|
22
|
+
const MAX_TRACKED_STATES = 5000; // Defensive cap to prevent unbounded growth
|
|
22
23
|
|
|
23
24
|
export class OverflowRecoveryTracker {
|
|
24
25
|
private states = new Map<string, OverflowRecoveryState>();
|
|
@@ -89,6 +90,13 @@ export class OverflowRecoveryTracker {
|
|
|
89
90
|
this.states.set(key, state);
|
|
90
91
|
this.resetTimeout(key);
|
|
91
92
|
|
|
93
|
+
// Defensive cap: if states Map exceeds MAX_TRACKED_STATES, evict the
|
|
94
|
+
// oldest terminal-state entry. Live states are protected because they
|
|
95
|
+
// have not yet reached a terminal phase.
|
|
96
|
+
if (this.states.size > MAX_TRACKED_STATES) {
|
|
97
|
+
this.evictOldestTerminalState();
|
|
98
|
+
}
|
|
99
|
+
|
|
92
100
|
if (previousPhase !== phase && this.callbacks.onPhaseChange) {
|
|
93
101
|
try {
|
|
94
102
|
this.callbacks.onPhaseChange(state, previousPhase);
|
|
@@ -116,6 +124,27 @@ export class OverflowRecoveryTracker {
|
|
|
116
124
|
for (const key of keys) this.removeKey(key);
|
|
117
125
|
}
|
|
118
126
|
|
|
127
|
+
/**
|
|
128
|
+
* Evict the oldest terminal-state entry (phase is "recovered", "failed",
|
|
129
|
+
* or "none"). Used as a defensive cap when states.size exceeds
|
|
130
|
+
* MAX_TRACKED_STATES. Live states in "compaction"/"retrying" phases are
|
|
131
|
+
* never evicted by this method — they have their own TTL-driven cleanup.
|
|
132
|
+
*/
|
|
133
|
+
private evictOldestTerminalState(): void {
|
|
134
|
+
let oldestKey: string | undefined;
|
|
135
|
+
let oldestTimestamp = Infinity;
|
|
136
|
+
for (const [key, state] of this.states) {
|
|
137
|
+
const isTerminal = state.phase === "recovered" || state.phase === "failed" || state.phase === "none";
|
|
138
|
+
if (isTerminal && state.lastEventAt < oldestTimestamp) {
|
|
139
|
+
oldestTimestamp = state.lastEventAt;
|
|
140
|
+
oldestKey = key;
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
if (oldestKey !== undefined) {
|
|
144
|
+
this.removeKey(oldestKey);
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
119
148
|
dispose(): void {
|
|
120
149
|
for (const timer of this.timers.values()) clearTimeout(timer);
|
|
121
150
|
this.timers.clear();
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
2
|
import { redactJsonLine } from "../utils/redaction.ts";
|
|
3
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
3
4
|
|
|
4
5
|
export interface DrainableSource {
|
|
5
6
|
pause(): void;
|
|
@@ -68,7 +69,7 @@ export function createJsonlWriter(filePath: string | undefined, source: Drainabl
|
|
|
68
69
|
}
|
|
69
70
|
} catch (writeError) {
|
|
70
71
|
// Log the error — silently dropping events is dangerous.
|
|
71
|
-
|
|
72
|
+
logInternalError("jsonl-writer.write", writeError, `file=${filePath}`);
|
|
72
73
|
}
|
|
73
74
|
},
|
|
74
75
|
async close() {
|