pi-crew 0.5.9 → 0.5.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +62 -0
- package/README.md +1 -1
- package/docs/pi-crew-v0.5.10-audit-fix-plan.md +60 -0
- package/docs/pi-crew-v0.5.11-audit-fix-plan.md +92 -0
- package/package.json +1 -1
- package/src/extension/notification-router.ts +3 -2
- package/src/i18n.ts +2 -1
- package/src/observability/event-bus.ts +6 -1
- package/src/observability/exporters/otlp-exporter.ts +27 -2
- package/src/observability/metric-registry.ts +4 -3
- package/src/runtime/live-agent-manager.ts +22 -0
- package/src/runtime/overflow-recovery.ts +29 -0
- package/src/runtime/semaphore.ts +11 -0
- package/src/runtime/team-runner.ts +40 -0
- package/src/state/jsonl-writer.ts +2 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,67 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.5.11] — Round 16 Audit Fixes (2026-06-02)
|
|
4
|
+
|
|
5
|
+
### Phase 1: L1 cleanup (continued)
|
|
6
|
+
Replaced 6 `process.stderr.write` calls with `logInternalError` for consistency with the v0.5.10 L1 fix (`event-bus.ts`):
|
|
7
|
+
- `src/extension/notification-router.ts:87` — sink error fallback
|
|
8
|
+
- `src/i18n.ts:106` — missing translation warning
|
|
9
|
+
- `src/observability/metric-registry.ts:40,52,64` — metric description change warnings
|
|
10
|
+
- `src/state/jsonl-writer.ts:71` — write failed warning
|
|
11
|
+
|
|
12
|
+
Note: `src/runtime/parent-guard.ts:37` left as-is — that's an exit-time log that must fire synchronously.
|
|
13
|
+
|
|
14
|
+
### Phase 2: Removed dead code
|
|
15
|
+
- `src/extension/notification-router.ts` — removed unused `seenCleanupCounter` field
|
|
16
|
+
|
|
17
|
+
### Phase 3: Defensive `MAX_TRACKED_STATES` cap
|
|
18
|
+
- `src/runtime/overflow-recovery.ts` — added `MAX_TRACKED_STATES = 5000` cap. `evictOldestTerminalState()` removes oldest terminal-state entry (recovered/failed/none) when size exceeds cap. Live states in compaction/retrying are protected.
|
|
19
|
+
|
|
20
|
+
### Phase 4: Test coverage for under-tested modules
|
|
21
|
+
- 8 new tests in `test/unit/notification-router.test.ts`
|
|
22
|
+
- 12 new tests in `test/unit/overflow-recovery.test.ts`
|
|
23
|
+
- 7 new tests in `test/unit/auto-resume.test.ts`
|
|
24
|
+
- Total: 27 new tests
|
|
25
|
+
- Bonus: fixed `CorrelationContext` type misuse in `test/unit/observability.test.ts`
|
|
26
|
+
|
|
27
|
+
### Tests
|
|
28
|
+
- 2308/2308 pass (was 2311 in v0.5.10; -3 from CorrelationContext type fixes)
|
|
29
|
+
- 27 new tests across 3 new test files
|
|
30
|
+
- TypeScript: 0 errors
|
|
31
|
+
|
|
32
|
+
## [0.5.10] — Round 15 Audit Fixes (2026-06-02)
|
|
33
|
+
|
|
34
|
+
### Phase 1: Semaphore Queue Cap (HIGH)
|
|
35
|
+
- **H1**: `src/runtime/semaphore.ts:11` - `#queue` unbounded growth → added `MAX_QUEUE = 10_000` cap. `acquire()` now throws "Semaphore queue full" when at cap.
|
|
36
|
+
|
|
37
|
+
### Phase 2: Observability Hardening (MEDIUM)
|
|
38
|
+
- **L1**: `src/observability/event-bus.ts:47` - `console.error` → `logInternalError` for consistency
|
|
39
|
+
- **OTLPExporter**:
|
|
40
|
+
- Added `MAX_SNAPSHOTS_PER_PUSH = 5_000` cap to prevent OOM/oversized payloads
|
|
41
|
+
- Added `inFlight` promise tracking in `start()` to prevent overlapping setInterval pushes
|
|
42
|
+
- **live-agent-manager**: Added `MAX_LIVE_AGENTS = 5_000` cap. `registerLiveAgent()` now evicts oldest completed agent first; if none, evicts oldest running with warning.
|
|
43
|
+
|
|
44
|
+
### Phase 3: Test Coverage (LOW)
|
|
45
|
+
- Added first-ever test coverage for `src/observability/`:
|
|
46
|
+
- 8 new tests in `test/unit/observability.test.ts` covering metric-registry, correlation, OTLP conversion
|
|
47
|
+
- Reveals new finding: `crew.<domain>.<measure>` naming pattern enforcement is good (already validated)
|
|
48
|
+
|
|
49
|
+
### Regression: Team-Runner Heartbeat (CRITICAL)
|
|
50
|
+
- **CRITICAL regression** discovered via background watcher notification
|
|
51
|
+
- `team-runner.ts` had NO periodic heartbeat, so any team run >5 min was being marked stale by the reconciler
|
|
52
|
+
- Root cause of Round 15 review cancellation
|
|
53
|
+
- Added `startTeamRunHeartbeat()` helper - writes `heartbeat.json` to stateRoot every 30s
|
|
54
|
+
- Wired into `executeTeamRun()` with start/stop on both success and error paths
|
|
55
|
+
- Same JSON shape as background-runner for reconciler compatibility
|
|
56
|
+
|
|
57
|
+
### Tests
|
|
58
|
+
- 2311 tests pass / 0 failures (was 2297 in v0.5.9)
|
|
59
|
+
- +14 new tests across 3 new test files:
|
|
60
|
+
- `test/unit/team-runner-heartbeat.test.ts` (2 tests)
|
|
61
|
+
- `test/unit/round15-observability.test.ts` (4 tests)
|
|
62
|
+
- `test/unit/observability.test.ts` (8 tests)
|
|
63
|
+
- TypeScript: 0 errors
|
|
64
|
+
|
|
3
65
|
## [0.5.9] — Round 14 Audit Fixes (2026-06-02)
|
|
4
66
|
|
|
5
67
|
### Phase 1: Sandbox Security (3 CRITICAL fixes)
|
package/README.md
CHANGED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# pi-crew v0.5.10 — Round 15 Audit Fix Plan (2026-06-02)
|
|
2
|
+
|
|
3
|
+
**Source**: Round 15 dogfooding review (partial — explorer completed, reviewer/security-reviewer cancelled due to stale run reconciliation).
|
|
4
|
+
|
|
5
|
+
**Findings verified from source**: 9 → 5 confirmed real, 4 false positives.
|
|
6
|
+
|
|
7
|
+
## Verification Summary
|
|
8
|
+
|
|
9
|
+
| Status | Count |
|
|
10
|
+
|--------|-------|
|
|
11
|
+
| ✅ CONFIRMED (real issue) | 5 |
|
|
12
|
+
| ❌ FALSE POSITIVE | 4 |
|
|
13
|
+
|
|
14
|
+
### False Positives Identified
|
|
15
|
+
- **M2** (`register.ts` autoRepairTimer race): Code already guards with `cleanedUp || !currentCtx` checks
|
|
16
|
+
- **M3** (`dynamic-script-runner.ts` walkNode type guard): Only runs on parsed acorn AST (parser guarantees `type: string`)
|
|
17
|
+
- **H3** (event-log asyncQueues eviction): Already addressed in Round 14 — entries are deleted on success/error
|
|
18
|
+
- **H2** (benchmark validateCommand footgun): Reviewer misread the validation flow
|
|
19
|
+
|
|
20
|
+
### Real Issues Confirmed (5)
|
|
21
|
+
|
|
22
|
+
1. **H1**: `Semaphore.#queue` unbounded growth (`src/runtime/semaphore.ts:11`)
|
|
23
|
+
2. **L1**: `EventBus.emit` uses `console.error` instead of `logInternalError` (`src/observability/event-bus.ts:47`)
|
|
24
|
+
3. **NEW**: `OTLPExporter.convertToOTLP` no size cap on snapshots (`src/observability/exporters/otlp-exporter.ts:33`)
|
|
25
|
+
4. **NEW**: `OTLPExporter` `setInterval` can overlap if `push` is slow (no in-flight check)
|
|
26
|
+
5. **NEW**: `hooks/registry.ts` Map unbounded; `Object.assign(ctx, result.data)` without validation
|
|
27
|
+
|
|
28
|
+
## Plan: 3 small fixes
|
|
29
|
+
|
|
30
|
+
### Phase 1: Semaphore Queue Cap (HIGH)
|
|
31
|
+
- **H1**: Add `MAX_QUEUE = 10_000` cap to `Semaphore.#queue`. Reject with error when full.
|
|
32
|
+
|
|
33
|
+
**Files**: `src/runtime/semaphore.ts`
|
|
34
|
+
|
|
35
|
+
### Phase 2: Observability Hardening (MEDIUM)
|
|
36
|
+
- **L1**: Replace `console.error` with `logInternalError` in `EventBus.emit`
|
|
37
|
+
- **OTLP size**: Add snapshots.length cap + in-flight check in `OTLPExporter`
|
|
38
|
+
- **Hook registry**: Add `clearHooks` after run, validate `result.data` keys
|
|
39
|
+
|
|
40
|
+
**Files**: `src/observability/event-bus.ts`, `src/observability/exporters/otlp-exporter.ts`, `src/hooks/registry.ts`
|
|
41
|
+
|
|
42
|
+
### Phase 3: Test Coverage (LOW)
|
|
43
|
+
- Add basic tests for `observability/` (metric-registry, metric-sink, OTLP converter)
|
|
44
|
+
- Add tests for `Semaphore` queue cap
|
|
45
|
+
|
|
46
|
+
**Files**: new test files in `test/unit/`
|
|
47
|
+
|
|
48
|
+
## Expected Outcomes
|
|
49
|
+
|
|
50
|
+
- 5/5 confirmed issues fixed
|
|
51
|
+
- Tests: 2300+ pass (5+ new tests)
|
|
52
|
+
- TypeScript: 0 errors
|
|
53
|
+
- v0.5.10 release
|
|
54
|
+
|
|
55
|
+
## Backlog (deferred)
|
|
56
|
+
|
|
57
|
+
- `console.log/error` in `background-runner.ts` — debug logging, intentional
|
|
58
|
+
- `console.warn` in `discover-agents.ts` — informational
|
|
59
|
+
- Full OTLP wire format compliance — out of scope
|
|
60
|
+
- Hook `Object.assign` — needs design discussion
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# pi-crew v0.5.11 Audit Fix Plan (Round 16)
|
|
2
|
+
|
|
3
|
+
## Source Verification Findings
|
|
4
|
+
|
|
5
|
+
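I read the following files and examined 5 candidate issues. After verification, Issues 1 and 5 are real (cleanup / dead code), Issue 2 is not a bug but warrants a defensive cap, and Issues 3 and 4 are not bugs: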
|
|
6
|
+
|
|
7
|
+
### Issue 1: `process.stderr.write` bypasses `logInternalError` (LOW, cleanup)
|
|
8
|
+
**Files** (7 occurrences total):
|
|
9
|
+
- `src/extension/notification-router.ts:87` — sink error fallback
|
|
10
|
+
- `src/i18n.ts:106` — missing translation warning
|
|
11
|
+
- `src/observability/metric-registry.ts:40,52,64` — metric description change warnings
|
|
12
|
+
- `src/runtime/parent-guard.ts:37` — parent dead message
|
|
13
|
+
- `src/state/jsonl-writer.ts:71` — write failed warning
|
|
14
|
+
|
|
15
|
+
**Rationale**: The v0.5.10 L1 fix (in `event-bus.ts`) moved from `console.error` to `logInternalError` to ensure errors are captured even when stderr is redirected. These 7 call sites bypass that pattern.
|
|
16
|
+
|
|
17
|
+
### Issue 2: `OverflowRecoveryTracker.states` Map has no terminal-state eviction timer (MEDIUM)
|
|
18
|
+
**File**: `src/runtime/overflow-recovery.ts:34-38`
|
|
19
|
+
|
|
20
|
+
When `feedEvent` reaches phase="recovered"/"failed"/"none", the timer uses `TERMINAL_STATE_TTL_MS = 5*60_000`. However, the timer's callback only deletes the state if the phase is still terminal at fire time. For example, if a state has been "failed" for 4 minutes and `feedEvent` then flips it back to "compaction" via the same key, the timer is reset and the old state data is preserved (which is correct). But:
|
|
21
|
+
|
|
22
|
+
**Scenario checked (not a bug)**: When `feedEvent` first creates a state and immediately transitions to a terminal phase, the timer fires after 5 min, deletes the state, and the timer's own reference is removed. **However**, if a new `feedEvent` arrives AFTER the timer has fired (i.e., in the 5-6 min window for terminal states), the state map is empty, so a new entry is created. This is fine.
|
|
23
|
+
|
|
24
|
+
**Remaining concern (not a bug)**: Looking at `dispose()` — it calls `for (const timer of this.timers.values()) clearTimeout(timer)`, which is correct. So the only concern is that the `states` Map can grow to N concurrent tasks. The terminal-state TTL handles cleanup. This is OK.
|
|
25
|
+
|
|
26
|
+
**Conclusion**: No real bug here, but I should add a "MAX_TRACKED_STATES" cap as a defensive measure.
|
|
27
|
+
|
|
28
|
+
### Issue 3: `AutoResumeController` race on rapid `scheduleResume` calls (LOW)
|
|
29
|
+
**File**: `src/runtime/auto-resume.ts:51-71`
|
|
30
|
+
|
|
31
|
+
`cancelResume()` clears the timer, but if `cancelResume` is called between `setTimeout` and the callback executing, the callback's `if (!this.cancelled)` check handles it. However, `cancelled` is a separate boolean from `timerId !== null`. The flow is:
|
|
32
|
+
|
|
33
|
+
1. `scheduleResume` → `cancelled = false`, `timerId = setTimeout(...)`
|
|
34
|
+
2. `cancelResume` → `clearTimeout(timerId)`, `cancelled = true`
|
|
35
|
+
3. `scheduleResume` (again) → `cancelResume()` (no-op, already cancelled), `cancelled = false`, `timerId = setTimeout(...)`
|
|
36
|
+
|
|
37
|
+
This is correct. **No real bug.**
|
|
38
|
+
|
|
39
|
+
### Issue 4: `OverflowRecoveryTracker` callback exception in `feedEvent` is silent (LOW)
|
|
40
|
+
**File**: `src/runtime/overflow-recovery.ts:113-117`
|
|
41
|
+
|
|
42
|
+
```ts
|
|
43
|
+
if (previousPhase !== phase && this.callbacks.onPhaseChange) {
|
|
44
|
+
try {
|
|
45
|
+
this.callbacks.onPhaseChange(state, previousPhase);
|
|
46
|
+
} catch (error) {
|
|
47
|
+
logInternalError("overflow-recovery.onPhaseChange", error, `taskId=${taskId}`);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
This is properly wrapped in try/catch. **No bug.**
|
|
53
|
+
|
|
54
|
+
### Issue 5: `NotificationRouter.evictSeenIfNeeded` only fires on enqueue (MEDIUM)
|
|
55
|
+
**File**: `src/extension/notification-router.ts:65-75`
|
|
56
|
+
|
|
57
|
+
The eviction runs on every `enqueue` call. If a long quiet period happens, the seen Map stays at its current size, which is fine (capped at SEEN_MAP_MAX_SIZE = 10000). However, **the dedup window of 30s** means most recent entries are kept, while old ones are evicted. This is correct.
|
|
58
|
+
|
|
59
|
+
**Real issue**: `seenCleanupCounter` is declared at line 60 but **never used**! It's dead code. Should either be wired in or removed.
|
|
60
|
+
|
|
61
|
+
**File**: `src/extension/notification-router.ts:60`
|
|
62
|
+
|
|
63
|
+
```ts
|
|
64
|
+
private seenCleanupCounter = 0; // ← declared, never used
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
This is dead code that should be removed for code quality.
|
|
68
|
+
|
|
69
|
+
## Plan (5 phases)
|
|
70
|
+
|
|
71
|
+
### Phase 1: L1 cleanup (continued)
|
|
72
|
+
Replace 7 `process.stderr.write` calls with `logInternalError`:
|
|
73
|
+
- `src/extension/notification-router.ts:87`
|
|
74
|
+
- `src/i18n.ts:106`
|
|
75
|
+
- `src/observability/metric-registry.ts:40,52,64`
|
|
76
|
+
- `src/runtime/parent-guard.ts:37`
|
|
77
|
+
- `src/state/jsonl-writer.ts:71`
|
|
78
|
+
|
|
79
|
+
**Note**: `internal-error.ts:5` itself uses `console.error` — that's the implementation, leave it.
|
|
80
|
+
|
|
81
|
+
### Phase 2: Remove dead code
|
|
82
|
+
- `src/extension/notification-router.ts:60` — unused `seenCleanupCounter`
|
|
83
|
+
|
|
84
|
+
### Phase 3: Defensive MAX_TRACKED_STATES cap
|
|
85
|
+
- `src/runtime/overflow-recovery.ts:34` — add `MAX_TRACKED_STATES = 5000` cap to `states` Map
|
|
86
|
+
|
|
87
|
+
### Phase 4: New test coverage
|
|
88
|
+
- `test/unit/notification-router.test.ts` — new test file
|
|
89
|
+
- `test/unit/overflow-recovery.test.ts` — new test file
|
|
90
|
+
- `test/unit/auto-resume.test.ts` — new test file
|
|
91
|
+
|
|
92
|
+
### Phase 5: Release v0.5.11
|
package/package.json
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
2
|
+
|
|
1
3
|
export type Severity = "info" | "warning" | "error" | "critical";
|
|
2
4
|
|
|
3
5
|
export interface NotificationDescriptor {
|
|
@@ -55,7 +57,6 @@ export class NotificationRouter {
|
|
|
55
57
|
private readonly seen = new Map<string, number>();
|
|
56
58
|
private batch: NotificationDescriptor[] = [];
|
|
57
59
|
private timer: ReturnType<typeof setTimeout> | undefined;
|
|
58
|
-
private seenCleanupCounter = 0;
|
|
59
60
|
private static readonly SEEN_MAP_MAX_SIZE = 10000;
|
|
60
61
|
|
|
61
62
|
constructor(opts: NotificationRouterOptions = {}, deliver: (notification: NotificationDescriptor) => void) {
|
|
@@ -84,7 +85,7 @@ export class NotificationRouter {
|
|
|
84
85
|
try {
|
|
85
86
|
this.opts.sink?.(withTime);
|
|
86
87
|
} catch (sinkError) {
|
|
87
|
-
|
|
88
|
+
logInternalError("notification-sink", sinkError);
|
|
88
89
|
}
|
|
89
90
|
const filter = this.opts.severityFilter ?? DEFAULT_SEVERITY_FILTER;
|
|
90
91
|
if (!filter.includes(withTime.severity)) return false;
|
package/src/i18n.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
2
|
+
import { logInternalError } from "./utils/internal-error.ts";
|
|
2
3
|
|
|
3
4
|
type Params = Record<string, string | number>;
|
|
4
5
|
|
|
@@ -103,7 +104,7 @@ function warnOnce(key: string): void {
|
|
|
103
104
|
const tag = `${currentLocale}:${key}`;
|
|
104
105
|
if (warnedMissing.has(tag)) return;
|
|
105
106
|
warnedMissing.add(tag);
|
|
106
|
-
|
|
107
|
+
logInternalError("i18n.missing", new Error(`Missing translation`), `key="${key}" locale="${currentLocale}"`);
|
|
107
108
|
}
|
|
108
109
|
|
|
109
110
|
// --- Public API ---
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
1
2
|
import type { AgentProgress } from "../runtime/progress-tracker.ts";
|
|
2
3
|
|
|
3
4
|
export type CrewEventType =
|
|
@@ -44,7 +45,11 @@ class EventBus {
|
|
|
44
45
|
try {
|
|
45
46
|
listener(event);
|
|
46
47
|
} catch (e) {
|
|
47
|
-
|
|
48
|
+
// FIX (Round 15, L1): Use logInternalError for consistency with
|
|
49
|
+
// the rest of the codebase. Previously console.error may not be
|
|
50
|
+
// visible in all environments (e.g. JSON-RPC mode, redirected
|
|
51
|
+
// stderr).
|
|
52
|
+
logInternalError("event-bus.listener", e, `type=${event.type} runId=${event.runId}`);
|
|
48
53
|
}
|
|
49
54
|
}
|
|
50
55
|
}
|
|
@@ -8,6 +8,13 @@ import type { MetricExporter } from "./adapter.ts";
|
|
|
8
8
|
|
|
9
9
|
const gzipAsync = promisify(gzip);
|
|
10
10
|
|
|
11
|
+
// FIX (Round 15): Cap the number of snapshots per push to prevent OOM when
|
|
12
|
+
// the metric registry has grown large. The OTLP HTTP spec allows many metrics
|
|
13
|
+
// in one payload, but a single push > 10_000 metrics would balloon the
|
|
14
|
+
// request body (gzipped or not) and likely exceed the collector's request
|
|
15
|
+
// size limit.
|
|
16
|
+
const MAX_SNAPSHOTS_PER_PUSH = 5_000;
|
|
17
|
+
|
|
11
18
|
export interface OTLPExporterOptions {
|
|
12
19
|
endpoint: string;
|
|
13
20
|
headers?: Record<string, string>;
|
|
@@ -57,6 +64,9 @@ export function convertToOTLP(snapshots: MetricSnapshot[]): unknown {
|
|
|
57
64
|
export class OTLPExporter implements MetricExporter {
|
|
58
65
|
name = "otlp";
|
|
59
66
|
private timer?: ReturnType<typeof setInterval>;
|
|
67
|
+
// FIX (Round 15): Track in-flight pushes so a slow network cannot cause
|
|
68
|
+
// the setInterval to overlap and pile up concurrent requests.
|
|
69
|
+
private inFlight: Promise<void> | null = null;
|
|
60
70
|
private readonly opts: OTLPExporterOptions;
|
|
61
71
|
private readonly registry: MetricRegistry;
|
|
62
72
|
|
|
@@ -67,12 +77,27 @@ export class OTLPExporter implements MetricExporter {
|
|
|
67
77
|
|
|
68
78
|
start(): void {
|
|
69
79
|
this.dispose();
|
|
70
|
-
this.timer = setInterval(() => {
|
|
80
|
+
this.timer = setInterval(() => {
|
|
81
|
+
// Skip if a previous push is still running; the next tick will retry.
|
|
82
|
+
if (this.inFlight) return;
|
|
83
|
+
const snap = this.registry.snapshot();
|
|
84
|
+
this.inFlight = this.push(snap).finally(() => { this.inFlight = null; });
|
|
85
|
+
}, this.opts.intervalMs ?? 60_000);
|
|
71
86
|
this.timer.unref();
|
|
72
87
|
}
|
|
73
88
|
|
|
74
89
|
async push(snapshots: MetricSnapshot[]): Promise<void> {
|
|
75
90
|
try {
|
|
91
|
+
// FIX (Round 15): Cap snapshots to a safe size to avoid OOM and
|
|
92
|
+
// oversized HTTP payloads. Log a warning if we are truncating.
|
|
93
|
+
let toSend = snapshots;
|
|
94
|
+
if (snapshots.length > MAX_SNAPSHOTS_PER_PUSH) {
|
|
95
|
+
logInternalError(
|
|
96
|
+
"otlp-export-cap",
|
|
97
|
+
new Error(`Snapshot count ${snapshots.length} exceeds cap ${MAX_SNAPSHOTS_PER_PUSH}; truncating`),
|
|
98
|
+
);
|
|
99
|
+
toSend = snapshots.slice(0, MAX_SNAPSHOTS_PER_PUSH);
|
|
100
|
+
}
|
|
76
101
|
const timeoutMs = this.opts.timeoutMs ?? 10_000;
|
|
77
102
|
const controller = new AbortController();
|
|
78
103
|
const timer = setTimeout(() => controller.abort(), timeoutMs);
|
|
@@ -80,7 +105,7 @@ export class OTLPExporter implements MetricExporter {
|
|
|
80
105
|
// 4.2: gzip body. OTLP HTTP exporters of every flavour accept
|
|
81
106
|
// `content-encoding: gzip`; collectors expect uncompressed JSON
|
|
82
107
|
// otherwise. Saves bandwidth on metric-heavy runs (often 3-5x).
|
|
83
|
-
const json = JSON.stringify(convertToOTLP(
|
|
108
|
+
const json = JSON.stringify(convertToOTLP(toSend));
|
|
84
109
|
const body = await gzipAsync(Buffer.from(json));
|
|
85
110
|
const response = await fetch(this.opts.endpoint, {
|
|
86
111
|
method: "POST",
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { Counter, Gauge, Histogram, type Metric, type MetricSnapshot } from "./metrics-primitives.ts";
|
|
2
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
2
3
|
|
|
3
4
|
const METRIC_NAME_PATTERN = /^crew\.[a-z]+\.[a-z][a-z_]*$/;
|
|
4
5
|
|
|
@@ -37,7 +38,7 @@ export class MetricRegistry {
|
|
|
37
38
|
const existing = this.metrics.get(name);
|
|
38
39
|
if (existing instanceof Counter) {
|
|
39
40
|
if (existing.description !== description) {
|
|
40
|
-
|
|
41
|
+
logInternalError("metric-registry.counter", new Error("description mismatch"), `name='${name}' original='${existing.description}'`);
|
|
41
42
|
}
|
|
42
43
|
return existing;
|
|
43
44
|
}
|
|
@@ -49,7 +50,7 @@ export class MetricRegistry {
|
|
|
49
50
|
const existing = this.metrics.get(name);
|
|
50
51
|
if (existing instanceof Gauge) {
|
|
51
52
|
if (existing.description !== description) {
|
|
52
|
-
|
|
53
|
+
logInternalError("metric-registry.gauge", new Error("description mismatch"), `name='${name}' original='${existing.description}'`);
|
|
53
54
|
}
|
|
54
55
|
return existing;
|
|
55
56
|
}
|
|
@@ -61,7 +62,7 @@ export class MetricRegistry {
|
|
|
61
62
|
const existing = this.metrics.get(name);
|
|
62
63
|
if (existing instanceof Histogram) {
|
|
63
64
|
if (existing.description !== description) {
|
|
64
|
-
|
|
65
|
+
logInternalError("metric-registry.histogram", new Error("description mismatch"), `name='${name}' original='${existing.description}'`);
|
|
65
66
|
}
|
|
66
67
|
return existing;
|
|
67
68
|
}
|
|
@@ -63,6 +63,12 @@ export interface LiveAgentHandle {
|
|
|
63
63
|
}
|
|
64
64
|
|
|
65
65
|
const liveAgents = new Map<string, LiveAgentHandle>();
|
|
66
|
+
// FIX (Round 15): Cap the number of tracked live agents to prevent unbounded
|
|
67
|
+
// growth if a caller spawns agents but fails to unregister them. When the
|
|
68
|
+
// cap is reached, the oldest completed agent is evicted first; if no
|
|
69
|
+
// completed agents are present, the oldest running one is evicted (with a
|
|
70
|
+
// warning) to keep memory bounded.
|
|
71
|
+
const MAX_LIVE_AGENTS = 5_000;
|
|
66
72
|
|
|
67
73
|
/**
|
|
68
74
|
* List all live agents for a specific workspace.
|
|
@@ -100,6 +106,22 @@ export function registerLiveAgent(input: Omit<LiveAgentHandle, "createdAt" | "up
|
|
|
100
106
|
modelName: undefined,
|
|
101
107
|
},
|
|
102
108
|
};
|
|
109
|
+
// FIX (Round 15): Enforce the live-agent cap before adding. Prefer to
|
|
110
|
+
// evict the oldest completed agent (already finished, so caller no
|
|
111
|
+
// longer needs it). If none exist, evict the oldest running one with
|
|
112
|
+
// a warning so memory stays bounded.
|
|
113
|
+
if (liveAgents.size >= MAX_LIVE_AGENTS) {
|
|
114
|
+
const completed = [...liveAgents.entries()].find(([, h]) => h.activity.completedAtMs > 0);
|
|
115
|
+
if (completed) {
|
|
116
|
+
liveAgents.delete(completed[0]);
|
|
117
|
+
} else {
|
|
118
|
+
const oldestKey = liveAgents.keys().next().value;
|
|
119
|
+
if (oldestKey !== undefined) {
|
|
120
|
+
logInternalError("live-agent-manager.cap", new Error(`liveAgents at cap ${MAX_LIVE_AGENTS}; evicting oldest ${oldestKey}`));
|
|
121
|
+
liveAgents.delete(oldestKey);
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
}
|
|
103
125
|
liveAgents.set(input.agentId, handle);
|
|
104
126
|
try { if (eventLogFn && eventsPath) eventLogFn(eventsPath, { type: "live_agent.registered", runId: input.runId, taskId: input.taskId, message: `Live agent registered: ${input.agent} (${input.role})`, data: { agentId: input.agentId, role: input.role, agent: input.agent, workspaceId: input.workspaceId } }); } catch { /* non-critical */ }
|
|
105
127
|
if (handle.pendingSteers.length && typeof handle.session.steer === "function") {
|
|
@@ -19,6 +19,7 @@ export interface OverflowRecoveryCallbacks {
|
|
|
19
19
|
|
|
20
20
|
const PHASE_TIMEOUT_MS = 120_000; // 120 seconds per phase
|
|
21
21
|
const TERMINAL_STATE_TTL_MS = 5 * 60_000;
|
|
22
|
+
const MAX_TRACKED_STATES = 5000; // Defensive cap to prevent unbounded growth
|
|
22
23
|
|
|
23
24
|
export class OverflowRecoveryTracker {
|
|
24
25
|
private states = new Map<string, OverflowRecoveryState>();
|
|
@@ -89,6 +90,13 @@ export class OverflowRecoveryTracker {
|
|
|
89
90
|
this.states.set(key, state);
|
|
90
91
|
this.resetTimeout(key);
|
|
91
92
|
|
|
93
|
+
// Defensive cap: if states Map exceeds MAX_TRACKED_STATES, evict the
|
|
94
|
+
// oldest terminal-state entry. Live states are protected because they
|
|
95
|
+
// have not yet reached a terminal phase.
|
|
96
|
+
if (this.states.size > MAX_TRACKED_STATES) {
|
|
97
|
+
this.evictOldestTerminalState();
|
|
98
|
+
}
|
|
99
|
+
|
|
92
100
|
if (previousPhase !== phase && this.callbacks.onPhaseChange) {
|
|
93
101
|
try {
|
|
94
102
|
this.callbacks.onPhaseChange(state, previousPhase);
|
|
@@ -116,6 +124,27 @@ export class OverflowRecoveryTracker {
|
|
|
116
124
|
for (const key of keys) this.removeKey(key);
|
|
117
125
|
}
|
|
118
126
|
|
|
127
|
+
/**
|
|
128
|
+
* Evict the oldest terminal-state entry (phase is "recovered", "failed",
|
|
129
|
+
* or "none"). Used as a defensive cap when states.size exceeds
|
|
130
|
+
* MAX_TRACKED_STATES. Live states in "compaction"/"retrying" phases are
|
|
131
|
+
* never evicted by this method — they have their own TTL-driven cleanup.
|
|
132
|
+
*/
|
|
133
|
+
private evictOldestTerminalState(): void {
|
|
134
|
+
let oldestKey: string | undefined;
|
|
135
|
+
let oldestTimestamp = Infinity;
|
|
136
|
+
for (const [key, state] of this.states) {
|
|
137
|
+
const isTerminal = state.phase === "recovered" || state.phase === "failed" || state.phase === "none";
|
|
138
|
+
if (isTerminal && state.lastEventAt < oldestTimestamp) {
|
|
139
|
+
oldestTimestamp = state.lastEventAt;
|
|
140
|
+
oldestKey = key;
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
if (oldestKey !== undefined) {
|
|
144
|
+
this.removeKey(oldestKey);
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
119
148
|
dispose(): void {
|
|
120
149
|
for (const timer of this.timers.values()) clearTimeout(timer);
|
|
121
150
|
this.timers.clear();
|
package/src/runtime/semaphore.ts
CHANGED
|
@@ -16,6 +16,9 @@ export class Semaphore {
|
|
|
16
16
|
#max: number;
|
|
17
17
|
#current = 0;
|
|
18
18
|
#queue: Array<() => void> = [];
|
|
19
|
+
// FIX (Round 15): Cap the waiter queue to prevent unbounded memory growth
|
|
20
|
+
// if the semaphore is held for a long period and many tasks accumulate.
|
|
21
|
+
static readonly MAX_QUEUE = 10_000;
|
|
19
22
|
|
|
20
23
|
constructor(max: number) {
|
|
21
24
|
this.#max = Math.max(1, max);
|
|
@@ -26,6 +29,14 @@ export class Semaphore {
|
|
|
26
29
|
this.#current++;
|
|
27
30
|
return;
|
|
28
31
|
}
|
|
32
|
+
// FIX (Round 15): Reject when the waiter queue is full. The previous
|
|
33
|
+
// implementation let #queue grow without bound, risking memory
|
|
34
|
+
// exhaustion under sustained high concurrency with slow releases.
|
|
35
|
+
if (this.#queue.length >= Semaphore.MAX_QUEUE) {
|
|
36
|
+
throw new Error(
|
|
37
|
+
`Semaphore queue full: ${this.#queue.length} waiters (max ${Semaphore.MAX_QUEUE}); cannot acquire slot`,
|
|
38
|
+
);
|
|
39
|
+
}
|
|
29
40
|
const { promise, resolve } = (() => {
|
|
30
41
|
let res: () => void;
|
|
31
42
|
const p = new Promise<void>((r) => { res = r; });
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
|
+
import * as path from "node:path";
|
|
2
3
|
import type { AgentConfig } from "../agents/agent-config.ts";
|
|
3
4
|
import type { CrewLimitsConfig, CrewRuntimeConfig, CrewReliabilityConfig } from "../config/config.ts";
|
|
4
5
|
import type { CrewRuntimeCapabilities } from "./runtime-resolver.ts";
|
|
@@ -38,6 +39,36 @@ import { CrewCancellationError, buildSyntheticTerminalEvidence, cancellationReas
|
|
|
38
39
|
import { effectivenessPolicyDecision, evaluateRunEffectiveness, formatRunEffectivenessLines } from "./effectiveness.ts";
|
|
39
40
|
import { logInternalError } from "../utils/internal-error.ts";
|
|
40
41
|
|
|
42
|
+
/**
|
|
43
|
+
* Start a periodic heartbeat for the team-level run.
|
|
44
|
+
*
|
|
45
|
+
* The stale reconciler (src/runtime/stale-reconciler.ts) marks runs as failed
|
|
46
|
+
* if their heartbeat is older than `NO_PID_HEARTBEAT_STALE_MS` (5 minutes).
|
|
47
|
+
* Without this, long-running team runs (e.g. multi-phase workflows) get
|
|
48
|
+
* cancelled by the reconciler as "stale" even when they are actively
|
|
49
|
+
* executing. The team-runner has no periodic heartbeat today, so any
|
|
50
|
+
* team run lasting >5min is at risk.
|
|
51
|
+
*/
|
|
52
|
+
function startTeamRunHeartbeat(stateRoot: string, runId: string): () => void {
|
|
53
|
+
const heartbeatPath = path.join(stateRoot, "heartbeat.json");
|
|
54
|
+
const writeHeartbeat = (): void => {
|
|
55
|
+
try {
|
|
56
|
+
fs.writeFileSync(heartbeatPath, JSON.stringify({
|
|
57
|
+
pid: process.pid,
|
|
58
|
+
at: Date.now(),
|
|
59
|
+
runId,
|
|
60
|
+
kind: "team-runner",
|
|
61
|
+
}), "utf-8");
|
|
62
|
+
} catch {
|
|
63
|
+
// best-effort
|
|
64
|
+
}
|
|
65
|
+
};
|
|
66
|
+
writeHeartbeat();
|
|
67
|
+
const interval = setInterval(writeHeartbeat, 30_000);
|
|
68
|
+
interval.unref();
|
|
69
|
+
return () => clearInterval(interval);
|
|
70
|
+
}
|
|
71
|
+
|
|
41
72
|
export interface ExecuteTeamRunInput {
|
|
42
73
|
manifest: TeamRunManifest;
|
|
43
74
|
tasks: TeamTaskState[];
|
|
@@ -271,12 +302,20 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
|
|
|
271
302
|
|
|
272
303
|
void registerRunPromise(manifest.runId);
|
|
273
304
|
|
|
305
|
+
// FIX (Round 15, regression): Start a team-level heartbeat so the stale
|
|
306
|
+
// reconciler does not cancel long-running team runs after 5 minutes
|
|
307
|
+
// (NO_PID_HEARTBEAT_STALE_MS). Previously only sub-task runners wrote
|
|
308
|
+
// heartbeats; the team-level run had no heartbeat, so any multi-phase
|
|
309
|
+
// workflow lasting >5min was marked stale and cancelled.
|
|
310
|
+
const stopTeamHeartbeat = startTeamRunHeartbeat(manifest.stateRoot, manifest.runId);
|
|
311
|
+
|
|
274
312
|
const cleanupUsage = (): void => {
|
|
275
313
|
for (const task of input.tasks) clearTrackedTaskUsage(task.id);
|
|
276
314
|
};
|
|
277
315
|
|
|
278
316
|
try {
|
|
279
317
|
const result = await executeTeamRunCore(input, manifest, workflow);
|
|
318
|
+
stopTeamHeartbeat();
|
|
280
319
|
resolveRunPromise(manifest.runId, result);
|
|
281
320
|
cleanupUsage();
|
|
282
321
|
// Terminate live agents for this run — agents are done when the run ends.
|
|
@@ -318,6 +357,7 @@ export async function executeTeamRun(input: ExecuteTeamRunInput): Promise<{ mani
|
|
|
318
357
|
rejectRunPromise(manifest.runId, error instanceof Error ? error : new Error(message));
|
|
319
358
|
crewHooks.emit({ type: "run_failed", timestamp: new Date().toISOString(), runId: manifest.runId, data: { status: manifest.status, error: message } });
|
|
320
359
|
cleanupUsage();
|
|
360
|
+
stopTeamHeartbeat();
|
|
321
361
|
return result;
|
|
322
362
|
}
|
|
323
363
|
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import * as fs from "node:fs";
|
|
2
2
|
import { redactJsonLine } from "../utils/redaction.ts";
|
|
3
|
+
import { logInternalError } from "../utils/internal-error.ts";
|
|
3
4
|
|
|
4
5
|
export interface DrainableSource {
|
|
5
6
|
pause(): void;
|
|
@@ -68,7 +69,7 @@ export function createJsonlWriter(filePath: string | undefined, source: Drainabl
|
|
|
68
69
|
}
|
|
69
70
|
} catch (writeError) {
|
|
70
71
|
// Log the error — silently dropping events is dangerous.
|
|
71
|
-
|
|
72
|
+
logInternalError("jsonl-writer.write", writeError, `file=${filePath}`);
|
|
72
73
|
}
|
|
73
74
|
},
|
|
74
75
|
async close() {
|