instar 1.3.567 → 1.3.568

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"file":"uncaughtExceptionPolicy.d.ts","sourceRoot":"","sources":["../../src/core/uncaughtExceptionPolicy.ts"],"names":[],"mappings":"AA2CA;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,OAAO,GAAG,OAAO,CAIxD;AAyBD,wBAAgB,yBAAyB,CAAC,GAAG,EAAE,OAAO,GAAG,OAAO,CAQ/D;AAED,gFAAgF;AAChF,wBAAgB,kCAAkC,IAAI,IAAI,CAEzD"}
1
+ {"version":3,"file":"uncaughtExceptionPolicy.d.ts","sourceRoot":"","sources":["../../src/core/uncaughtExceptionPolicy.ts"],"names":[],"mappings":"AA6DA;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,GAAG,EAAE,OAAO,GAAG,OAAO,CAIxD;AAyBD,wBAAgB,yBAAyB,CAAC,GAAG,EAAE,OAAO,GAAG,OAAO,CAQ/D;AAED,gFAAgF;AAChF,wBAAgB,kCAAkC,IAAI,IAAI,CAEzD"}
@@ -39,6 +39,24 @@ const NON_FATAL_UNCAUGHT_PATTERNS = [
39
39
  // This is a crash backstop ONLY — it does not change lease/standby behavior;
40
40
  // it stops a standby write from taking the process down.
41
41
  'StateManager is read-only',
42
+ // Network-class outbound failures (transient upstream/peer outage). A failed
43
+ // outbound fetch — the multi-machine lease-wire peer broadcast, a Slack
44
+ // connect/reconnect, any HTTP call — is ISOLATED by nature: the call has
45
+ // already unwound and SQLite, HTTP, and the other subsystems are intact. The
46
+ // owning subsystem retries/self-heals (lease-wire re-broadcasts; the socket
47
+ // reconnects with backoff). Crashing the whole agent on a transient network
48
+ // blip is strictly worse than logging + continuing — it was the cause of the
49
+ // 2026-06-15 crash-during-API-instability (an uncaught `fetch failed` took the
50
+ // server down mid-outage). The first-seen-stack logging below still surfaces
51
+ // the un-guarded callsite so the real missing `.catch` gets fixed; this is the
52
+ // crash backstop, NOT a license to skip the catch. (CMT-1548)
53
+ 'fetch failed', // undici / Node global fetch network failure
54
+ 'ECONNREFUSED',
55
+ 'ECONNRESET',
56
+ 'ETIMEDOUT',
57
+ 'ENOTFOUND',
58
+ 'EAI_AGAIN',
59
+ 'socket hang up',
42
60
  ];
43
61
  /**
44
62
  * True when an uncaught exception is a known isolated/recoverable error that the
@@ -1 +1 @@
1
- {"version":3,"file":"uncaughtExceptionPolicy.js","sourceRoot":"","sources":["../../src/core/uncaughtExceptionPolicy.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AACH,MAAM,2BAA2B,GAAG;IAClC,0EAA0E;IAC1E,8EAA8E;IAC9E,wCAAwC;IACxC,iBAAiB;IACjB,uBAAuB;IACvB,4BAA4B;IAC5B,2EAA2E;IAC3E,2EAA2E;IAC3E,0EAA0E;IAC1E,4EAA4E;IAC5E,6EAA6E;IAC7E,4EAA4E;IAC5E,uBAAuB;IACvB,6EAA6E;IAC7E,sEAAsE;IACtE,8EAA8E;IAC9E,uEAAuE;IACvE,2EAA2E;IAC3E,4EAA4E;IAC5E,6EAA6E;IAC7E,gFAAgF;IAChF,6EAA6E;IAC7E,yDAAyD;IACzD,2BAA2B;CAC5B,CAAC;AAEF;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB,CAAC,GAAY;IAC7C,MAAM,GAAG,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;IACpF,IAAI,CAAC,GAAG;QAAE,OAAO,KAAK,CAAC;IACvB,OAAO,2BAA2B,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;AAClE,CAAC;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,kBAAkB,GAAG,IAAI,GAAG,EAAU,CAAC;AAC7C,MAAM,kBAAkB,GAAG,GAAG,CAAC;AAE/B,MAAM,UAAU,yBAAyB,CAAC,GAAY;IACpD,IAAI,CAAC,CAAC,GAAG,YAAY,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK;QAAE,OAAO,KAAK,CAAC;IACxD,IAAI,kBAAkB,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACpD,0EAA0E;IAC1E,+EAA+E;IAC/E,IAAI,kBAAkB,CAAC,IAAI,IAAI,kBAAkB;QAAE,kBAAkB,CAAC,KAAK,EAAE,CAAC;IAC9E,kBAAkB,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAClC,OAAO,IAAI,CAAC;AACd,CAAC;AAED,gFAAgF;AAChF,MAAM,UAAU,kCAAkC;IAChD,kBAAkB,CAAC,KAAK,EAAE,CAAC;AAC7B,CAAC"}
1
+ {"version":3,"file":"uncaughtExceptionPolicy.js","sourceRoot":"","sources":["../../src/core/uncaughtExceptionPolicy.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AACH,MAAM,2BAA2B,GAAG;IAClC,0EAA0E;IAC1E,8EAA8E;IAC9E,wCAAwC;IACxC,iBAAiB;IACjB,uBAAuB;IACvB,4BAA4B;IAC5B,2EAA2E;IAC3E,2EAA2E;IAC3E,0EAA0E;IAC1E,4EAA4E;IAC5E,6EAA6E;IAC7E,4EAA4E;IAC5E,uBAAuB;IACvB,6EAA6E;IAC7E,sEAAsE;IACtE,8EAA8E;IAC9E,uEAAuE;IACvE,2EAA2E;IAC3E,4EAA4E;IAC5E,6EAA6E;IAC7E,gFAAgF;IAChF,6EAA6E;IAC7E,yDAAyD;IACzD,2BAA2B;IAC3B,6EAA6E;IAC7E,wEAAwE;IACxE,yEAAyE;IACzE,6EAA6E;IAC7E,4EAA4E;IAC5E,4EAA4E;IAC5E,6EAA6E;IAC7E,+EAA+E;IAC/E,6EAA6E;IAC7E,+EAA+E;IAC/E,8DAA8D;IAC9D,cAAc,EAAW,6CAA6C;IACtE,cAAc;IACd,YAAY;IACZ,WAAW;IACX,WAAW;IACX,WAAW;IACX,gBAAgB;CACjB,CAAC;AAEF;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB,CAAC,GAAY;IAC7C,MAAM,GAAG,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC;IACpF,IAAI,CAAC,GAAG;QAAE,OAAO,KAAK,CAAC;IACvB,OAAO,2BAA2B,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;AAClE,CAAC;AAED;;;;;;;;;;;;;;;;;;;GAmBG;AACH,MAAM,kBAAkB,GAAG,IAAI,GAAG,EAAU,CAAC;AAC7C,MAAM,kBAAkB,GAAG,GAAG,CAAC;AAE/B,MAAM,UAAU,yBAAyB,CAAC,GAAY;IACpD,IAAI,CAAC,CAAC,GAAG,YAAY,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK;QAAE,OAAO,KAAK,CAAC;IACxD,IAAI,kBAAkB,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACpD,0EAA0E;IAC1E,+EAA+E;IAC/E,IAAI,kBAAkB,CAAC,IAAI,IAAI,kBAAkB;QAAE,kBAAkB,CAAC,KAAK,EAAE,CAAC;IAC9E,kBAAkB,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAClC,OAAO,IAAI,CAAC;AACd,CAAC;AAED,gFAAgF;AAChF,MAAM,UAAU,kCAAkC;IAChD,kBAAkB,CAAC,KAAK,EAAE,CAAC;AAC7B,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "instar",
3
- "version": "1.3.567",
3
+ "version": "1.3.568",
4
4
  "description": "Coherence infrastructure for self-evolving AI agents — on the Claude Code or Codex subscription you already have.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "$schema": "./builtin-manifest.schema.json",
3
3
  "schemaVersion": 1,
4
- "generatedAt": "2026-06-15T00:30:17.779Z",
5
- "instarVersion": "1.3.567",
4
+ "generatedAt": "2026-06-15T03:41:03.300Z",
5
+ "instarVersion": "1.3.568",
6
6
  "entryCount": 201,
7
7
  "entries": {
8
8
  "hook:session-start": {
@@ -0,0 +1,22 @@
1
+ # Upgrade Guide — vNEXT
2
+
3
+ <!-- assembled-by: assemble-next-md -->
4
+ <!-- bump: patch -->
5
+
6
+ ## What Changed
7
+
8
+ Fixed a robustness gap where a transient network failure could crash the whole agent server. The top-level uncaught-exception handler crashes by default and only log-and-continues for a tight allowlist of known-isolated errors (HTTP double-response races, the Slack reconnect race, standby read-only writes). Network-class failures weren't on that list — so on 2026-06-15, during an API/network rough patch, an uncaught `fetch failed` (the multi-machine lease-wire broadcasting to an offline peer) took the server down (it auto-restarted ~50s later). Network-class tokens (`fetch failed`, `ECONNREFUSED`, `ECONNRESET`, `ETIMEDOUT`, `ENOTFOUND`, `EAI_AGAIN`, `socket hang up`) are now on the recoverable allowlist, so an isolated outbound-fetch failure degrades (log + continue) instead of crashing. The handler logic is untouched; any unrecognized error still crashes (the safe default), and the first-seen-stack logging still surfaces the un-guarded callsite so the real missing `.catch` gets fixed.
9
+
10
+ ## What to Tell Your User
11
+
12
+ Nothing they need to do. The practical effect is fewer unexpected restarts: during a brief internet/API/peer-machine outage, the agent now rides it out and keeps serving instead of restarting itself over a failed background network call.
13
+
14
+ ## Summary of New Capabilities
15
+
16
+ - The crash safety net treats transient network-class failures (`fetch failed` and common connection error codes) as recoverable — log-and-continue, not crash — while keeping the safe default (crash) for every unrecognized error.
17
+
18
+ ## Evidence
19
+
20
+ - Unit tests: `tests/unit/uncaughtExceptionPolicy.test.ts` — 11/11 pass, including the new network-class positive cases (incl. undici `TypeError('fetch failed')`) and boundary cases proving `assertion failed` / `migration failed` / sqlite / undefined-property errors still crash.
21
+ - Root cause in logs: `logs/server.log` 2026-06-15T01:50:28Z `[FATAL] Uncaught exception — closing databases before crash: fetch failed`, preceded by `[lease-wire] broadcast to m_4cbc… became unreachable: fetch failed`.
22
+ - Side-effects review + independent adversarial second pass (concur): `upgrades/side-effects/cmt1548-uncaught-fetch-degrade.md`.
@@ -0,0 +1,99 @@
1
+ # Side-Effects Review — network-class uncaught exceptions degrade, not crash (CMT-1548)
2
+
3
+ **Version / slug:** `cmt1548-uncaught-fetch-degrade`
4
+ **Date:** `2026-06-15`
5
+ **Author:** `Echo (instar-dev agent)`
6
+ **Second-pass reviewer:** `adversarial reviewer subagent (see §Second-pass)`
7
+
8
+ ## Summary of the change
9
+
10
+ The server's top-level `process.on('uncaughtException')` handler (`src/commands/server.ts`) crashes by default and only log-and-continues for errors whose message matches `NON_FATAL_UNCAUGHT_PATTERNS` in `src/core/uncaughtExceptionPolicy.ts`. That allowlist already covers HTTP double-response races, the Slack `Sent before connected` reconnect race, and standby read-only writes — but NOT network-class failures. On 2026-06-15 a transient `fetch failed` (the multi-machine lease-wire broadcasting to an offline peer, during an upstream/API outage) hit the handler as an uncaught exception and crashed the whole server (it auto-restarted ~50s later). This change adds network-class tokens (`fetch failed`, `ECONNREFUSED`, `ECONNRESET`, `ETIMEDOUT`, `ENOTFOUND`, `EAI_AGAIN`, `socket hang up`) to the allowlist, with a justification comment, plus unit tests on both sides of the boundary. Files: `src/core/uncaughtExceptionPolicy.ts` (+7 patterns +comment), `tests/unit/uncaughtExceptionPolicy.test.ts` (+1 positive block, +2 negative boundary cases). Handler logic untouched.
11
+
12
+ ## Decision-point inventory
13
+
14
+ - `isNonFatalUncaught() recoverable-vs-fatal boundary` (`src/core/uncaughtExceptionPolicy.ts`) — **modify** — extends the existing recoverable allowlist with network-class tokens. Default for any unmatched error stays crash (the safe default).
15
+
16
+ ---
17
+
18
+ ## 1. Over-block
19
+
20
+ In this context "over-block" = **suppressing an uncaught error that SHOULD have crashed** (false-recoverable). The risk: an error whose message merely *contains* a network token (e.g. a programming bug whose message happens to include "socket hang up") is now suppressed and the process keeps running in a possibly-degraded state instead of crashing clean.
21
+
22
+ Concrete shapes considered: `'fetch failed'` and the `E*` codes are emitted by Node/undici for genuine network failures only — they are not substrings of common logic-bug messages. `'socket hang up'` and `'ECONNRESET'` are the broadest; both are still network-transport phrases, not general-purpose words. The boundary test asserts `'assertion failed'` / `'migration failed'` stay fatal (a bare "failed" is NOT matched — we match specific tokens). Residual risk is low and is the deliberate trade: a transient network blip must not crash the agent, and the first-seen-stack log surfaces any wrongly-suppressed origin for follow-up.
23
+
24
+ ---
25
+
26
+ ## 2. Under-block
27
+
28
+ "Under-block" = **still crashing on something that was recoverable**. A network failure surfaced with a message outside this token set (e.g. a custom wrapper that rethrows "upstream unavailable" without the underlying code) would still crash. That is acceptable — the allowlist is intentionally tight; we add tokens as real crashes prove them recoverable rather than matching a broad "any network-ish word." The belt-and-suspenders follow-up (a `.catch` at the lease-wire broadcast + slack reconnect fetch paths) is the primary fix; this policy entry is the backstop.
29
+
30
+ ---
31
+
32
+ ## 3. Level-of-abstraction fit
33
+
34
+ Correct layer. This is the process-level last-resort crash backstop — the established home for "this isolated error must not take the whole agent down." It is intentionally a low-level substring detector defaulting to the SAFE direction (crash on anything unrecognized). It does not replace the proper fix (guard the originating fetch with `.catch`); it prevents a missing-catch from escalating a transient outage into a crash-loop. A higher-level gate is not appropriate for a synchronous uncaught-exception handler.
35
+
36
+ ---
37
+
38
+ ## 4. Signal vs authority compliance
39
+
40
+ **Required reference:** docs/signal-vs-authority.md
41
+
42
+ - [x] Yes — but the logic is the EXISTING, blessed crash-backstop pattern, and it fails toward the safe default (crash) for anything unmatched.
43
+
44
+ This entry extends an established allowlist whose whole design is: recognized-isolated → continue; everything else → crash. It holds "crash vs continue" authority, but with the conservative default (unknown ⇒ crash). It is not a new brittle detector owning block-authority over user input; it is the same precedent already shipping for HTTP races, Slack reconnects, and standby writes. The first-seen-stack diagnostic preserves the path to fixing the real missing-catch. No reshaping needed; the design matches the existing pattern exactly.
45
+
46
+ ---
47
+
48
+ ## 5. Interactions
49
+
50
+ - **Shadowing:** runs only inside the top-level `uncaughtException` handler; no ordering against other checks. A matched error returns before `closeAllSqlite()` + `process.exit(1)`. No other check is shadowed.
51
+ - **Double-fire:** none — a single uncaught exception is handled once.
52
+ - **Races:** none — the policy is a pure substring function over the error message; no shared state. (`shouldLogStackForUncaught`'s dedup set is unchanged.)
53
+ - **Feedback loops:** the change REDUCES a feedback loop — it stops the boot→transient-fetch-fail→FATAL→respawn→… crash-loop on a sustained outage.
54
+
55
+ ---
56
+
57
+ ## 6. External surfaces
58
+
59
+ Fleet-wide runtime behavior change: every instar server will now log-and-continue (instead of crash+respawn) on an uncaught network-class error. User-visible effect is strictly positive (fewer unexpected restarts during upstream/network outages). No change to response formats, ledgers, databases, or any persistent state. No external system (Telegram/Slack/GitHub/Cloudflare) is called differently. No timing dependence. **Operator surface:** none — this change adds no operator-facing action.
60
+
61
+ ---
62
+
63
+ ## 6b. Operator-surface quality (Operator-Surface Quality standard)
64
+
65
+ No operator surface — not applicable. This change touches no dashboard renderer, approval page, or grant/revoke/secret-drop form.
66
+
67
+ ---
68
+
69
+ ## 7. Multi-machine posture (Cross-Machine Coherence)
70
+
71
+ **machine-local BY DESIGN** — the crash handler is per-process; each machine's server runs its own `uncaughtException` handler, and that is correct (a crash decision is inherently about the local process). There is no cross-machine state to replicate. Note the *motivating* failure was a multi-machine path (the lease-wire peer broadcast to an offline peer threw the uncaught `fetch failed`), so the benefit accrues most on multi-machine installs — but the fix itself is correctly per-process. Emits no user-facing notices (no one-voice gating needed). Holds no durable state (nothing strands on topic transfer). Generates no URLs.
72
+
73
+ ---
74
+
75
+ ## 8. Rollback cost
76
+
77
+ Pure code change — revert the two files and ship as the next patch. No data migration, no persistent state, no agent-state repair, no user-visible regression during the rollback window. Reversible by removing the added patterns; the allowlist returns to its prior behavior immediately on the reverted build.
78
+
79
+ ---
80
+
81
+ ## Conclusion
82
+
83
+ A tight, additive, well-precedented fix to the existing crash-backstop allowlist that resolves a real fleet-wide failure mode (a transient network error crashing the whole server during an outage). The review surfaced one genuine residual risk — over-suppression of a non-network error whose message coincidentally contains a network token — mitigated by keeping the token set specific (boundary test proves a bare "failed" is not matched) and by the first-seen-stack diagnostic that surfaces any wrongly-suppressed origin. Clear to ship as a Tier-1 change.
84
+
85
+ ---
86
+
87
+ ## Second-pass review (if required)
88
+
89
+ **Reviewer:** adversarial reviewer subagent
90
+ **Independent read of the artifact: concur**
91
+
92
+ Independently verified that the tests are green (11/11 via `vitest run`). Decisive adversarial check: the genuinely-fatal Node/TS error families — OOM (`JavaScript heap out of memory`), `SQLITE_CORRUPT` / `database is locked` / `database is closed`, `EMFILE` / `ENOSPC`, `EPIPE`, `Maximum call stack size exceeded`, and the classic `Cannot read properties of undefined` — all still CRASH, because the network-transport tokens are disjoint from those messages (confirmed by direct substring testing). The `E*` codes are SCREAMING_SNAKE and never appear inside ordinary logic-bug prose, so the `includes` matcher is safe. The only over-suppression constructible (`fetch failed` hiding a non-transient TLS/DNS `.cause`, or a contrived property literally named `'ECONNRESET'`) is either implausible or genuinely isolated-and-recoverable per the handler's contract, with the first-seen-stack diagnostic still surfacing the origin. The boundary tests (`assertion failed`/`migration failed` stay fatal) are load-bearing and correctly prove no bare-`failed` overreach. Clear to ship.
93
+
94
+ ---
95
+
96
+ ## Evidence pointers
97
+
98
+ - Unit test: `tests/unit/uncaughtExceptionPolicy.test.ts` — 11/11 pass (verified locally via `vitest run`), incl. the new network-class positive block and the `assertion failed`/`migration failed` negative boundary cases.
99
+ - Root-cause log evidence: `logs/server.log` 2026-06-15T01:50:28Z `[FATAL] Uncaught exception — closing databases before crash: fetch failed`, preceded by `[lease-wire] broadcast to m_4cbc... became unreachable: fetch failed`.