@druumen/sessions-db 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/CHANGELOG.md +249 -0
  2. package/LICENSE +201 -0
  3. package/NOTICE +10 -0
  4. package/README.md +250 -0
  5. package/cli/_write-helpers.mjs +99 -0
  6. package/cli/alias.mjs +115 -0
  7. package/cli/argparse.mjs +296 -0
  8. package/cli/close.mjs +116 -0
  9. package/cli/find.mjs +185 -0
  10. package/cli/format.mjs +277 -0
  11. package/cli/link-parent.mjs +133 -0
  12. package/cli/link.mjs +132 -0
  13. package/cli/rebuild.mjs +98 -0
  14. package/cli/sessions-db-session-start-main.mjs +454 -0
  15. package/cli/sessions-db-session-start.mjs +56 -0
  16. package/cli/sessions-db.mjs +119 -0
  17. package/cli/sweep.mjs +171 -0
  18. package/cli/tree.mjs +127 -0
  19. package/lib/git-context.mjs +479 -0
  20. package/lib/identity.mjs +616 -0
  21. package/lib/index.mjs +145 -0
  22. package/lib/init.mjs +185 -0
  23. package/lib/lock.mjs +86 -0
  24. package/lib/operations.mjs +490 -0
  25. package/lib/paths.mjs +199 -0
  26. package/lib/projection.mjs +496 -0
  27. package/lib/sanitize.mjs +131 -0
  28. package/lib/storage.mjs +759 -0
  29. package/lib/sweep.mjs +209 -0
  30. package/lib/transcript.mjs +230 -0
  31. package/lib/types.mjs +276 -0
  32. package/lib/uuid.mjs +116 -0
  33. package/lib/watch.mjs +217 -0
  34. package/package.json +53 -0
  35. package/types/git-context.d.mts +98 -0
  36. package/types/identity.d.mts +658 -0
  37. package/types/index.d.mts +10 -0
  38. package/types/index.d.ts +127 -0
  39. package/types/init.d.mts +53 -0
  40. package/types/lock.d.mts +18 -0
  41. package/types/operations.d.mts +204 -0
  42. package/types/paths.d.mts +54 -0
  43. package/types/projection.d.mts +79 -0
  44. package/types/sanitize.d.mts +39 -0
  45. package/types/storage.d.mts +276 -0
  46. package/types/sweep.d.mts +58 -0
  47. package/types/transcript.d.mts +59 -0
  48. package/types/types.d.mts +255 -0
  49. package/types/uuid.d.mts +17 -0
  50. package/types/watch.d.mts +33 -0
@@ -0,0 +1,496 @@
1
+ /**
2
+ * Pure projection logic for sessions-db.
3
+ *
4
+ * Events are appended to `events.jsonl` (SSoT). The projection cache
5
+ * (`sessions-db.json`) is a fold of all events: `events → reduce → state`.
6
+ * This module contains zero IO — it only knows how to fold one or more
7
+ * events into a projection object. The `storage.mjs` wrapper handles disk.
8
+ *
9
+ * Schema v0.2 — see Phase 1 ticket §2 "Projection schema".
10
+ *
11
+ * Idempotency contract:
12
+ * - Applying the same event sequence twice yields equivalent projections
13
+ * (sessions are merged, not duplicated; arrays are deduped where their
14
+ * identity is well-defined; counters are recomputed from event count, not
15
+ * incremented).
16
+ * - Reducers mutate `projection` in place and return the same reference;
17
+ * callers can use either the return value or the mutated input.
18
+ */
19
+
20
// Projection schema version stamped into `_meta.schema_version` of every
// projection built by this module.
const SCHEMA_VERSION = 2;

// Identifiers of the fingerprint algorithms a projection carries; they match
// the keys of `session.fingerprints`. Frozen because this is a shared
// module-level list: `emptyProjection` hands out copies, so nothing needs to
// mutate it, and freezing turns an accidental mutation into a TypeError.
const FINGERPRINT_VERSIONS = Object.freeze(['first_human_prompt_v1', 'lineage_prefix_v1']);
22
+
23
/**
 * Create a projection that has seen no events yet.
 *
 * The metadata starts with `event_count` at 0, `last_event_id` and `updated`
 * at null, and a private copy of the fingerprint-version list so callers can
 * never mutate the module-level constant through the returned object.
 *
 * @returns {{ _meta: object, sessions: Record<string, object> }}
 */
export function emptyProjection() {
  const meta = {
    schema_version: SCHEMA_VERSION,
    fingerprint_versions: Array.from(FINGERPRINT_VERSIONS),
    updated: null,
    event_count: 0,
    last_event_id: null,
  };
  return { _meta: meta, sessions: {} };
}
41
+
42
/**
 * Create the default record for a session that has just been observed.
 *
 * Every field starts at its "nothing known yet" value: null for scalars,
 * an empty array for lists, `'active'` / `'open'` for the lifecycle fields.
 * Key order is fixed so serialised projections stay stable.
 *
 * @param {string} stableId - Stable identifier of the session.
 * @param {string} ts - ISO timestamp string; used for both `created_at` and
 *   `last_progress_at` (typically the `ts` of the first observing event).
 * @returns {object} A fresh session record (no arrays or objects are shared
 *   between calls).
 */
export function emptySession(stableId, ts) {
  const record = {
    // Identity.
    stable_id: stableId,
    alias: null,
    claude_session_ids: [],
    transcript_files: [],
    fingerprints: {
      first_human_prompt_v1: null,
      lineage_prefix_v1: null,
    },

    // Lineage.
    parent_session_id: null,
    parent_candidate_ids: [],
    // How many parent candidates the latest session_seen left out because of
    // the MAX_PARENT_CANDIDATES cap. Zero means `parent_candidate_ids` is
    // complete; a positive value tells CLI / audit tooling to show "+ N more"
    // or to drill down via a rebuild from events. Last-write-wins, like
    // `identity_resolution`.
    parent_candidates_omitted_count: 0,
    // How the latest session_seen resolved this stable_id. Starts as null and
    // is replaced by reduceSessionSeen whenever an event payload carries it,
    // so it always reflects the most recent signal set.
    identity_resolution: null,

    // Worktree / git context.
    worktree_path_observed: null,
    worktree_realpath: null,
    worktree_registry_name: null,
    git_common_dir: null,
    branch_at_start: null,
    branch_current: null,
    head_at_start: null,
    head_last_seen: null,

    // Links.
    tasks: [],
    projects: [],

    // Lifecycle.
    activity_state: 'active',
    outcome: 'open',
    closed_at: null,
    closed_reason: null,
    created_at: ts,
    last_progress_at: ts,
    first_prompt_preview: null,
  };
  return record;
}
92
+
93
/**
 * Apply a single event to a projection (mutating) and return the same
 * projection reference so calls can be chained.
 *
 * The session named by `event.stable_id` is created on demand, the op-specific
 * reducer runs, `last_progress_at` is advanced for every op except `sweep`,
 * and `_meta` is updated. Unknown ops leave the session untouched but still
 * count towards `_meta.event_count`, so an older binary replaying a newer
 * event log degrades cleanly and the rebuild detector stays accurate.
 *
 * Session lookup is own-property only. A plain `sessions[stableId]` read
 * would resolve inherited keys such as `__proto__` or `constructor`, and the
 * reducers would then write onto `Object.prototype` / `Object` instead of a
 * session record.
 *
 * @param {object} projection - Must carry `sessions` and `_meta` objects.
 * @param {{ ts: string, event_id: string, op: string, stable_id: string,
 *   payload?: object }} event
 * @returns {object} The `projection` argument.
 * @throws {TypeError} If the projection is missing/malformed, the event is
 *   missing, or `event.stable_id` is not a non-empty string.
 */
export function applyEvent(projection, event) {
  // `_meta` is validated up front so a malformed projection is rejected
  // before any session has been mutated.
  if (
    !projection ||
    typeof projection !== 'object' ||
    !projection.sessions ||
    !projection._meta ||
    typeof projection._meta !== 'object'
  ) {
    throw new TypeError('applyEvent: projection missing or malformed');
  }
  if (!event || typeof event !== 'object') {
    throw new TypeError('applyEvent: event missing');
  }
  const { op, stable_id: stableId, ts } = event;
  if (typeof stableId !== 'string' || stableId.length === 0) {
    throw new TypeError('applyEvent: event.stable_id required');
  }

  // Every op is session-scoped, so eager creation is safe. Only OWN keys
  // count as existing sessions (see the function comment).
  const { sessions } = projection;
  let session = Object.prototype.hasOwnProperty.call(sessions, stableId)
    ? sessions[stableId]
    : undefined;
  if (!session) {
    session = emptySession(stableId, ts);
    // defineProperty instead of assignment: assigning to `__proto__` would
    // swap the prototype of `sessions` rather than add an entry.
    Object.defineProperty(sessions, stableId, {
      value: session,
      writable: true,
      enumerable: true,
      configurable: true,
    });
  }

  switch (op) {
    case 'session_seen':
      reduceSessionSeen(session, event);
      break;
    case 'session_link':
      reduceSessionLink(session, event);
      break;
    case 'alias_set':
      reduceAliasSet(session, event);
      break;
    case 'parent_set':
      reduceParentSet(session, event);
      break;
    case 'close':
      reduceClose(session, event);
      break;
    case 'sweep':
      reduceSweep(session, event);
      break;
    case 'session_unlink':
      reduceSessionUnlink(session, event);
      break;
    case 'manual_link':
      reduceManualLink(session, event);
      break;
    default:
      // Unknown op: nothing to do on the session; `_meta` below still
      // accounts for the event so callers can detect drift.
      break;
  }

  // Ops that represent real activity advance last_progress_at. `sweep` is a
  // maintenance op and is excluded; its reducer moves the timestamp itself
  // when the payload asks for it. Out-of-order timestamps never move the
  // value backwards (lexical compare is correct for ISO 8601 strings).
  if (op !== 'sweep' && ts && (!session.last_progress_at || ts > session.last_progress_at)) {
    session.last_progress_at = ts;
  }

  // events.jsonl order is canonical, so the latest event always wins here.
  projection._meta.event_count += 1;
  projection._meta.last_event_id = event.event_id ?? projection._meta.last_event_id;
  projection._meta.updated = ts ?? projection._meta.updated;

  return projection;
}
176
+
177
/**
 * Build a projection from scratch by applying `events` in order to an empty
 * projection. Serves full rebuilds (storage.rebuildProjection) as well as
 * unit tests.
 *
 * @param {Array<object>} events - Events in canonical order. Anything that is
 *   not an array yields an empty projection.
 * @returns {object} The freshly built projection.
 */
export function rebuildFromEvents(events) {
  const projection = emptyProjection();
  if (Array.isArray(events)) {
    for (let i = 0; i < events.length; i += 1) {
      applyEvent(projection, events[i]);
    }
  }
  return projection;
}
191
+
192
+ // ---------------------------------------------------------------------------
193
+ // Per-op reducers (each isolated for testability).
194
+ // ---------------------------------------------------------------------------
195
+
196
/**
 * Reducer for `session_seen`: merge one observation of a session into its
 * record.
 *
 * Merge policy per field group:
 *   - claude_session_ids: append, deduplicated.
 *   - transcript_files: keyed by `path`; a repeat observation is merged over
 *     the stored entry so the newest data wins.
 *   - fingerprints, branch_at_start, head_at_start, first_prompt_preview, cwd:
 *     first write wins.
 *   - worktree/git "current" fields, identity_resolution,
 *     parent_candidates_omitted_count: last write wins.
 *   - parent_candidate_ids: accumulate, deduplicated by candidate id.
 *
 * @param {object} session - Session record, mutated in place.
 * @param {{ payload?: object }} event
 */
function reduceSessionSeen(session, event) {
  const payload = event.payload ?? {};

  // A fork or resume of the same logical session shows up as an additional
  // Claude session id.
  const claudeId = payload.claude_session_id;
  if (
    typeof claudeId === 'string' &&
    claudeId.length > 0 &&
    !session.claude_session_ids.includes(claudeId)
  ) {
    session.claude_session_ids.push(claudeId);
  }

  // One entry per transcript path. Re-observing a path overlays the incoming
  // fields on the stored entry.
  const incomingFile = payload.transcript_file;
  if (incomingFile && typeof incomingFile === 'object') {
    const files = session.transcript_files;
    const at = files.findIndex((known) => known && known.path === incomingFile.path);
    if (at >= 0) {
      files[at] = { ...files[at], ...incomingFile };
    } else {
      files.push({ ...incomingFile });
    }
  }

  // v1 fingerprints are pinned by the first observation that supplies them.
  const incomingPrints = payload.fingerprints;
  if (incomingPrints && typeof incomingPrints === 'object') {
    for (const algo of ['first_human_prompt_v1', 'lineage_prefix_v1']) {
      if (session.fingerprints[algo] == null && typeof incomingPrints[algo] === 'string') {
        session.fingerprints[algo] = incomingPrints[algo];
      }
    }
  }

  // Recency-sensitive context: any non-nullish incoming value replaces the
  // stored one.
  const latestWins = [
    'worktree_path_observed',
    'worktree_realpath',
    'worktree_registry_name',
    'git_common_dir',
    'branch_current',
    'head_last_seen',
  ];
  for (const field of latestWins) {
    const value = payload[field];
    if (value != null) session[field] = value;
  }

  // Historical context: captured once and never overwritten afterwards.
  for (const field of ['branch_at_start', 'head_at_start', 'first_prompt_preview']) {
    const value = payload[field];
    if (session[field] == null && value != null) session[field] = value;
  }
  if (typeof payload.cwd === 'string' && session.cwd == null) {
    session.cwd = payload.cwd;
  }

  // Keep the resolution outcome carried by the latest event, so a session
  // that was first 'minted' and later corroborated shows its latest path.
  if (payload.identity_resolution && typeof payload.identity_resolution === 'object') {
    session.identity_resolution = payload.identity_resolution;
  }

  // Omitted-candidate count: only a finite, non-negative number replaces the
  // stored value; a missing or malformed field leaves it alone.
  const omitted = payload.parent_candidates_omitted_count;
  if (Number.isFinite(omitted) && omitted >= 0) {
    session.parent_candidates_omitted_count = omitted;
  }
  // Records persisted before the field existed get it materialised as 0 so
  // readers do not have to guard against its absence.
  if (typeof session.parent_candidates_omitted_count !== 'number') {
    session.parent_candidates_omitted_count = 0;
  }

  // Parent candidates are hints (not promotions to parent_session_id) and are
  // additive across observations: once seen, a candidate stays listed.
  if (!Array.isArray(payload.parent_candidate_ids)) return;

  // session_seen candidates identify themselves via `stable_id`, manual links
  // via `parent_id`; `id` is accepted as a last resort.
  const idFields = ['stable_id', 'parent_id', 'id'];
  // Incoming candidates need a NON-EMPTY string id to be considered at all.
  const incomingId = (entry) => {
    for (const field of idFields) {
      const value = entry[field];
      if (typeof value === 'string' && value.length > 0) return value;
    }
    return null;
  };
  // Stored entries are identified by the first id field that holds a string.
  const storedId = (entry) => {
    for (const field of idFields) {
      if (typeof entry[field] === 'string') return entry[field];
    }
    return null;
  };

  for (const candidate of payload.parent_candidate_ids) {
    if (!candidate || typeof candidate !== 'object') continue;
    const wanted = incomingId(candidate);
    if (wanted === null) continue;
    const alreadyListed = session.parent_candidate_ids.some(
      (known) => storedId(known) === wanted,
    );
    if (!alreadyListed) session.parent_candidate_ids.push({ ...candidate });
  }
}
319
+
320
/**
 * Reducer for `session_link`: add the named tasks / projects to the session
 * (non-empty strings only, no duplicates).
 *
 * Legacy guard: P4-era `link --remove` wrote `session_link` events flagged
 * with `payload.remove: true`, but the reducer of that era ignored the flag
 * and such events ADDED the listed ids. P5 introduced `session_unlink` as the
 * real remove op. A flagged event is now a complete no-op:
 *   - it must not add, or a rebuild from events would silently resurrect
 *     links the operator wanted gone;
 *   - it must not remove either, because later legitimate events may have
 *     re-added the same ids and a retroactive unlink would drop them again.
 * Operators who still want the removal re-issue `link --remove`, which writes
 * `session_unlink` under P5 (see cli/link.mjs).
 *
 * @param {object} session - Session record, mutated in place.
 * @param {{ payload?: object }} event
 */
function reduceSessionLink(session, event) {
  const payload = event.payload ?? {};
  if (payload.remove === true) return;

  // Append every usable id from `incoming` that `target` does not hold yet.
  const appendUnique = (target, incoming) => {
    if (!Array.isArray(incoming)) return;
    for (const item of incoming) {
      if (typeof item !== 'string' || item.length === 0) continue;
      if (!target.includes(item)) target.push(item);
    }
  };

  appendUnique(session.tasks, payload.tasks);
  appendUnique(session.projects, payload.projects);
}
361
+
362
/**
 * Reducer for `alias_set`.
 *
 * `payload.alias === null` clears the alias and a non-empty string replaces
 * it. Any other value (missing, empty string, wrong type) leaves the session
 * unchanged.
 *
 * @param {object} session - Session record, mutated in place.
 * @param {{ payload?: object }} event
 */
function reduceAliasSet(session, event) {
  const { alias } = event.payload ?? {};
  const clears = alias === null;
  const renames = typeof alias === 'string' && alias.length > 0;
  if (clears || renames) {
    session.alias = alias;
  }
}
372
+
373
/**
 * Reducer for `parent_set`.
 *
 * `payload.parent_session_id === null` detaches the session from its parent
 * and a non-empty string sets the parent. Any other value is ignored.
 *
 * @param {object} session - Session record, mutated in place.
 * @param {{ payload?: object }} event
 */
function reduceParentSet(session, event) {
  const { parent_session_id: parentId } = event.payload ?? {};
  const detaches = parentId === null;
  const attaches = typeof parentId === 'string' && parentId.length > 0;
  if (detaches || attaches) {
    session.parent_session_id = parentId;
  }
}
384
+
385
/**
 * Reducer for `close`.
 *
 * - `outcome` is replaced when the payload carries a non-empty string.
 * - `closed_at` becomes the event's `ts`; when the event has no `ts` the
 *   stored value is kept.
 * - `closed_reason` is replaced by a string or cleared by an explicit null;
 *   anything else leaves it untouched.
 *
 * @param {object} session - Session record, mutated in place.
 * @param {{ ts?: string, payload?: object }} event
 */
function reduceClose(session, event) {
  const { outcome, closed_reason: reason } = event.payload ?? {};

  if (typeof outcome === 'string' && outcome.length > 0) {
    session.outcome = outcome;
  }

  // The moment of closure is the event timestamp.
  const closedAt = event.ts ?? session.closed_at;
  session.closed_at = closedAt;

  if (typeof reason === 'string' || reason === null) {
    session.closed_reason = reason;
  }
}
398
+
399
/**
 * Reducer for `sweep` (maintenance op).
 *
 * - `activity_state` is replaced when the payload carries a non-empty string.
 * - `effective_last_progress` (a string) may only move `last_progress_at`
 *   FORWARD: it is applied when nothing is stored yet or when it compares
 *   greater than the stored value. A sweep never lowers the timestamp.
 *
 * @param {object} session - Session record, mutated in place.
 * @param {{ payload?: object }} event
 */
function reduceSweep(session, event) {
  const {
    activity_state: nextState,
    effective_last_progress: effective,
  } = event.payload ?? {};

  if (typeof nextState === 'string' && nextState.length > 0) {
    session.activity_state = nextState;
  }

  if (typeof effective !== 'string') return;

  // The sweep-supplied time can be later than the event-driven one when it
  // reflects an externally measured idle decision.
  const stored = session.last_progress_at;
  if (!stored || effective > stored) {
    session.last_progress_at = effective;
  }
}
416
+
417
/**
 * P5 reducer for `session_unlink`: the removing counterpart of
 * `reduceSessionLink`.
 *
 * The payload has the same shape as `session_link` (`tasks` / `projects`
 * arrays). Every named id is filtered out of the matching session array.
 *
 * Idempotent: ids that are not present are simply not found. The ids to
 * remove are gathered into a Set per payload field, so repeated ids in the
 * payload (e.g. `--task X --task X`) collapse and the filter stays O(n+m).
 *
 * @param {object} session - Session record; `tasks` / `projects` are replaced
 *   by filtered copies when something is to be removed.
 * @param {{ payload?: object }} event
 */
function reduceSessionUnlink(session, event) {
  const payload = event.payload ?? {};

  for (const field of ['tasks', 'projects']) {
    const named = payload[field];
    if (!Array.isArray(named) || named.length === 0) continue;

    const doomed = new Set(
      named.filter((id) => typeof id === 'string' && id.length > 0),
    );
    if (doomed.size === 0 || !Array.isArray(session[field])) continue;

    session[field] = session[field].filter((id) => !doomed.has(id));
  }
}
450
+
451
/**
 * Reducer for `manual_link`: add operator-supplied parent candidates to
 * `session.parent_candidate_ids`, deduplicated by candidate id.
 *
 * The list is shared with `reduceSessionSeen`, whose candidates identify
 * themselves via `stable_id`, while manual links use `parent_id`. Dedup
 * therefore recognises all three id fields (`parent_id`, then `id`, then
 * `stable_id`), so manually linking a candidate that session_seen already
 * surfaced does not create a second entry for the same session.
 *
 * Non-object payload entries (null, raw strings, ...) are skipped. Candidates
 * that carry no id at all have no well-defined identity and are appended
 * as-is.
 *
 * @param {object} session - Session record, mutated in place.
 * @param {{ payload?: object }} event
 */
function reduceManualLink(session, event) {
  const payload = event.payload ?? {};
  if (!Array.isArray(payload.parent_candidate_ids)) return;

  // Resolve the identifying string of a candidate entry, or null when it has
  // none. Tolerates null / non-object entries that may already sit in a list
  // loaded from disk instead of crashing on them.
  const candidateKey = (entry) => {
    if (!entry || typeof entry !== 'object') return null;
    for (const field of ['parent_id', 'id', 'stable_id']) {
      if (typeof entry[field] === 'string') return entry[field];
    }
    return null;
  };

  for (const candidate of payload.parent_candidate_ids) {
    if (!candidate || typeof candidate !== 'object') continue;

    const wanted = candidateKey(candidate);
    const alreadyListed =
      wanted !== null &&
      session.parent_candidate_ids.some((known) => candidateKey(known) === wanted);

    if (!alreadyListed) {
      // Store a shallow copy so later payload mutation cannot leak in.
      session.parent_candidate_ids.push({ ...candidate });
    }
  }
}
479
+
480
+ // ---------------------------------------------------------------------------
481
+ // Helpers
482
+ // ---------------------------------------------------------------------------
483
+
484
/**
 * Copy `source[key]` onto `target[key]` unless the source value is null or
 * undefined (last-write-wins for values that are actually supplied).
 *
 * @param {object} target - Object to update.
 * @param {object} source - Object to read from.
 * @param {string} key - Property name shared by both.
 */
function setIfPresent(target, source, key) {
  const incoming = source[key];
  if (incoming == null) return;
  target[key] = incoming;
}
490
+
491
/**
 * Copy `source[key]` onto `target[key]` only when the target currently holds
 * null/undefined there AND the source value is neither null nor undefined
 * (first-write-wins).
 *
 * @param {object} target - Object to update.
 * @param {object} source - Object to read from.
 * @param {string} key - Property name shared by both.
 */
function setIfMissing(target, source, key) {
  const incoming = source[key];
  const alreadySet = target[key] != null;
  if (alreadySet || incoming == null) return;
  target[key] = incoming;
}
@@ -0,0 +1,131 @@
1
+ /**
2
+ * First-prompt sanitizer for sessions-db.
3
+ *
4
+ * Why this exists: the first user message of a Claude Code transcript is
5
+ * routinely wrapped in injected blocks emitted by the IDE bridge or by the
6
+ * harness itself:
7
+ * - `<system-reminder>...</system-reminder>` — system/harness reminders.
8
+ * - `<system>...</system>` — generic system prompt envelope.
9
+ * - `<thinking>...</thinking>` — chain-of-thought leak guard.
10
+ * - `<tool_use>...</tool_use>` — assistant tool call (echoed back).
11
+ * - `<tool_result>...</tool_result>` — tool output echo.
12
+ * - `<parameter>...</parameter>` — tool call argument body.
13
+ * - `<ide_opened_file>...</ide_opened_file>` — IDE "user has this file
14
+ * open" hint, which leaks file paths.
15
+ * - `<ide_selection>...</ide_selection>` — IDE "user highlighted these
16
+ * lines" hint, which leaks selected source code into the prompt preview.
17
+ * - `<command-name>...</command-message>` — slash command wrapper.
18
+ * If we naively persisted that text to disk we would (a) leak file paths and
19
+ * other IDE state, and (b) blow the preview budget on noise instead of the
20
+ * user's actual prompt. So we NFKC-normalise first (fold fullwidth → ASCII so
21
+ * disguised tags get caught), strip the wrappers in two passes (defensive
22
+ * against a wrapper revealed only after a sibling is removed), then trim and
23
+ * truncate to a safe preview length (default 200) on a Unicode code-point
24
+ * boundary so a surrogate pair is never split in half.
25
+ *
26
+ * Note on HTML entities: we DO NOT entity-decode. `&lt;system-reminder&gt;`
27
+ * stays literally `&lt;system-reminder&gt;` in the preview — entities can be
28
+ * legitimate user content (e.g., quoted code), and decoding them before
29
+ * stripping would create a brand-new injection vector. The sanitizer's
30
+ * contract is byte-faithful pass-through for anything that is not an actual
31
+ * `<tag>...</tag>` wrapper.
32
+ */
33
+
34
// Every opening tag is matched as `<TAG\b[^>]*>`: the `[^>]*` tolerates a
// trailing space or attributes (`<system-reminder >`,
// `<system-reminder data-x="y">`), and the `\b` pins the tag name so a longer
// name such as `<system-reminderXYZ>` is NOT treated as the wrapper.
// Bodies are matched lazily up to the first closing tag, case-insensitively.
const SYSTEM_REMINDER_RE = /<system-reminder\b[^>]*>[\s\S]*?<\/system-reminder>/gi;
const SYSTEM_RE = /<system\b[^>]*>[\s\S]*?<\/system>/gi;
const THINKING_RE = /<thinking\b[^>]*>[\s\S]*?<\/thinking>/gi;
const TOOL_USE_RE = /<tool_use\b[^>]*>[\s\S]*?<\/tool_use>/gi;
const TOOL_RESULT_RE = /<tool_result\b[^>]*>[\s\S]*?<\/tool_result>/gi;
const PARAMETER_RE = /<parameter\b[^>]*>[\s\S]*?<\/parameter>/gi;

const IDE_OPENED_RE = /<ide_opened_file\b[^>]*>[\s\S]*?<\/ide_opened_file>/gi;
// The IDE injects the user's editor selection (highlighted source lines plus
// file path). Found in production on 2026-05-10 leaking selected code into
// the preview.
const IDE_SELECTION_RE = /<ide_selection\b[^>]*>[\s\S]*?<\/ide_selection>/gi;
// The slash-command wrapper opens with <command-name> and closes with
// </command-message>. The mismatch is the real shape, not a typo.
const COMMAND_WRAPPER_RE = /<command-name\b[^>]*>[\s\S]*?<\/command-message>/gi;

/**
 * Remove, in a single pass, every `<system-reminder>...</system-reminder>`
 * block from `s` together with the related harness/system envelopes
 * (`<system>`, `<thinking>`, `<tool_use>`, `<tool_result>`, `<parameter>`).
 *
 * @param {string} s
 * @returns {string} The stripped text; `''` when `s` is not a string.
 */
export function stripSystemReminders(s) {
  if (typeof s !== 'string') return '';
  return s
    .replace(SYSTEM_REMINDER_RE, '')
    .replace(SYSTEM_RE, '')
    .replace(THINKING_RE, '')
    .replace(TOOL_USE_RE, '')
    .replace(TOOL_RESULT_RE, '')
    .replace(PARAMETER_RE, '');
}

/**
 * Remove, in a single pass, the IDE/harness wrappers
 * (`<ide_opened_file>...`, `<ide_selection>...`,
 * `<command-name>...</command-message>`).
 *
 * @param {string} s
 * @returns {string} The stripped text; `''` when `s` is not a string.
 */
export function stripIdeWrappers(s) {
  if (typeof s !== 'string') return '';
  return s
    .replace(IDE_OPENED_RE, '')
    .replace(IDE_SELECTION_RE, '')
    .replace(COMMAND_WRAPPER_RE, '');
}

/**
 * Sanitise a raw first-prompt string for safe persistence.
 *
 * The order of the steps matters:
 *   1. NFKC-normalise FIRST. Fullwidth bracket variants (e.g.
 *      `＜system-reminder＞`) only fold into ASCII `<>` under NFKC; stripping
 *      before normalising would let such a wrapper survive the strip and
 *      leak its body once the text is normalised later.
 *   2. Strip both wrapper families (system envelopes, then IDE/harness
 *      wrappers) and REPEAT until a pass removes nothing. Removing one
 *      wrapper can splice the surrounding text into a fresh wrapper (e.g.
 *      `<sys` + IDE block + `tem>...</system>`), and such splices can be
 *      nested, so a fixed number of passes is not enough. The loop always
 *      terminates because every pass that changes the text shortens it.
 *   3. Normalise CRLF to LF, collapse runs of 3+ newlines to a paragraph
 *      break, and trim.
 *   4. Truncate to `maxLen` code points (default 200); the appended `…`
 *      counts towards the limit and a surrogate pair is never split.
 *
 * HTML entities are deliberately NOT decoded; `&lt;system&gt;` stays as-is.
 *
 * @param {string} raw
 * @param {{ maxLen?: number }} [opts] - `maxLen` must be a finite number
 *   greater than 0, otherwise 200 is used. A null `opts` is treated like an
 *   omitted one.
 * @returns {string} The sanitised preview; `''` when `raw` is not a string.
 */
export function sanitizeFirstPrompt(raw, opts = {}) {
  if (typeof raw !== 'string') return '';
  // `opts?.` because the `= {}` default only covers undefined, not null.
  const requested = opts?.maxLen;
  const maxLen = Number.isFinite(requested) && requested > 0 ? requested : 200;

  // (1) Normalise before anything else.
  let s = raw.normalize('NFKC');

  // (2) Strip to a fixed point. Removal can only shorten the string, so an
  // unchanged length means the pass found nothing left to remove.
  let lengthBefore;
  do {
    lengthBefore = s.length;
    s = stripIdeWrappers(stripSystemReminders(s));
  } while (s.length !== lengthBefore);

  // (3) Whitespace tidy.
  s = s.replace(/\r\n/g, '\n');
  s = s.replace(/\n{3,}/g, '\n\n');
  s = s.trim();

  // Fast path: the UTF-16 length is an upper bound on the code-point count.
  if (s.length <= maxLen) return s;

  // (4) Array.from iterates by code point, so slicing the resulting array
  // can never cut a surrogate pair in half.
  const codePoints = Array.from(s);
  if (codePoints.length <= maxLen) return s;
  return `${codePoints.slice(0, Math.max(0, maxLen - 1)).join('')}…`;
}