@lh8ppl/claude-memory-kit 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -47,6 +47,7 @@ import { existsSync, readdirSync, readFileSync, statSync } from 'node:fs';
47
47
  import { basename, join, relative } from 'node:path';
48
48
  import chokidar from 'chokidar';
49
49
  import { INDEX_DB_SCHEMA } from './index-db.mjs';
50
+ import { syncTranscriptChunks } from './transcript-index.mjs';
50
51
  import { readBullet, parseBulletProvenance } from './provenance.mjs';
51
52
  import { parse as parseFrontmatter } from './frontmatter.mjs';
52
53
  import {
@@ -145,7 +146,10 @@ export function parseObservationsFromScratchpad({
145
146
  projectRoot,
146
147
  userDir,
147
148
  }) {
148
- const lines = content.split('\n');
149
+ // Task 139 (D-126): CRLF-tolerant read — autocrlf clones rewrite the
150
+ // committed memory files; a strict-\n split left \r on every line and
151
+ // the bullet/provenance regexes went blind.
152
+ const lines = content.split(/\r?\n/);
149
153
  const sha1 = sha1OfContent(content);
150
154
  const source_file = relativeSource(path, { projectRoot, userDir });
151
155
  const baseName = basename(path);
@@ -435,6 +439,12 @@ export function reindexBoot({ projectRoot, userDir, db, now }) {
435
439
  });
436
440
  const knownPaths = db.prepare('SELECT path FROM files').all();
437
441
  for (const { path: relPath } of knownPaths) {
442
+ // Task 104.2 composition guard: 'transcript:'-prefixed checkpoints
443
+ // belong to the transcript scope (transcript-index.mjs) — they are
444
+ // never in the observation live-set and pruning them here would
445
+ // defeat that scope's checkpoint on every boot. Its own sync prunes
446
+ // its own orphans.
447
+ if (relPath.startsWith('transcript:')) continue;
438
448
  if (liveRelPaths.has(relPath)) continue;
439
449
  const obsCount = db
440
450
  .prepare('SELECT COUNT(*) AS n FROM observations WHERE source_file = ?')
@@ -443,12 +453,24 @@ export function reindexBoot({ projectRoot, userDir, db, now }) {
443
453
  }
444
454
  }
445
455
 
456
+ // Task 104.2 — sync the transcript scope (the L3 raw tier) in the same
457
+ // boot pass. Cheap: per-file sha1 checkpoint; best-effort — a transcript
458
+ // sync hiccup must not fail the observation reindex.
459
+ let transcripts = { files: 0, chunks: 0 };
460
+ try {
461
+ transcripts = syncTranscriptChunks({ db, projectRoot, now: ts });
462
+ } catch {
463
+ // best-effort; the next boot retries
464
+ }
465
+
446
466
  return {
447
467
  filesScanned,
448
468
  filesReindexed,
449
469
  observationsAffected,
450
470
  filesPruned,
451
471
  observationsPruned,
472
+ transcriptFiles: transcripts.files,
473
+ transcriptChunks: transcripts.chunks,
452
474
  durationMs: Date.now() - t0,
453
475
  skipped,
454
476
  };
@@ -464,13 +486,20 @@ export function reindexBoot({ projectRoot, userDir, db, now }) {
464
486
  export function reindexFull({ projectRoot, userDir, db, now }) {
465
487
  const t0 = Date.now();
466
488
  const ts = now ?? t0;
467
- // Drop + recreate (faster than per-row DELETE).
489
+ // Drop + recreate (faster than per-row DELETE). Task 104.2: the transcript
490
+ // scope drops + rebuilds with everything else — `files` carries its
491
+ // checkpoints, so a full reindex must re-chunk from scratch too.
468
492
  db.exec(`
469
493
  DROP TABLE IF EXISTS observations_fts;
470
494
  DROP TRIGGER IF EXISTS obs_after_insert;
471
495
  DROP TRIGGER IF EXISTS obs_after_update;
472
496
  DROP TRIGGER IF EXISTS obs_after_delete;
473
497
  DROP TABLE IF EXISTS observations;
498
+ DROP TABLE IF EXISTS transcript_chunks_fts;
499
+ DROP TRIGGER IF EXISTS tch_after_insert;
500
+ DROP TRIGGER IF EXISTS tch_after_update;
501
+ DROP TRIGGER IF EXISTS tch_after_delete;
502
+ DROP TABLE IF EXISTS transcript_chunks;
474
503
  DROP TABLE IF EXISTS files;
475
504
  `);
476
505
  db.exec(INDEX_DB_SCHEMA);
@@ -514,9 +543,20 @@ export function reindexFull({ projectRoot, userDir, db, now }) {
514
543
  observationsAffected += txn(source, sha1);
515
544
  }
516
545
 
546
+ // Task 104.2 — rebuild the transcript scope from scratch (its tables were
547
+ // dropped above). Best-effort, same contract as the boot-path sync.
548
+ let transcripts = { files: 0, chunks: 0 };
549
+ try {
550
+ transcripts = syncTranscriptChunks({ db, projectRoot, now: ts });
551
+ } catch {
552
+ // best-effort; the next reindex retries
553
+ }
554
+
517
555
  return {
518
556
  filesScanned,
519
557
  observationsAffected,
558
+ transcriptFiles: transcripts.files,
559
+ transcriptChunks: transcripts.chunks,
520
560
  durationMs: Date.now() - t0,
521
561
  skipped,
522
562
  };
@@ -61,6 +61,33 @@ function trustLabel(rank) {
61
61
  const DEFAULT_CAP_BYTES = 13_000;
62
62
  const HOOK_EVENT_NAME = 'SessionStart';
63
63
 
64
+ // Task 75.0 (D-64 / memory-os Layer-07 "Ground Truth", D-73 near-verbatim):
65
+ // injecting memory is insufficient — the agent must be TOLD the injected
66
+ // context is authoritative, or it re-derives from code what the snapshot
67
+ // already answers (the D-40 cold-open failure). This preamble leads every
68
+ // non-empty snapshot. It is code-generated (not template-scaffolded) on
69
+ // purpose: always present, never consolidated/evicted/graduated, and
70
+ // existing installs pick it up on upgrade with no re-scaffold (avoids the
71
+ // Task-73 stale-template class).
72
+ //
73
+ // §7.1 composition: the preamble + its 2 joining newlines must fit the
74
+ // 725-byte slack between Σ TIER_BUDGETS (12,275) and DEFAULT_CAP_BYTES
75
+ // (13,000) — worst case 12,275 + len + 2 ≤ 13,000, i.e. len ≤ 723. The
76
+ // boundary test pins len ≤ 700. injectContext also subtracts the reserve
77
+ // from the cap handed to enforceCap, so custom capBytes stay honored.
78
+ export const AUTHORITATIVE_MEMORY_PREAMBLE = [
79
+ '# Injected memory — AUTHORITATIVE (claude-memory-kit)',
80
+ '',
81
+ 'Ground-truth ranking: (1) terminal/tool output → live system state;',
82
+ '(2) THIS snapshot + `cmk search` → documented knowledge & prior decisions;',
83
+ '(3) official docs → version-specifics; (4) training knowledge → verify against 1-3.',
84
+ '',
85
+ 'When injected memory contradicts your assumptions, injected memory wins.',
86
+ 'Lead with memory — never re-derive from code what it already answers, and',
87
+ 'never treat a question as novel when the answer is already in your prompt.',
88
+ 'This snapshot is a bounded hot index; `cmk search "<topic>"` reaches the facts not shown here.',
89
+ ].join('\n');
90
+
64
91
  // Match any line containing a `(P-XXXXXXXX)`-shaped citation id. Looser
65
92
  // than ID_PATTERN on purpose — alphabet-validation is the writer's job;
66
93
  // here we just want to recognize "any line that LOOKS like it carries a
@@ -520,7 +547,12 @@ function truncateTierToBudget(blockText, budget, valueById = new Map()) {
520
547
  // lowest-priority tier wholesale, logged as a dropped_tiers event.
521
548
  // This shouldn't fire under the documented budget table (1500+4500+
522
549
  // 4000 = 10000 ≤ 10240 default cap), but the safety net is cheap.
523
- function enforceCap(orderedBlocks, capBytes, ts) {
550
+ // `reportCapBytes` (Task 75.0): the CALLER-facing cap for Door-4 events.
551
+ // injectContext hands enforceCap a cap reduced by the preamble reserve;
552
+ // truncation.log must still report the capBytes the user configured, not
553
+ // the internal effective value, or the log reads as nonsense (411 when
554
+ // the user set 1024).
555
+ function enforceCap(orderedBlocks, capBytes, ts, reportCapBytes = capBytes) {
524
556
  const tierEvents = [];
525
557
  // Step 1: per-tier budget enforcement (section-granular).
526
558
  for (const block of orderedBlocks) {
@@ -559,7 +591,7 @@ function enforceCap(orderedBlocks, capBytes, ts) {
559
591
  bytes -= Buffer.byteLength(dropped.text, 'utf8');
560
592
  let event = dropEvents[dropEvents.length - 1];
561
593
  if (!event) {
562
- event = { ts, capBytes, dropped_tiers: [] };
594
+ event = { ts, capBytes: reportCapBytes, dropped_tiers: [] };
563
595
  dropEvents.push(event);
564
596
  }
565
597
  event.dropped_tiers.push(dropped.tier);
@@ -707,15 +739,26 @@ export function injectContext({
707
739
  }
708
740
 
709
741
  // 3. Cap enforcement: drop whole tier blocks from the tail until within
710
- // capBytes. Each drop emits one truncation event.
742
+ // capBytes. Each drop emits one truncation event. The authoritative-memory
743
+ // preamble (Task 75.0) is reserved out of the cap up front so the final
744
+ // snapshot (preamble + blocks) still honors capBytes exactly.
745
+ const preambleReserve =
746
+ rawBlocks.length > 0
747
+ ? Buffer.byteLength(AUTHORITATIVE_MEMORY_PREAMBLE, 'utf8') + 2
748
+ : 0;
711
749
  const { blocks: keptBlocks, truncationEvents } = enforceCap(
712
750
  rawBlocks,
713
- cap,
751
+ Math.max(0, cap - preambleReserve),
714
752
  ts,
753
+ cap,
715
754
  );
716
755
 
717
- // 4. Concatenate.
718
- const snapshot = keptBlocks.map((b) => b.text).join('\n');
756
+ // 4. Concatenate. The preamble leads every non-empty snapshot; an empty
757
+ // snapshot stays empty (don't claim authoritative memory with nothing
758
+ // behind it).
759
+ const body = keptBlocks.map((b) => b.text).join('\n');
760
+ const snapshot =
761
+ body === '' ? '' : `${AUTHORITATIVE_MEMORY_PREAMBLE}\n\n${body}`;
719
762
 
720
763
  // 5. Persist side-effect logs under <projectRoot>/context/.locks/. We
721
764
  // only write the project-tier .locks file (which is the well-known
package/src/install.mjs CHANGED
@@ -39,6 +39,7 @@ import {
39
39
  writeFileSync,
40
40
  } from 'node:fs';
41
41
  import { homedir } from 'node:os';
42
+ import { spawnSync } from 'node:child_process';
42
43
  import { basename, dirname, join, relative, resolve } from 'node:path';
43
44
  import { fileURLToPath } from 'node:url';
44
45
  import { injectClaudeMdBlock } from './claude-md.mjs';
@@ -411,7 +412,112 @@ export async function install(options = {}) {
411
412
  }
412
413
  }
413
414
 
414
- return { projectRoot, userTier, created, skipped, gitignore, claudeMd, hooks, mcpServer, errors };
415
+ // Task 46 semantic-recall opt-in/out. `--with-semantic`: install the
416
+ // optional embedder (~260 MB once, fully local), flip the project's
417
+ // default search mode to hybrid, and pre-warm the model so the one-time
418
+ // download happens NOW, not as a surprise on the first search.
419
+ // `--no-semantic`: pin keyword explicitly. Neither flag → settings
420
+ // untouched (keyword by absence). The npm spawn is injectable
421
+ // (options.spawnNpm) so tests assert the argv without touching the host.
422
+ // Both flags together → withSemantic wins (the affirmative opt-in beats
423
+ // the pin-off; checked first below).
424
+ let semantic = { action: 'skipped' };
425
+ if (options.withSemantic) {
426
+ semantic = await enableSemantic({ projectRoot, spawnNpm: options.spawnNpm, warm: options.warmEmbedder });
427
+ if (semantic.action === 'error') errors.push({ path: 'semantic', error: semantic.error });
428
+ } else if (options.noSemantic) {
429
+ const r = mergeProjectSettings(projectRoot, { search: { default_mode: 'keyword' } });
430
+ semantic = r.ok
431
+ ? { action: 'disabled', path: r.path }
432
+ : { action: 'error', error: r.error };
433
+ if (!r.ok) errors.push({ path: r.path, error: r.error });
434
+ }
435
+
436
+ return { projectRoot, userTier, created, skipped, gitignore, claudeMd, hooks, mcpServer, semantic, errors };
437
+ }
438
+
439
+ /**
440
+ * Read-merge-write <projectRoot>/context/settings.json, preserving every
441
+ * key the user already has (over-mutation-safe; deep-merges one level).
442
+ */
443
+ export function mergeProjectSettings(projectRoot, patch) {
444
+ const path = join(projectRoot, 'context', 'settings.json');
445
+ try {
446
+ let current = {};
447
+ if (existsSync(path)) {
448
+ current = JSON.parse(readFileSync(path, 'utf8'));
449
+ }
450
+ const next = { ...current };
451
+ for (const [key, value] of Object.entries(patch)) {
452
+ next[key] =
453
+ value && typeof value === 'object' && !Array.isArray(value)
454
+ ? { ...(current[key] ?? {}), ...value }
455
+ : value;
456
+ }
457
+ mkdirSync(dirname(path), { recursive: true });
458
+ writeFileSync(path, JSON.stringify(next, null, 2) + '\n', 'utf8');
459
+ return { ok: true, path };
460
+ } catch (err) {
461
+ return { ok: false, path, error: err?.message ?? String(err) };
462
+ }
463
+ }
464
+
465
+ /**
466
+ * The production npm-spawn closure, as an injectable-seam factory
467
+ * (Task 125.4) so its argv/shell/timeout contract is testable without
468
+ * running a real `npm install -g` (which stays a machine-level step
469
+ * tests must never take).
470
+ */
471
+ export function buildDefaultNpmRunner({ spawnSyncImpl = spawnSync } = {}) {
472
+ return () => {
473
+ // One constant command string under shell:true (no user input — and
474
+ // an args array + shell:true trips Node's DEP0190). npm is npm.cmd
475
+ // on Windows; the shell resolves it cross-platform.
476
+ const r = spawnSyncImpl('npm install -g @huggingface/transformers', {
477
+ encoding: 'utf8',
478
+ stdio: 'inherit',
479
+ shell: true,
480
+ // spawn-discipline (design §8.5): a hung registry shouldn't hang
481
+ // install forever; 10 min covers the ~46 MB package on slow links.
482
+ timeout: 600_000,
483
+ });
484
+ return { status: r.status, error: r.error?.message };
485
+ };
486
+ }
487
+
488
+ async function enableSemantic({ projectRoot, spawnNpm, warm }) {
489
+ // 1. Install the optional embedder globally (it resolves as a sibling of
490
+ // the globally-installed kit). Injectable for tests.
491
+ const runNpm = spawnNpm ?? buildDefaultNpmRunner();
492
+ const npm = runNpm();
493
+ if (npm.status !== 0) {
494
+ return {
495
+ action: 'error',
496
+ error: `npm install -g @huggingface/transformers failed (${npm.error ?? `exit ${npm.status}`}) — semantic recall NOT enabled; keyword search is unaffected`,
497
+ };
498
+ }
499
+ // 2. Flip the project default to hybrid ONLY after the dependency landed
500
+ // (no half-state: a hybrid default without an embedder would degrade
501
+ // every search to a fallback warning).
502
+ const settings = mergeProjectSettings(projectRoot, { search: { default_mode: 'hybrid' } });
503
+ if (!settings.ok) {
504
+ return { action: 'error', error: settings.error };
505
+ }
506
+ // 3. Pre-warm (best-effort): the one-time model download happens during
507
+ // install, not on the first search. Injectable for tests.
508
+ let warmed = { ok: false, reason: 'skipped' };
509
+ try {
510
+ const warmFn =
511
+ warm ??
512
+ (async () => {
513
+ const { warmEmbedder } = await import('./semantic-backend.mjs');
514
+ return warmEmbedder();
515
+ });
516
+ warmed = await warmFn();
517
+ } catch (err) {
518
+ warmed = { ok: false, reason: err?.message ?? String(err) };
519
+ }
520
+ return { action: 'enabled', path: settings.path, defaultMode: 'hybrid', warmed };
415
521
  }
416
522
 
417
523
  /**
@@ -106,16 +106,60 @@ export function validatePath(p, { projectRoot, userDir }) {
106
106
 
107
107
  // --- Tool handlers ----------------------------------------------------
108
108
 
109
- function makeMkSearch({ db, semanticBackend }) {
110
- return async ({ query, mode, tier, since, limit, min_trust }) => {
109
+ function makeMkSearch({ db, semanticBackend, projectRoot }) {
110
+ return async ({ query, mode, scope, tier, since, limit, min_trust }) => {
111
+ // Task 46: explicit mode wins; otherwise the project's configured
112
+ // default (search.default_mode — set by `cmk install --with-semantic`).
113
+ const { prepareSemanticBackend, resolveDefaultSearchMode } = await import(
114
+ './semantic-backend.mjs'
115
+ );
116
+ let wantMode =
117
+ mode ??
118
+ (projectRoot ? resolveDefaultSearchMode({ projectRoot }) : SEARCH_MODES.KEYWORD);
119
+ // Task 65: when the caller asks for semantic/hybrid and no test seam is
120
+ // injected, prepare the REAL embedded backend (lazy-optional — an absent
121
+ // embedder degrades to the actionable error below; keyword unaffected).
122
+ let backend = semanticBackend;
123
+ let degradedNote = null;
124
+ if (
125
+ backend === undefined &&
126
+ (wantMode === SEARCH_MODES.SEMANTIC || wantMode === SEARCH_MODES.HYBRID)
127
+ ) {
128
+ const prep = await prepareSemanticBackend({ db, query, scope: scope ?? 'facts' });
129
+ if (!prep.ok && mode) {
130
+ // Explicitly requested — surface the actionable error.
131
+ return {
132
+ content: [
133
+ {
134
+ type: 'text',
135
+ text: `error: semantic backend unavailable (${prep.reason}). ${prep.hint ?? 'Use mode "keyword".'}`,
136
+ },
137
+ ],
138
+ isError: true,
139
+ };
140
+ }
141
+ if (!prep.ok) {
142
+ // Configured default can't run — degrade gracefully to keyword,
143
+ // but NOT silently (Task 125.1, the user's call reversing the
144
+ // Task-46 review skip): the note below tells the model what it
145
+ // got, so it can relay the fix to the user.
146
+ wantMode = SEARCH_MODES.KEYWORD;
147
+ degradedNote =
148
+ `note: this project's configured default search is semantic (hybrid), but the embedder is unavailable (${prep.reason}) — these are keyword-only results. ` +
149
+ 'Suggest the user run `cmk install --with-semantic` to restore semantic recall.';
150
+ } else {
151
+ backend = prep.backend;
152
+ }
153
+ }
111
154
  const r = search({
112
155
  db, query,
113
- mode: mode ?? SEARCH_MODES.KEYWORD,
156
+ mode: wantMode,
157
+ scope,
114
158
  tier,
115
159
  since,
116
160
  limit,
117
161
  minTrust: min_trust,
118
- semanticBackend,
162
+ semanticBackend: backend,
119
163
  });
120
164
  if (r.action === 'error') {
121
165
  return {
@@ -124,7 +168,12 @@ function makeMkSearch({ db, semanticBackend }) {
124
168
  };
125
169
  }
126
170
  return {
127
- content: [{ type: 'text', text: JSON.stringify(r.results, null, 2) }],
171
+ content: [
172
+ { type: 'text', text: JSON.stringify(r.results, null, 2) },
173
+ // Results stay content[0] (shape-compatible); the degradation note,
174
+ // when present, rides as a second block.
175
+ ...(degradedNote ? [{ type: 'text', text: degradedNote }] : []),
176
+ ],
128
177
  };
129
178
  };
130
179
  }
@@ -505,17 +554,18 @@ export function buildMcpServer({ projectRoot, userDir, db, semanticBackend }) {
505
554
  server.registerTool(
506
555
  'mk_search',
507
556
  {
508
- description: 'Search kit memory (FTS5 keyword by default; semantic + hybrid require the Layer-5b semantic backend, not yet shipped).',
557
+ description: 'Search kit memory. FTS5 keyword by default; semantic + hybrid use the embedded Layer-5b backend (sqlite-vec + a local ONNX embedder — needs the optional @huggingface/transformers install).',
509
558
  inputSchema: {
510
559
  query: z.string().min(1).describe('search query'),
511
560
  mode: z.enum(['keyword', 'semantic', 'hybrid']).optional(),
561
+ scope: z.enum(['facts', 'transcripts']).optional().describe("'facts' (default) = curated memory; 'transcripts' = the raw session record — the LAST-RESORT recall tier, search it only when curated memory has no answer"),
512
562
  tier: z.enum(['U', 'P', 'L']).optional(),
513
563
  since: z.string().optional().describe('ISO 8601 timestamp'),
514
564
  limit: z.number().int().positive().max(1000).optional(),
515
565
  min_trust: z.enum(['low', 'medium', 'high']).optional(),
516
566
  },
517
567
  },
518
- makeMkSearch({ db, semanticBackend }),
568
+ makeMkSearch({ db, semanticBackend, projectRoot }),
519
569
  );
520
570
 
521
571
  // mk_get
@@ -26,6 +26,7 @@ import { parse, format } from './frontmatter.mjs';
26
26
  import { appendAuditEntry, nowIso, REASON_CODES } from './audit-log.mjs';
27
27
  import { ERROR_CATEGORIES, errorResult, notFoundResult } from './result-shapes.mjs';
28
28
  import { writeFact } from './write-fact.mjs';
29
+ import { reindex } from './reindex.mjs';
29
30
 
30
31
  function listLiveFactFiles(factDir) {
31
32
  if (!existsSync(factDir)) return [];
@@ -193,6 +194,17 @@ export function mergeFacts(opts = {}) {
193
194
  const supersededA = moveToSuperseded(matchA, writeResult.id);
194
195
  const supersededB = moveToSuperseded(matchB, writeResult.id);
195
196
 
197
+ // Task 124 (the D-112 class): writeFact refreshed INDEX.md when C was
198
+ // created — but A and B left the fact dir AFTER that, so the index kept
199
+ // both as dangling lines until a manual `cmk reindex`. The writer owns
200
+ // the derived view on the removal side too. Best-effort, same contract
201
+ // as writeFact's: the merge is already durable on disk.
202
+ try {
203
+ reindex({ tier, projectRoot, userDir, warn: () => {} });
204
+ } catch {
205
+ // index rebuild is best-effort; the merge already succeeded
206
+ }
207
+
196
208
  const ts = now ?? nowIso();
197
209
  appendAuditEntry(tierRoot, {
198
210
  ts,
@@ -44,6 +44,10 @@ const VALID_WRITE_SOURCES = new Set([
44
44
  'compressor',
45
45
  'manual-edit',
46
46
  'imported',
47
+ // Task 138 review finding: the conflict-queue merge-both action writes a
48
+ // merged bullet to the scratchpad; its provenance needs a valid write key
49
+ // (the old hand-rolled comment had none and broke reindex - D-125 class).
50
+ 'merged',
47
51
  ]);
48
52
  const REQUIRED_PROVENANCE_FIELDS = [
49
53
  'source',
@@ -104,7 +104,7 @@ export const ERROR_CATEGORIES = Object.freeze({
104
104
  POISON_GUARD: 'poison_guard',
105
105
 
106
106
  // `cmk search` requested --mode=semantic or --mode=hybrid but the
107
- // Layer-5b semantic backend is not yet shipped (Task 30, design
107
+ // Layer-5b semantic backend's optional embedder is not installed (Task 30/65, design
108
108
  // §9.3). Pairs with `process.exitCode = 2` in subcommands.mjs per
109
109
  // tasks.md 30.2's explicit "exit 2 when unavailable" contract.
110
110
  // NO silent fallback to keyword — the user asked for semantic,
@@ -180,7 +180,9 @@ function findSectionRange(lines, sectionTitle) {
180
180
  }
181
181
 
182
182
  function insertIntoSection(text, sectionTitle, bullet) {
183
- const lines = text.split('\n');
183
+ // Task 139 (D-126): CRLF-tolerant read; the join below re-emits \n,
184
+ // so a CRLF-converted scratchpad self-heals on the next write.
185
+ const lines = text.split(/\r?\n/);
184
186
  const range = findSectionRange(lines, sectionTitle);
185
187
  if (!range) return null;
186
188
  // Insert before the next `## ` heading; skip trailing blank lines so the
@@ -208,7 +210,7 @@ function insertIntoSection(text, sectionTitle, bullet) {
208
210
  export function ensureSectionExists(scratchpadPath, sectionTitle) {
209
211
  if (!existsSync(scratchpadPath)) return { created: false, error: 'no-file' };
210
212
  const text = readFileSync(scratchpadPath, 'utf8');
211
- if (findSectionRange(text.split('\n'), sectionTitle)) return { created: false };
213
+ if (findSectionRange(text.split(/\r?\n/), sectionTitle)) return { created: false }; // Task 139: CRLF-tolerant
212
214
  const body = text.trimEnd(); // drop trailing whitespace/blank lines (no `\s+$` regex — trips ReDoS heuristics)
213
215
  // No leading blank lines for an empty/whitespace-only file (the scaffolded
214
216
  // scratchpads are never empty, but keep the output clean if one ever is).
@@ -220,7 +222,7 @@ export function ensureSectionExists(scratchpadPath, sectionTitle) {
220
222
  const EVICTED_ID_RE = /^- \(([PUL]-[A-Za-z0-9]+)\)/;
221
223
 
222
224
  function consolidate(text, { nowDate }) {
223
- const lines = text.split('\n');
225
+ const lines = text.split(/\r?\n/); // Task 139: CRLF-tolerant
224
226
  const removeIdx = new Set();
225
227
  const evicted = [];
226
228
  const staleCutoff = new Date(nowDate.getTime() - STALE_AFTER_DAYS * 24 * 60 * 60 * 1000);
package/src/search.mjs CHANGED
@@ -11,7 +11,7 @@
11
11
  // ~100ms for 10k bullets. Always available — the keyword
12
12
  // backend ships in v0.1.0 with no extra install.
13
13
  //
14
- // semantic the Layer-5b semantic backend (not yet shipped the embedded
14
+ // semantic the Layer-5b semantic backend (Task 65: sqlite-vec + local ONNX embedder; the embedded
15
15
  // vector backend is a future release; the DI seam below is the
16
16
  // drop-in point). Until then this mode errors with
17
17
  // ERROR_CATEGORIES.SEMANTIC_UNAVAILABLE when the caller
@@ -54,6 +54,15 @@ export const SEARCH_MODES = Object.freeze({
54
54
  export const DEFAULT_LIMIT = 20;
55
55
  const MAX_LIMIT = 1000;
56
56
 
57
+ // Task 104.2 (D-117) — search scopes. 'facts' = the curated observation
58
+ // index (L1, the default). 'transcripts' = the SEPARATE raw-transcript
59
+ // chunk index (the L3 last-resort tier) — reached ONLY when explicitly
60
+ // asked, so raw history never pollutes curated results.
61
+ export const SEARCH_SCOPES = Object.freeze({
62
+ FACTS: 'facts',
63
+ TRANSCRIPTS: 'transcripts',
64
+ });
65
+
57
66
  const TRUST_ORDINAL = Object.freeze({
58
67
  low: 1,
59
68
  medium: 2,
@@ -107,7 +116,24 @@ function validateInput(opts) {
107
116
  errors.push(`limit: must be a positive integer ≤ ${MAX_LIMIT}`);
108
117
  }
109
118
  }
110
- return { errors, mode };
119
+ const scope = opts.scope ?? SEARCH_SCOPES.FACTS;
120
+ if (scope !== SEARCH_SCOPES.FACTS && scope !== SEARCH_SCOPES.TRANSCRIPTS) {
121
+ errors.push(`scope: must be one of facts/transcripts (got ${JSON.stringify(scope)})`);
122
+ }
123
+ if (scope === SEARCH_SCOPES.TRANSCRIPTS) {
124
+ // Chunks carry no tier/trust/created_at — rejecting these is more honest
125
+ // than silently ignoring them (the explicit-vs-configured asymmetry rule).
126
+ for (const [key, label] of [
127
+ ['tier', 'tier'],
128
+ ['minTrust', 'minTrust'],
129
+ ['since', 'since'],
130
+ ]) {
131
+ if (opts[key] !== undefined) {
132
+ errors.push(`${label}: not supported under the transcripts scope (raw chunks carry no ${label})`);
133
+ }
134
+ }
135
+ }
136
+ return { errors, mode, scope };
111
137
  }
112
138
 
113
139
  // --- Keyword (FTS5 BM25) backend --------------------------------------
@@ -211,6 +237,60 @@ function runKeywordSearch(db, opts) {
211
237
  }));
212
238
  }
213
239
 
240
+ // --- Transcript-scope keyword backend (Task 104.2, the L3 raw tier) ----
241
+
242
+ const TRANSCRIPT_KEYWORD_SQL = `
243
+ SELECT
244
+ t.source_file AS source_file,
245
+ t.source_line AS source_line,
246
+ t.heading AS heading,
247
+ transcript_chunks_fts.rank AS score,
248
+ snippet(transcript_chunks_fts, 0, '<b>', '</b>', '...', 16) AS snippet
249
+ FROM transcript_chunks_fts
250
+ JOIN transcript_chunks t ON t.rowid = transcript_chunks_fts.rowid
251
+ WHERE transcript_chunks_fts MATCH @query
252
+ ORDER BY transcript_chunks_fts.rank
253
+ LIMIT @limit
254
+ `;
255
+
256
+ // Synthetic, readable id for a raw chunk (chunks are locations, not curated
257
+ // facts — no [PUL]-XXXXXXXX identity). Also the RRF fusion key in hybrid
258
+ // mode and the drill-back handle the memory-search skill surfaces.
259
+ function transcriptHitId(row) {
260
+ return `T:${row.source_file}:${row.source_line}`;
261
+ }
262
+
263
+ function runTranscriptKeywordSearch(db, opts) {
264
+ let rows;
265
+ try {
266
+ rows = db
267
+ .prepare(TRANSCRIPT_KEYWORD_SQL)
268
+ .all({ query: opts.query, limit: opts.limit ?? DEFAULT_LIMIT });
269
+ } catch (err) {
270
+ if (err?.code === 'SQLITE_ERROR' || /fts5:|no such column:/i.test(err?.message ?? '')) {
271
+ throw new FTS5ParseError(err, opts.query);
272
+ }
273
+ throw err;
274
+ }
275
+ return rows.map((r) => ({
276
+ id: transcriptHitId(r),
277
+ // Raw turns contain newlines (dialogue + Tools blocks) — flatten so the
278
+ // one-line-per-hit output contract holds across scopes.
279
+ snippet: flattenSnippet(r.snippet),
280
+ source_file: r.source_file,
281
+ source_line: r.source_line,
282
+ heading: r.heading,
283
+ score: r.score,
284
+ }));
285
+ }
286
+
287
+ const TRANSCRIPT_SNIPPET_MAX = 240;
288
+
289
+ function flattenSnippet(s) {
290
+ const flat = String(s ?? '').replace(/\s+/g, ' ').trim();
291
+ return flat.length > TRANSCRIPT_SNIPPET_MAX ? flat.slice(0, TRANSCRIPT_SNIPPET_MAX) + '…' : flat;
292
+ }
293
+
214
294
  // --- Reciprocal-rank fusion (hybrid mode) -----------------------------
215
295
 
216
296
  /**
@@ -255,10 +335,15 @@ export function reciprocalRankFusion({
255
335
  // --- Public boundary --------------------------------------------------
256
336
 
257
337
  export function search(opts = {}) {
258
- const { errors, mode } = validateInput(opts);
338
+ const { errors, mode, scope } = validateInput(opts);
259
339
  if (errors.length > 0) {
260
340
  return errorResult({ category: ERROR_CATEGORIES.SCHEMA, errors });
261
341
  }
342
+ // Scope dispatch (Task 104.2): the transcripts scope swaps the keyword
343
+ // backend; semantic/hybrid use the caller-prepared backend exactly like
344
+ // the facts scope (prepareSemanticBackend({scope}) embeds the right table).
345
+ const keywordBackend =
346
+ scope === SEARCH_SCOPES.TRANSCRIPTS ? runTranscriptKeywordSearch : runKeywordSearch;
262
347
 
263
348
  // Semantic + hybrid require an injected backend. Production v0.1.0
264
349
  // passes undefined → error with the not-yet-shipped hint. A future
@@ -268,7 +353,8 @@ export function search(opts = {}) {
268
353
  return errorResult({
269
354
  category: ERROR_CATEGORIES.SEMANTIC_UNAVAILABLE,
270
355
  errors: [
271
- 'the Layer-5b semantic backend is not yet shipped — semantic/hybrid search will land in a future release. ' +
356
+ 'no semantic backend provided — semantic/hybrid need the embedded Layer-5b backend prepared by the caller ' +
357
+ '(the CLI/MCP do this automatically when the optional @huggingface/transformers embedder is installed). ' +
272
358
  'Use --mode=keyword for the always-available FTS5 search.',
273
359
  ],
274
360
  });
@@ -278,15 +364,16 @@ export function search(opts = {}) {
278
364
  let results;
279
365
  try {
280
366
  if (mode === SEARCH_MODES.KEYWORD) {
281
- results = runKeywordSearch(opts.db, opts);
367
+ results = keywordBackend(opts.db, opts);
282
368
  } else if (mode === SEARCH_MODES.SEMANTIC) {
283
369
  // The semantic backend is an injected callable returning the same
284
- // shape as runKeywordSearch (array of {id, snippet, source_file,
285
- // source_line, tier, trust, score}).
370
+ // shape as the scope's keyword backend (facts: {id, snippet,
371
+ // source_file, source_line, tier, trust, score}; transcripts: the
372
+ // synthetic-T:-id shape without tier/trust).
286
373
  results = opts.semanticBackend(opts);
287
374
  } else {
288
375
  // hybrid: run both backends + fuse.
289
- const keywordResults = runKeywordSearch(opts.db, opts);
376
+ const keywordResults = keywordBackend(opts.db, opts);
290
377
  const semanticResults = opts.semanticBackend(opts);
291
378
  const fused = reciprocalRankFusion({
292
379
  keywordResults,
@@ -308,5 +395,5 @@ export function search(opts = {}) {
308
395
  throw err;
309
396
  }
310
397
 
311
- return { action: 'found', mode, results };
398
+ return { action: 'found', mode, scope, results };
312
399
  }