@isaacriehm/cairn-core 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/dist/.tsbuildinfo +1 -1
  2. package/dist/attention/bulk-accept.js +56 -15
  3. package/dist/attention/bulk-accept.js.map +1 -1
  4. package/dist/attention/serve/api.js +7 -1
  5. package/dist/attention/serve/api.js.map +1 -1
  6. package/dist/ground/file-candidates-map.d.ts +23 -0
  7. package/dist/ground/file-candidates-map.js +76 -0
  8. package/dist/ground/file-candidates-map.js.map +1 -0
  9. package/dist/ground/index.d.ts +4 -2
  10. package/dist/ground/index.js +4 -2
  11. package/dist/ground/index.js.map +1 -1
  12. package/dist/ground/paths.d.ts +2 -0
  13. package/dist/ground/paths.js +6 -0
  14. package/dist/ground/paths.js.map +1 -1
  15. package/dist/ground/rejected.d.ts +42 -0
  16. package/dist/ground/rejected.js +100 -0
  17. package/dist/ground/rejected.js.map +1 -0
  18. package/dist/ground/schemas.d.ts +80 -0
  19. package/dist/ground/schemas.js +54 -0
  20. package/dist/ground/schemas.js.map +1 -1
  21. package/dist/hooks/post-tool-use/index.d.ts +1 -1
  22. package/dist/hooks/post-tool-use/index.js +1 -1
  23. package/dist/hooks/post-tool-use/index.js.map +1 -1
  24. package/dist/hooks/post-tool-use/ledger-cache.d.ts +13 -0
  25. package/dist/hooks/post-tool-use/ledger-cache.js +48 -0
  26. package/dist/hooks/post-tool-use/ledger-cache.js.map +1 -1
  27. package/dist/hooks/post-tool-use/legend-builder.d.ts +10 -1
  28. package/dist/hooks/post-tool-use/legend-builder.js +27 -2
  29. package/dist/hooks/post-tool-use/legend-builder.js.map +1 -1
  30. package/dist/hooks/post-tool-use/read-enricher.js +8 -2
  31. package/dist/hooks/post-tool-use/read-enricher.js.map +1 -1
  32. package/dist/init/index.d.ts +2 -2
  33. package/dist/init/index.js +1 -1
  34. package/dist/init/index.js.map +1 -1
  35. package/dist/init/ingest-docs.d.ts +82 -22
  36. package/dist/init/ingest-docs.js +632 -108
  37. package/dist/init/ingest-docs.js.map +1 -1
  38. package/dist/init/init.d.ts +10 -1
  39. package/dist/init/init.js +113 -251
  40. package/dist/init/init.js.map +1 -1
  41. package/dist/init/mapper-parallel.js +8 -0
  42. package/dist/init/mapper-parallel.js.map +1 -1
  43. package/dist/init/phases/6-docs-ingest.d.ts +9 -4
  44. package/dist/init/phases/6-docs-ingest.js +13 -10
  45. package/dist/init/phases/6-docs-ingest.js.map +1 -1
  46. package/dist/init/phases/parallel-678.js +10 -4
  47. package/dist/init/phases/parallel-678.js.map +1 -1
  48. package/dist/init/sot-emit.d.ts +22 -0
  49. package/dist/init/sot-emit.js +50 -4
  50. package/dist/init/sot-emit.js.map +1 -1
  51. package/dist/init/source-comments/ingest.js +107 -7
  52. package/dist/init/source-comments/ingest.js.map +1 -1
  53. package/dist/init/topic-index/index.d.ts +14 -0
  54. package/dist/init/topic-index/index.js +83 -4
  55. package/dist/init/topic-index/index.js.map +1 -1
  56. package/dist/init/topic-index/judge.js +14 -1
  57. package/dist/init/topic-index/judge.js.map +1 -1
  58. package/dist/init/topic-index/resolve.d.ts +19 -0
  59. package/dist/init/topic-index/resolve.js +100 -14
  60. package/dist/init/topic-index/resolve.js.map +1 -1
  61. package/dist/init/topic-index/walk.d.ts +32 -0
  62. package/dist/init/topic-index/walk.js +70 -4
  63. package/dist/init/topic-index/walk.js.map +1 -1
  64. package/dist/mcp/history/summarizer.js +5 -0
  65. package/dist/mcp/history/summarizer.js.map +1 -1
  66. package/dist/mcp/schemas.d.ts +48 -0
  67. package/dist/mcp/schemas.js +43 -0
  68. package/dist/mcp/schemas.js.map +1 -1
  69. package/dist/mcp/tools/index.js +8 -0
  70. package/dist/mcp/tools/index.js.map +1 -1
  71. package/dist/mcp/tools/propose-decision.d.ts +34 -0
  72. package/dist/mcp/tools/propose-decision.js +200 -0
  73. package/dist/mcp/tools/propose-decision.js.map +1 -0
  74. package/dist/mcp/tools/reject-candidate.d.ts +24 -0
  75. package/dist/mcp/tools/reject-candidate.js +71 -0
  76. package/dist/mcp/tools/reject-candidate.js.map +1 -0
  77. package/dist/mcp/tools/search-candidates.d.ts +20 -0
  78. package/dist/mcp/tools/search-candidates.js +93 -0
  79. package/dist/mcp/tools/search-candidates.js.map +1 -0
  80. package/package.json +1 -1
  81. package/templates/attention-ui/app.js +40 -3
@@ -1,28 +1,78 @@
1
1
  /**
2
- * Phase 6 — docs ingestion (v0.5.0 SoT model).
2
+ * Phase 6 — staged docs ingestion (PHASE_6_REDESIGN §4.1).
3
3
  *
4
- * Reads the topic-index built by phase 5b, filters to entries whose SoT
5
- * source lives under `docs/*`, and emits verbatim DEC files under
6
- * `.cairn/ground/decisions/`. Auto-promoted to `status: accepted`. No
7
- * draft inbox, no LLM paraphrase — the doc paragraph itself IS the
8
- * canonical body, recorded with `sot_kind: path` so the lens renders
9
- * the live source on every read.
4
+ * Replaces the v0.6 bulk-classifier path. Cuts wall from ~15 min to
5
+ * ~75 s on gcb-platform-scale repos AND collapses the noisy ledger
6
+ * (7000 DECs) to a curated draft inbox (30-80 drafts).
10
7
  *
11
- * Per-entry Haiku call decides `kind` only (decision / domain-rule /
12
- * voice-guidelines / api-docs / other). The first two emit a DEC; the
13
- * rest are skipped at this layer (voice + canonical-topic flows are
14
- * handled by other tooling now — they were file-level concerns under
15
- * the v0.4.x model and have no clean paragraph-level analogue).
8
+ * Pipeline:
9
+ *
10
+ * Stage 3 (deterministic, 0 Haiku) — marker scan
11
+ * Topic-index entries with `marker_kind` in {"decision","rule"} go
12
+ * straight to emit. The walker stamped them at parse time when it
13
+ * saw frontmatter `cairn.kind` or `<!-- cairn:decision -->` /
14
+ * `<!-- cairn:rule -->` within 3 lines of the heading.
15
+ *
16
+ * Stage 1 — file-purpose binary filter (batch=30, concurrency=5)
17
+ * Per file: filepath + frontmatter + first 800 chars + every
18
+ * H1/H2/H3 line (capped at 100). Locked rigid prompt: a file is
19
+ * authoritative ONLY if it's a canonical rulebook, formal ADR,
20
+ * or list of binding domain invariants. Plans / scratchpads /
21
+ * UAT logs / API docs are NOT authoritative even if they
22
+ * contain proposed or historical decisions.
23
+ *
24
+ * Stage 2 — section-level batch classifier (batch=30, concurrency=5)
25
+ * Same shape as the v0.6 classifier, but scoped to sections
26
+ * belonging to Stage-1-authoritative files AND not already
27
+ * handled by a marker. This is where Haiku still adds signal —
28
+ * the file passed the rigid filter; now decide WHICH sections
29
+ * of it are decisions vs context.
30
+ *
31
+ * Stage 4 — emit
32
+ * Stage 2 + Stage 3 outputs → `.cairn/ground/decisions/_inbox/<id>.draft.md`.
33
+ * `status: draft`, `capture_source: init-docs-ingest`,
34
+ * `decided_by: cairn-init`. Body is verbatim via
35
+ * `readSotBody` — no Haiku paraphrasing. Operator triages via
36
+ * the existing `cairn-attention` skill.
37
+ *
38
+ * Skipped entries (everything else) stay in the topic-index as
39
+ * unpromoted candidates. The PR 2 `cairn_search_candidates` /
40
+ * `cairn_propose_decision` MCP tools surface them to AI agents as
41
+ * the project lives.
16
42
  */
17
- import { existsSync, readdirSync, statSync, } from "node:fs";
43
+ import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync, } from "node:fs";
18
44
  import { join, relative } from "node:path";
45
+ import { stringify as stringifyYaml } from "yaml";
19
46
  import { runClaude } from "../claude/index.js";
20
- import { readAnchorMap, readTopicIndex, writeSotBindings, writeSotCache, writeTopicIndex, } from "../ground/index.js";
47
+ import { bodyContentHash, decisionsDir, deriveDecId, readAnchorMap, readRejectedYaml, readTopicIndex, setTopic, writeFileCandidatesMap, writeTopicIndex, } from "../ground/index.js";
21
48
  import { logger } from "../logger.js";
22
- import { emitFromTopicIndex } from "./sot-emit.js";
49
+ import { firstLineFallback, readSotBody } from "./sot-emit.js";
23
50
  const log = logger("init.ingest-docs");
24
- const PER_DOC_TIMEOUT_MS = 60_000;
25
- const DOC_BODY_CAP = 8_000;
51
+ /* -------------------------------------------------------------------------- */
52
+ /* Tunables locked in §3 of PHASE_6_REDESIGN */
53
+ /* -------------------------------------------------------------------------- */
54
+ /** N files per Stage-1 Haiku call. */
55
+ const FILE_FILTER_BATCH_SIZE = 30;
56
+ /** Concurrent Stage-1 batches. */
57
+ const FILE_FILTER_CONCURRENCY = 5;
58
+ /** Stage 1 per-file context — first chars of body, frontmatter stripped. */
59
+ const FILE_FILTER_INTRO_CHARS = 800;
60
+ /** Stage 1 max ToC lines (H1/H2/H3 only). */
61
+ const FILE_FILTER_TOC_MAX_LINES = 100;
62
+ /** Stage 1 wall budget per Haiku call. */
63
+ const FILE_FILTER_TIMEOUT_MS = 60_000;
64
+ /** N sections per Stage-2 Haiku call. */
65
+ const SECTION_BATCH_SIZE = 30;
66
+ /** Concurrent Stage-2 batches. */
67
+ const SECTION_CONCURRENCY = 5;
68
+ /** Stage 2 per-section body cap (chars) before truncation marker. */
69
+ const SECTION_BODY_CAP = 2_000;
70
+ /** Stage 2 wall budget per Haiku call. */
71
+ const SECTION_TIMEOUT_MS = 120_000;
72
+ /** Capture source stamped on every Stage 2/3 emit. */
73
+ const CAPTURE_SOURCE = "init-docs-ingest";
74
+ /** Decided-by stamp on every Stage 2/3 emit. */
75
+ const DECIDED_BY = "cairn-init";
26
76
  /** Subdirs we never descend into when discovering candidate doc files. */
27
77
  const SKIP_DIRS = new Set([
28
78
  ".cairn",
@@ -73,7 +123,11 @@ function walkDocsDir(dir, repoRoot, out) {
73
123
  catch {
74
124
  continue;
75
125
  }
76
- out.push({ path: relative(repoRoot, abs), size: st.size, group: dirGroup(relative(repoRoot, abs)) });
126
+ out.push({
127
+ path: relative(repoRoot, abs),
128
+ size: st.size,
129
+ group: dirGroup(relative(repoRoot, abs)),
130
+ });
77
131
  }
78
132
  }
79
133
  function dirGroup(rel) {
@@ -83,140 +137,610 @@ function dirGroup(rel) {
83
137
  return `${parts[0]}/`;
84
138
  }
85
139
  /* -------------------------------------------------------------------------- */
86
- /* Haiku classifier — kind only, no rewriting */
140
+ /* Stage 1 — file-purpose binary filter */
141
+ /* */
142
+ /* Locked rigid prompt — DO NOT paraphrase. A file is authoritative ONLY */
143
+ /* if it's a canonical rulebook, a formal ADR, or a list of active binding */
144
+ /* domain invariants. Plans / scratchpads / UAT logs / API docs are NOT */
145
+ /* authoritative even if they contain proposed or historical decisions. */
87
146
  /* -------------------------------------------------------------------------- */
88
- const CLASSIFY_SCHEMA = {
147
+ const FILE_FILTER_SCHEMA = {
89
148
  type: "object",
90
149
  additionalProperties: false,
150
+ required: ["files"],
91
151
  properties: {
92
- kind: {
93
- type: "string",
94
- enum: ["decision", "domain-rule", "voice-guidelines", "api-docs", "other"],
152
+ files: {
153
+ type: "array",
154
+ items: {
155
+ type: "object",
156
+ additionalProperties: false,
157
+ required: ["path", "is_authoritative", "reason"],
158
+ properties: {
159
+ path: { type: "string" },
160
+ is_authoritative: { type: "boolean" },
161
+ reason: { type: "string" },
162
+ },
163
+ },
95
164
  },
96
- proposedTitle: { type: "string" },
97
165
  },
98
- required: ["kind", "proposedTitle"],
99
166
  };
100
- const CLASSIFY_SYSTEM = `You classify project documentation paragraphs for Cairn's Single-Source-of-Truth ledger.
167
+ const FILE_FILTER_SYSTEM = `You are a rigid filter for an architecture ledger. A file is authoritative ONLY if it is a canonical rulebook, a formal Architecture Decision Record (ADR), or a list of active, binding domain invariants.
101
168
 
102
- Return JSON matching the supplied schema.
169
+ If a file is a project plan, research scratchpad, UAT log, status update, or API documentation, it is NOT authoritative, even if it contains proposed or historical decisions.
170
+
171
+ Evaluate the provided filepath, frontmatter, intro, and Table of Contents. Return JSON:
172
+ { "files": [ { "path": "<filepath>", "is_authoritative": <bool>, "reason": "10 words max" }, ... ] }
173
+
174
+ EXACTLY one entry per input filepath. Do NOT omit. Do NOT invent paths.`;
175
+ function buildFileFilterInputs(repoRoot, files) {
176
+ const out = [];
177
+ for (const rel of files) {
178
+ const abs = join(repoRoot, rel);
179
+ if (!existsSync(abs))
180
+ continue;
181
+ let raw;
182
+ try {
183
+ raw = readFileSync(abs, "utf8");
184
+ }
185
+ catch {
186
+ continue;
187
+ }
188
+ const { frontmatter, body } = splitFrontmatter(raw);
189
+ const introChars = body.slice(0, FILE_FILTER_INTRO_CHARS);
190
+ const toc = extractToc(body);
191
+ out.push({ path: rel, frontmatter, introChars, toc });
192
+ }
193
+ return out;
194
+ }
195
+ function splitFrontmatter(raw) {
196
+ const m = raw.match(/^---\n([\s\S]*?)\n---\n?/);
197
+ if (m === null)
198
+ return { frontmatter: null, body: raw };
199
+ const fm = m[1] ?? "";
200
+ return { frontmatter: fm, body: raw.slice(m[0].length) };
201
+ }
202
+ function extractToc(body) {
203
+ const lines = body.split("\n");
204
+ const toc = [];
205
+ for (const line of lines) {
206
+ if (/^#{1,3}\s+/.test(line)) {
207
+ toc.push(line.trim());
208
+ if (toc.length >= FILE_FILTER_TOC_MAX_LINES)
209
+ break;
210
+ }
211
+ }
212
+ return toc.join("\n");
213
+ }
214
+ async function classifyFileBatch(inputs) {
215
+ if (inputs.length === 0)
216
+ return new Map();
217
+ const blocks = inputs
218
+ .map((it) => {
219
+ const fmBlock = it.frontmatter !== null
220
+ ? `frontmatter:\n${it.frontmatter}\n`
221
+ : `frontmatter: (none)\n`;
222
+ const tocBlock = it.toc.length > 0 ? `toc:\n${it.toc}\n` : `toc: (none)\n`;
223
+ const intro = it.introChars.length > 0
224
+ ? `intro:\n${it.introChars}`
225
+ : `intro: (empty)`;
226
+ return `=== path: ${it.path}\n${fmBlock}${tocBlock}${intro}`;
227
+ })
228
+ .join("\n\n");
229
+ const prompt = `Classify each file. Return one entry per path.\n\n${blocks}`;
230
+ const result = await runClaude({
231
+ tier: "haiku",
232
+ system: FILE_FILTER_SYSTEM,
233
+ prompt,
234
+ jsonSchema: FILE_FILTER_SCHEMA,
235
+ timeoutMs: FILE_FILTER_TIMEOUT_MS,
236
+ isolateAmbientContext: true,
237
+ });
238
+ const parsed = result.parsed;
239
+ if (typeof parsed !== "object" || parsed === null) {
240
+ throw new Error("haiku file-filter returned non-object");
241
+ }
242
+ const arr = parsed["files"];
243
+ if (!Array.isArray(arr)) {
244
+ throw new Error("haiku file-filter missing `files` array");
245
+ }
246
+ const out = new Map();
247
+ for (const raw of arr) {
248
+ if (typeof raw !== "object" || raw === null)
249
+ continue;
250
+ const e = raw;
251
+ const path = e["path"];
252
+ const flag = e["is_authoritative"];
253
+ const reason = e["reason"];
254
+ if (typeof path !== "string")
255
+ continue;
256
+ if (typeof flag !== "boolean")
257
+ continue;
258
+ out.set(path, {
259
+ is_authoritative: flag,
260
+ reason: typeof reason === "string" ? reason : "",
261
+ });
262
+ }
263
+ return out;
264
+ }
265
+ /* -------------------------------------------------------------------------- */
266
+ /* Stage 2 — section batch classifier (kind + proposedTitle) */
267
+ /* -------------------------------------------------------------------------- */
268
+ const SECTION_SCHEMA = {
269
+ type: "object",
270
+ additionalProperties: false,
271
+ required: ["classifications"],
272
+ properties: {
273
+ classifications: {
274
+ type: "array",
275
+ items: {
276
+ type: "object",
277
+ additionalProperties: false,
278
+ required: ["slug", "kind", "proposedTitle"],
279
+ properties: {
280
+ slug: { type: "string" },
281
+ kind: {
282
+ type: "string",
283
+ enum: ["decision", "domain-rule", "voice-guidelines", "api-docs", "other"],
284
+ },
285
+ proposedTitle: { type: "string" },
286
+ },
287
+ },
288
+ },
289
+ },
290
+ };
291
+ const SECTION_SYSTEM = `You classify N sections from authoritative project documentation for Cairn's Single-Source-of-Truth ledger.
292
+
293
+ These sections come from files already filtered as canonical rulebooks, ADRs, or binding invariant lists. Decide which sections are themselves binding decisions / rules vs supporting context.
294
+
295
+ Return JSON: { "classifications": [ { "slug": "...", "kind": "...", "proposedTitle": "..." }, ... ] }
296
+
297
+ EXACTLY one classification per input section, keyed by its slug. Do NOT omit. Do NOT invent slugs. If unsure, kind="other".
103
298
 
104
299
  \`kind\` choices:
105
- - "decision" paragraph describes a binding decision or architectural choice
106
- - "domain-rule" paragraph describes a domain rule or constraint developers must obey
107
- - "voice-guidelines" paragraph is brand voice / tone guidance
108
- - "api-docs" paragraph documents an API surface or schema (descriptive, not binding)
300
+ - "decision" binding decision or architectural choice
301
+ - "domain-rule" domain rule or constraint developers must obey
302
+ - "voice-guidelines" brand voice / tone guidance
303
+ - "api-docs" API surface / schema documentation (descriptive)
109
304
  - "other" nothing actionable for the cairn state layer
110
305
 
111
- \`proposedTitle\` 5-10 words, imperative voice, empty for "other".
306
+ \`proposedTitle\` 5-10 words, imperative voice. Empty string for "other".
112
307
 
113
- Be conservative — false-positive decisions pollute the ground state worse
114
- than missed capture. Default to "other" when uncertain.`;
115
- async function classifyEntry(entry, body) {
116
- const capped = body.length > DOC_BODY_CAP ? `${body.slice(0, DOC_BODY_CAP)}\n…[truncated]` : body;
117
- const prompt = `Source: ${entry.sot_source}\nSlug: ${entry.slug}\n\n---\n${capped}`;
308
+ Be conservative — false-positive decisions pollute the ground state worse than missed capture. Default to "other" when uncertain.`;
309
+ async function classifySectionBatch(items) {
310
+ if (items.length === 0)
311
+ return new Map();
312
+ const sections = items
313
+ .map((it, i) => {
314
+ const capped = it.body.length > SECTION_BODY_CAP
315
+ ? `${it.body.slice(0, SECTION_BODY_CAP)}\n…[truncated]`
316
+ : it.body;
317
+ return `[${i + 1}] slug=${it.slug} source=${it.sot_source}\n${capped}`;
318
+ })
319
+ .join("\n\n---\n\n");
320
+ const prompt = `Classify each section. Return one entry per slug.\n\n${sections}`;
118
321
  const result = await runClaude({
119
322
  tier: "haiku",
120
- system: CLASSIFY_SYSTEM,
323
+ system: SECTION_SYSTEM,
121
324
  prompt,
122
- jsonSchema: CLASSIFY_SCHEMA,
123
- timeoutMs: PER_DOC_TIMEOUT_MS,
325
+ jsonSchema: SECTION_SCHEMA,
326
+ timeoutMs: SECTION_TIMEOUT_MS,
124
327
  isolateAmbientContext: true,
125
328
  });
126
329
  const parsed = result.parsed;
127
330
  if (typeof parsed !== "object" || parsed === null) {
128
- throw new Error("haiku returned non-object classification");
331
+ throw new Error("haiku section batch returned non-object");
129
332
  }
130
- const r = parsed;
131
- const kind = r["kind"];
132
- if (kind !== "decision" &&
133
- kind !== "domain-rule" &&
134
- kind !== "voice-guidelines" &&
135
- kind !== "api-docs" &&
136
- kind !== "other") {
137
- throw new Error(`haiku returned unexpected kind: ${String(kind)}`);
333
+ const arr = parsed["classifications"];
334
+ if (!Array.isArray(arr)) {
335
+ throw new Error("haiku section batch missing `classifications`");
138
336
  }
139
- return {
140
- kind,
141
- proposedTitle: typeof r["proposedTitle"] === "string" ? r["proposedTitle"] : "",
142
- };
337
+ const out = new Map();
338
+ for (const raw of arr) {
339
+ if (typeof raw !== "object" || raw === null)
340
+ continue;
341
+ const e = raw;
342
+ const slug = e["slug"];
343
+ const kind = e["kind"];
344
+ if (typeof slug !== "string")
345
+ continue;
346
+ if (kind !== "decision" &&
347
+ kind !== "domain-rule" &&
348
+ kind !== "voice-guidelines" &&
349
+ kind !== "api-docs" &&
350
+ kind !== "other") {
351
+ continue;
352
+ }
353
+ out.set(slug, {
354
+ kind,
355
+ proposedTitle: typeof e["proposedTitle"] === "string" ? e["proposedTitle"] : "",
356
+ });
357
+ }
358
+ return out;
143
359
  }
144
- /* -------------------------------------------------------------------------- */
145
- /* Orchestrator */
146
- /* -------------------------------------------------------------------------- */
147
360
  export async function runDocsIngestion(args) {
148
361
  const topicIndex = readTopicIndex(args.repoRoot);
149
362
  const anchorMap = readAnchorMap(args.repoRoot);
150
- const candidateEntries = Object.values(topicIndex.topics).filter((entry) => isDocSoT(entry) && entry.dec_id === undefined);
151
- if (candidateEntries.length === 0) {
363
+ const rejected = readRejectedYaml(args.repoRoot);
364
+ const allCandidates = Object.values(topicIndex.topics).filter((entry) => isDocSoT(entry) && entry.dec_id === undefined && !rejected.has(entry.slug));
365
+ if (allCandidates.length === 0) {
152
366
  log.info("phase 6 found no eligible docs entries in topic-index");
153
- return { decsWritten: [], skipped: [], scannedEntries: 0 };
154
- }
155
- let processed = 0;
156
- const result = await emitFromTopicIndex({
157
- repoRoot: args.repoRoot,
158
- topicIndex,
159
- anchorMap,
160
- filter: (entry) => isDocSoT(entry) && entry.dec_id === undefined,
161
- classifier: async ({ body, entry }) => {
367
+ writeFileCandidatesMap(args.repoRoot, topicIndex);
368
+ return zeroResult(allCandidates.length, topicIndex);
369
+ }
370
+ // Read each candidate body once. Stage 3 needs the body for title
371
+ // derivation; Stages 1/2 don't, but reading up front keeps the
372
+ // pipeline single-pass over entries. Bodies that fail to read are
373
+ // dropped — anchor-map drift is the only realistic cause and the
374
+ // entry stays as a candidate for the next phase 5b refresh.
375
+ const ctxBySlug = new Map();
376
+ for (const entry of allCandidates) {
377
+ const body = readSotBody(args.repoRoot, entry, anchorMap);
378
+ if (body === null)
379
+ continue;
380
+ ctxBySlug.set(entry.slug, { entry, body });
381
+ }
382
+ // ── Stage 3 — marker scan (deterministic, 0 Haiku) ──
383
+ const markerCandidates = [];
384
+ const nonMarkerCandidates = [];
385
+ for (const ctx of ctxBySlug.values()) {
386
+ if (ctx.entry.marker_kind !== undefined)
387
+ markerCandidates.push(ctx);
388
+ else
389
+ nonMarkerCandidates.push(ctx);
390
+ }
391
+ // ── Mock path — bypass Stages 1+2; run mockClassify on every
392
+ // non-marker candidate. Smokes only.
393
+ let sectionEmits = [];
394
+ let authoritativeFileCount = 0;
395
+ let filesEvaluated = 0;
396
+ if (args.mockClassify !== undefined) {
397
+ for (const ctx of nonMarkerCandidates) {
162
398
  let cls;
163
399
  try {
164
- cls = args.mockClassify !== undefined
165
- ? args.mockClassify(entry, body)
166
- : await classifyEntry(entry, body);
400
+ cls = args.mockClassify(ctx.entry, ctx.body);
167
401
  }
168
402
  catch (err) {
169
- log.warn({ slug: entry.slug, err: err instanceof Error ? err.message : String(err) }, "classifier failed; skipping");
170
- return { kind: "skip", title: "" };
171
- }
172
- processed += 1;
173
- if (args.onEntryProgress !== undefined) {
174
- args.onEntryProgress({
175
- slug: entry.slug,
176
- emitted: cls.kind === "decision" || cls.kind === "domain-rule",
177
- total: candidateEntries.length,
178
- });
403
+ log.warn({ slug: ctx.entry.slug, err: err instanceof Error ? err.message : String(err) }, "mockClassify failed; skipping");
404
+ continue;
179
405
  }
180
406
  if (cls.kind === "decision" || cls.kind === "domain-rule") {
181
- return { kind: "decision", title: cls.proposedTitle };
407
+ sectionEmits.push({ ctx, cls });
182
408
  }
183
- return { kind: "skip", title: cls.proposedTitle };
184
- },
185
- sot_kind: "path",
186
- capture_source: "init-docs-ingest",
187
- });
188
- writeSotBindings(args.repoRoot, result.bindings);
189
- writeSotCache(args.repoRoot, result.cache);
190
- writeTopicIndex(args.repoRoot, result.topicIndex);
191
- const decsWritten = result.emitted.map((rec) => ({
192
- id: rec.id,
193
- path: relativeDecPath(rec.id),
194
- sourceFile: rec.source_file,
195
- slug: rec.slug,
196
- }));
409
+ }
410
+ if (args.onChunkProgress !== undefined) {
411
+ args.onChunkProgress({
412
+ chunksDone: 1,
413
+ totalChunks: 1,
414
+ entriesDone: nonMarkerCandidates.length,
415
+ totalEntries: nonMarkerCandidates.length,
416
+ stage: "section-classify",
417
+ });
418
+ }
419
+ }
420
+ else {
421
+ // ── Stage 1 — file-purpose binary filter ──
422
+ const distinctFiles = [
423
+ ...new Set(nonMarkerCandidates.map((c) => c.entry.sot_source)),
424
+ ].sort();
425
+ filesEvaluated = distinctFiles.length;
426
+ const stage1Args = {
427
+ repoRoot: args.repoRoot,
428
+ files: distinctFiles,
429
+ };
430
+ if (args.onChunkProgress !== undefined) {
431
+ stage1Args.onChunkProgress = args.onChunkProgress;
432
+ }
433
+ const fileVerdicts = await runStage1FileFilter(stage1Args);
434
+ const authoritativeFiles = new Set();
435
+ for (const [path, v] of fileVerdicts.entries()) {
436
+ if (v.is_authoritative)
437
+ authoritativeFiles.add(path);
438
+ }
439
+ authoritativeFileCount = authoritativeFiles.size;
440
+ // ── Stage 2 — section batch classifier (scoped) ──
441
+ const stage2Inputs = nonMarkerCandidates.filter((c) => authoritativeFiles.has(c.entry.sot_source));
442
+ const stage2Args = {
443
+ candidates: stage2Inputs,
444
+ };
445
+ if (args.onChunkProgress !== undefined) {
446
+ stage2Args.onChunkProgress = args.onChunkProgress;
447
+ }
448
+ sectionEmits = await runStage2SectionClassifier(stage2Args);
449
+ }
450
+ // ── Stage 4 — emit drafts to `_inbox/` ──
451
+ const existingDecIds = args.existingDecIds ?? scanExistingDecIds(args.repoRoot);
452
+ const finalEmits = [
453
+ ...markerCandidates.map((ctx) => {
454
+ const kind = ctx.entry.marker_kind === "rule" ? "domain-rule" : "decision";
455
+ return { ctx, cls: { kind, proposedTitle: deriveMarkerTitle(ctx) } };
456
+ }),
457
+ ...sectionEmits,
458
+ ];
459
+ let updatedTopicIndex = topicIndex;
460
+ const decsWritten = [];
461
+ const skipped = [];
462
+ for (const { ctx, cls } of finalEmits) {
463
+ const sot_path = entryToSotPath(ctx.entry);
464
+ const titleSeed = cls.proposedTitle.length > 0
465
+ ? cls.proposedTitle
466
+ : firstLineFallback(ctx.body);
467
+ const id = allocateUniqueDecId({ sot_path, title: titleSeed, capture_source: CAPTURE_SOURCE }, existingDecIds);
468
+ const draftPath = writeDraftToInbox({
469
+ repoRoot: args.repoRoot,
470
+ id,
471
+ title: titleSeed,
472
+ body: ctx.body,
473
+ sot_path,
474
+ source_file: ctx.entry.sot_source,
475
+ });
476
+ decsWritten.push({
477
+ id,
478
+ path: relativeInboxPath(id),
479
+ sourceFile: ctx.entry.sot_source,
480
+ slug: ctx.entry.slug,
481
+ });
482
+ updatedTopicIndex = setTopic(updatedTopicIndex, ctx.entry.slug, {
483
+ ...ctx.entry,
484
+ dec_id: id,
485
+ });
486
+ log.debug({ id, slug: ctx.entry.slug, draftPath }, "phase 6 emitted draft");
487
+ }
488
+ // Refresh topic-index + file-candidates-map so the read-enrich hook
489
+ // sees the post-emit candidate counts. Anchor-map / sot-bindings /
490
+ // sot-cache stay untouched — drafts in `_inbox/` aren't canonical
491
+ // until the operator (or `cairn attention`) accepts them.
492
+ writeTopicIndex(args.repoRoot, updatedTopicIndex);
493
+ writeFileCandidatesMap(args.repoRoot, updatedTopicIndex);
494
+ const unpromotedCandidates = countUnpromoted(updatedTopicIndex);
197
495
  log.info({
198
- scanned: candidateEntries.length,
496
+ scanned: allCandidates.length,
199
497
  emitted: decsWritten.length,
200
- skipped: result.skipped.length,
201
- processed,
498
+ markerEmits: markerCandidates.length,
499
+ sectionEmits: sectionEmits.length,
500
+ authoritativeFiles: authoritativeFileCount,
501
+ filesEvaluated,
502
+ unpromotedCandidates,
202
503
  }, "phase 6 complete");
203
504
  return {
204
505
  decsWritten,
205
- skipped: result.skipped,
206
- scannedEntries: candidateEntries.length,
506
+ skipped,
507
+ scannedEntries: allCandidates.length,
508
+ markerEmits: markerCandidates.length,
509
+ sectionEmits: sectionEmits.length,
510
+ authoritativeFiles: authoritativeFileCount,
511
+ filesEvaluated,
512
+ unpromotedCandidates,
207
513
  };
208
514
  }
209
- function relativeDecPath(id) {
210
- return `.cairn/ground/decisions/${id}.md`;
515
+ /* -------------------------------------------------------------------------- */
516
+ /* Stage runners */
517
+ /* -------------------------------------------------------------------------- */
518
+ export async function runStage1FileFilter(args) {
519
+ const verdicts = new Map();
520
+ if (args.files.length === 0)
521
+ return verdicts;
522
+ const inputs = buildFileFilterInputs(args.repoRoot, args.files);
523
+ const chunks = [];
524
+ for (let i = 0; i < inputs.length; i += FILE_FILTER_BATCH_SIZE) {
525
+ chunks.push(inputs.slice(i, i + FILE_FILTER_BATCH_SIZE));
526
+ }
527
+ let nextIdx = 0;
528
+ let chunksDone = 0;
529
+ let entriesDone = 0;
530
+ const worker = async () => {
531
+ for (;;) {
532
+ const idx = nextIdx;
533
+ nextIdx += 1;
534
+ if (idx >= chunks.length)
535
+ return;
536
+ const chunk = chunks[idx];
537
+ try {
538
+ const map = await classifyFileBatch(chunk);
539
+ for (const [path, v] of map.entries())
540
+ verdicts.set(path, v);
541
+ }
542
+ catch (err) {
543
+ log.warn({ chunkIdx: idx, size: chunk.length, err: err instanceof Error ? err.message : String(err) }, "phase 6 stage 1 file-filter failed; chunk treated as non-authoritative");
544
+ }
545
+ chunksDone += 1;
546
+ entriesDone += chunk.length;
547
+ if (args.onChunkProgress !== undefined) {
548
+ args.onChunkProgress({
549
+ chunksDone,
550
+ totalChunks: chunks.length,
551
+ entriesDone,
552
+ totalEntries: inputs.length,
553
+ stage: "file-filter",
554
+ });
555
+ }
556
+ }
557
+ };
558
+ await Promise.all(Array.from({ length: Math.min(FILE_FILTER_CONCURRENCY, Math.max(1, chunks.length)) }, () => worker()));
559
+ return verdicts;
211
560
  }
212
- /**
213
- * Phase 6 owns every topic-index entry whose SoT candidate was tagged
214
- * `kind="doc"` by the phase 5b walker. Path-prefix matching would lock
215
- * us to `docs/` and miss `documentation/`, `official_docs/`, etc.; the
216
- * walker's per-candidate kind is already the right discriminant.
217
- */
561
/**
 * Stage 2: classify candidate sections in batches and keep the ones whose
 * verdict is a decision or a domain rule.
 *
 * Candidates are split into batches of `SECTION_BATCH_SIZE` and handed to
 * `classifySectionBatch` by up to `SECTION_CONCURRENCY` concurrent workers.
 * A batch that throws is logged and skipped; it still counts toward progress.
 *
 * @param {object} args
 * @param {Array<object>} args.candidates - items exposing `entry.slug`,
 *   `entry.sot_source` and `body`.
 * @param {Function} [args.onChunkProgress] - optional callback invoked after
 *   every batch (successful or not) with running totals.
 * @returns {Promise<Array<{ctx: object, cls: object}>>} one pair per slug whose
 *   verdict kind is `"decision"` or `"domain-rule"`, in verdict insertion order.
 */
async function runStage2SectionClassifier(args) {
    if (args.candidates.length === 0) {
        return [];
    }
    // Build the classifier payloads and the slug -> candidate lookup in one pass.
    const payloads = [];
    const candidateBySlug = new Map();
    for (const candidate of args.candidates) {
        payloads.push({
            slug: candidate.entry.slug,
            body: candidate.body,
            sot_source: candidate.entry.sot_source,
        });
        candidateBySlug.set(candidate.entry.slug, candidate);
    }
    const batches = [];
    let offset = 0;
    while (offset < payloads.length) {
        batches.push(payloads.slice(offset, offset + SECTION_BATCH_SIZE));
        offset += SECTION_BATCH_SIZE;
    }
    const classified = new Map();
    let cursor = 0;
    let finishedBatches = 0;
    let finishedEntries = 0;
    // Each worker claims the next unclaimed batch index until none remain.
    const drain = async () => {
        while (cursor < batches.length) {
            const batchIdx = cursor;
            cursor += 1;
            const batch = batches[batchIdx];
            try {
                const result = await classifySectionBatch(batch);
                for (const [slug, cls] of result.entries()) {
                    classified.set(slug, cls);
                }
            }
            catch (err) {
                const reason = err instanceof Error ? err.message : String(err);
                log.warn({ chunkIdx: batchIdx, size: batch.length, err: reason }, "phase 6 stage 2 batch failed; chunk skipped");
            }
            finishedBatches += 1;
            finishedEntries += batch.length;
            if (args.onChunkProgress !== undefined) {
                args.onChunkProgress({
                    chunksDone: finishedBatches,
                    totalChunks: batches.length,
                    entriesDone: finishedEntries,
                    totalEntries: payloads.length,
                    stage: "section-classify",
                });
            }
        }
    };
    const poolSize = Math.min(SECTION_CONCURRENCY, Math.max(1, batches.length));
    await Promise.all(Array.from({ length: poolSize }, () => drain()));
    const accepted = [];
    for (const [slug, cls] of classified.entries()) {
        const keep = cls.kind === "decision" || cls.kind === "domain-rule";
        const ctx = candidateBySlug.get(slug);
        if (keep && ctx !== undefined) {
            accepted.push({ ctx, cls });
        }
    }
    return accepted;
}
618
/**
 * Write a draft ADR file `<id>.draft.md` into the `_inbox` folder under the
 * repository's decisions directory, creating the folder if needed.
 *
 * The file is a YAML front-matter block (serialized with `stringifyYaml`)
 * followed by a blank line and the body with trailing whitespace trimmed.
 * `generated`, `verified-at` and `decided_at` all carry the same timestamp.
 *
 * @param {object} args
 * @param {string} args.repoRoot - passed to `decisionsDir`.
 * @param {string} args.id - draft id; also the file-name stem.
 * @param {string} args.title
 * @param {string} args.sot_path
 * @param {string} args.body - draft body; hashed untrimmed via `bodyContentHash`.
 * @param {string} args.source_file
 * @returns {string} path of the file that was written.
 */
function writeDraftToInbox(args) {
    const inboxDir = join(decisionsDir(args.repoRoot), "_inbox");
    mkdirSync(inboxDir, { recursive: true });
    const draftPath = join(inboxDir, `${args.id}.draft.md`);
    const timestamp = new Date().toISOString();
    const frontMatter = stringifyYaml({
        id: args.id,
        title: args.title,
        type: "adr",
        status: "draft",
        audience: "dual",
        generated: timestamp,
        "verified-at": timestamp,
        decided_at: timestamp,
        decided_by: DECIDED_BY,
        sot_kind: "path",
        sot_path: args.sot_path,
        sot_content_hash: bodyContentHash(args.body),
        capture_source: CAPTURE_SOURCE,
        source_file: args.source_file,
    });
    // The trailing empty element yields the final newline after joining.
    const content = [
        "---",
        frontMatter.trimEnd(),
        "---",
        "",
        args.body.trimEnd(),
        "",
    ].join("\n");
    writeFileSync(draftPath, content, "utf8");
    return draftPath;
}
649
/**
 * Build the `/`-separated inbox path string for a draft id.
 *
 * @param {string} id - draft id used as the file-name stem.
 * @returns {string} `.cairn/ground/decisions/_inbox/<id>.draft.md`.
 */
function relativeInboxPath(id) {
    const fileName = `${id}.draft.md`;
    return [".cairn", "ground", "decisions", "_inbox", fileName].join("/");
}
652
+ /* -------------------------------------------------------------------------- */
653
+ /* Helpers */
654
+ /* -------------------------------------------------------------------------- */
218
655
  function isDocSoT(entry) {
219
656
  const sot = entry.candidates.find((c) => c.file === entry.sot_source);
220
657
  return sot !== undefined && sot.kind === "doc";
221
658
  }
659
/**
 * Resolve the source-of-truth path for an entry.
 *
 * When the candidate whose `file` equals `entry.sot_source` carries a
 * non-empty `anchor`, the result is `<sot_source>#<anchor>`; otherwise it is
 * `entry.sot_source` unchanged.
 *
 * @param {object} entry - entry with `candidates` and `sot_source`.
 * @returns {string}
 */
function entryToSotPath(entry) {
    const anchor = entry.candidates.find((candidate) => candidate.file === entry.sot_source)?.anchor;
    const hasAnchor = anchor !== undefined && anchor.length > 0;
    return hasAnchor ? `${entry.sot_source}#${anchor}` : entry.sot_source;
}
668
+ // firstLineFallback now lives in sot-emit.ts (single source of truth).
669
+ // Imported above as `firstLineFallback`.
670
/**
 * Derive a title for a marker from its source-of-truth candidate.
 *
 * If the SoT candidate has a non-empty `anchor`, runs of `-`/`_` become
 * single spaces, the result is trimmed and capped at 120 characters. When
 * there is no usable anchor, or the cleaned anchor is empty, the title comes
 * from `firstLineFallback(ctx.body)` instead.
 *
 * @param {object} ctx - context with `entry` (`candidates`, `sot_source`) and `body`.
 * @returns {string}
 */
function deriveMarkerTitle(ctx) {
    const anchor = ctx.entry.candidates.find((candidate) => candidate.file === ctx.entry.sot_source)?.anchor;
    if (anchor === undefined || anchor.length === 0) {
        return firstLineFallback(ctx.body);
    }
    const fromAnchor = anchor.replace(/[-_]+/g, " ").trim().slice(0, 120);
    if (fromAnchor.length > 0) {
        return fromAnchor;
    }
    // The anchor consisted only of separators/whitespace.
    return firstLineFallback(ctx.body);
}
679
/**
 * Pick a DEC id for `input` that is not already in `existingIds`, and
 * record it there.
 *
 * The id from `deriveDecId(input)` is used when free. On a collision the
 * title is retried with ` #2`, ` #3`, ... up to ` #999` until a free id
 * turns up. If every attempt collides, the original (colliding) id is
 * returned so the later filesystem write fails loudly, rather than
 * inventing a random suffix that would not be reproducible on re-runs.
 *
 * @param {object} input - argument for `deriveDecId`; must carry `title`.
 * @param {Set<string>} existingIds - ids already taken; mutated in place.
 * @returns {string} the allocated id.
 */
function allocateUniqueDecId(input, existingIds) {
    const baseId = deriveDecId(input);
    let chosen = baseId;
    if (existingIds.has(baseId)) {
        for (let attempt = 2; attempt < 1_000; attempt += 1) {
            const candidate = deriveDecId({ ...input, title: `${input.title} #${attempt}` });
            if (!existingIds.has(candidate)) {
                chosen = candidate;
                break;
            }
        }
    }
    existingIds.add(chosen);
    return chosen;
}
704
/**
 * Collect the DEC ids already present on disk.
 *
 * Looks at regular files directly inside the decisions directory and its
 * `_inbox` subdirectory, and records the leading `DEC-<7+ lowercase hex>`
 * prefix of each matching file name. A directory that cannot be read is
 * skipped.
 *
 * @param {string} repoRoot - passed to `decisionsDir`.
 * @returns {Set<string>} ids found.
 */
function scanExistingDecIds(repoRoot) {
    const idPrefix = /^(DEC-[0-9a-f]{7,})/;
    const found = new Set();
    const root = decisionsDir(repoRoot);
    for (const folder of [root, join(root, "_inbox")]) {
        let dirents;
        try {
            dirents = readdirSync(folder, { withFileTypes: true, encoding: "utf8" });
        }
        catch {
            // Unreadable or missing folder: nothing to collect from it.
            continue;
        }
        for (const dirent of dirents) {
            const hit = dirent.isFile() ? idPrefix.exec(dirent.name) : null;
            if (hit !== null) {
                found.add(hit[1]);
            }
        }
    }
    return found;
}
726
/**
 * Count the topics in a topic index whose `dec_id` is `undefined`.
 *
 * @param {object} topicIndex - object with a `topics` map.
 * @returns {number}
 */
function countUnpromoted(topicIndex) {
    return Object.values(topicIndex.topics)
        .filter((topic) => topic.dec_id === undefined)
        .length;
}
734
/**
 * Build a result object for a run that wrote nothing: empty lists, zeroed
 * counters, the given scanned count, and the number of unpromoted topics as
 * computed by `countUnpromoted`.
 *
 * @param {number} scanned - value reported as `scannedEntries`.
 * @param {object} topicIndex - passed to `countUnpromoted`.
 * @returns {object} the empty result.
 */
function zeroResult(scanned, topicIndex) {
    const unpromoted = countUnpromoted(topicIndex);
    const result = {
        decsWritten: [],
        skipped: [],
        scannedEntries: scanned,
        markerEmits: 0,
        sectionEmits: 0,
        authoritativeFiles: 0,
        filesEvaluated: 0,
        unpromotedCandidates: unpromoted,
    };
    return result;
}
222
746
  //# sourceMappingURL=ingest-docs.js.map