llm-wiki-compiler 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -6,13 +6,15 @@ import { createRequire } from "module";
6
6
  import { Command } from "commander";
7
7
 
8
8
  // src/commands/ingest.ts
9
- import path3 from "path";
10
- import { mkdir as mkdir2, writeFile as writeFile2 } from "fs/promises";
9
+ import path7 from "path";
10
+ import { mkdir as mkdir2, readFile as readFile6, writeFile as writeFile2 } from "fs/promises";
11
11
 
12
12
  // src/utils/markdown.ts
13
13
  import { writeFile, rename, readFile, mkdir } from "fs/promises";
14
14
  import path from "path";
15
15
  import yaml from "js-yaml";
16
+ var SPAN_SUFFIX_PATTERN = /^(?<file>[^:#]+)(?:(?::(?<colonStart>\d+)(?:-(?<colonEnd>\d+))?)|(?:#L(?<hashStart>\d+)(?:-L(?<hashEnd>\d+))?))?$/;
17
+ var MIN_LINE_NUMBER = 1;
16
18
  var VALID_PROVENANCE_STATES = /* @__PURE__ */ new Set([
17
19
  "extracted",
18
20
  "merged",
@@ -49,6 +51,23 @@ async function atomicWrite(filePath, content) {
49
51
  await writeFile(tmpPath, content, "utf-8");
50
52
  await rename(tmpPath, filePath);
51
53
  }
54
+ function isValidLineRange(start, end) {
55
+ return start >= MIN_LINE_NUMBER && end >= start;
56
+ }
57
+ function isMalformedCitationEntry(entry) {
58
+ const trimmed = entry.trim();
59
+ if (trimmed.length === 0) return true;
60
+ if (!trimmed.includes(":") && !trimmed.includes("#")) return false;
61
+ const match = SPAN_SUFFIX_PATTERN.exec(trimmed);
62
+ if (!match || !match.groups) return true;
63
+ const { colonStart, colonEnd, hashStart, hashEnd } = match.groups;
64
+ const start = colonStart ?? hashStart;
65
+ const end = colonEnd ?? hashEnd;
66
+ if (start === void 0) return false;
67
+ const startLine = Number(start);
68
+ const endLine = end === void 0 ? startLine : Number(end);
69
+ return !isValidLineRange(startLine, endLine);
70
+ }
52
71
  async function safeReadFile(filePath) {
53
72
  try {
54
73
  return await readFile(filePath, "utf-8");
@@ -120,6 +139,8 @@ var PROVIDER_MODELS = {
120
139
  minimax: "MiniMax-M2.7"
121
140
  };
122
141
  var OLLAMA_DEFAULT_HOST = "http://localhost:11434/v1";
142
+ var OPENAI_DEFAULT_TIMEOUT_MS = 10 * 60 * 1e3;
143
+ var OLLAMA_DEFAULT_TIMEOUT_MS = 30 * 60 * 1e3;
123
144
  var SOURCES_DIR = "sources";
124
145
  var CONCEPTS_DIR = "wiki/concepts";
125
146
  var QUERIES_DIR = "wiki/queries";
@@ -129,9 +150,17 @@ var LOCK_FILE = ".llmwiki/lock";
129
150
  var INDEX_FILE = "wiki/index.md";
130
151
  var MOC_FILE = "wiki/MOC.md";
131
152
  var EMBEDDINGS_FILE = ".llmwiki/embeddings.json";
153
+ var IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".jpg", ".jpeg", ".png", ".gif", ".webp"]);
154
+ var TRANSCRIPT_EXTENSIONS = /* @__PURE__ */ new Set([".vtt", ".srt"]);
155
+ var IMAGE_DESCRIBE_MAX_TOKENS = 2048;
132
156
  var CANDIDATES_DIR = ".llmwiki/candidates";
133
157
  var CANDIDATES_ARCHIVE_DIR = ".llmwiki/candidates/archive";
134
158
  var EMBEDDING_TOP_K = 15;
159
+ var CHUNK_TOP_K = 30;
160
+ var CHUNK_RERANK_KEEP = 12;
161
+ var CHUNK_TARGET_CHARS = 800;
162
+ var CHUNK_MAX_CHARS = 1400;
163
+ var CHUNK_MIN_CHARS = 200;
135
164
  var LOW_CONFIDENCE_THRESHOLD = 0.5;
136
165
  var MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS = 2;
137
166
  var EMBEDDING_MODELS = {
@@ -216,19 +245,24 @@ async function ingestWeb(url) {
216
245
 
217
246
  // src/ingest/file.ts
218
247
  import { readFile as readFile2 } from "fs/promises";
248
+ import path3 from "path";
249
+
250
+ // src/ingest/shared.ts
219
251
  import path2 from "path";
220
- var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
221
252
  function titleFromFilename(filePath) {
222
253
  const basename = path2.basename(filePath, path2.extname(filePath));
223
254
  return basename.replace(/[-_]+/g, " ").trim();
224
255
  }
256
+
257
+ // src/ingest/file.ts
258
+ var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
225
259
  function wrapPlainText(text) {
226
260
  return `\`\`\`
227
261
  ${text}
228
262
  \`\`\``;
229
263
  }
230
264
  async function ingestFile(filePath) {
231
- const ext = path2.extname(filePath).toLowerCase();
265
+ const ext = path3.extname(filePath).toLowerCase();
232
266
  if (!SUPPORTED_EXTENSIONS.has(ext)) {
233
267
  throw new Error(
234
268
  `Unsupported file type "${ext}". Only .md and .txt files are supported.`
@@ -240,208 +274,36 @@ async function ingestFile(filePath) {
240
274
  return { title, content };
241
275
  }
242
276
 
243
- // src/commands/ingest.ts
244
- function isUrl(source2) {
245
- return source2.startsWith("http://") || source2.startsWith("https://");
246
- }
247
- function enforceCharLimit(content) {
248
- if (content.length <= MAX_SOURCE_CHARS) {
249
- return { content, truncated: false, originalChars: content.length };
250
- }
251
- status(
252
- "!",
253
- warn(
254
- `Content truncated from ${content.length.toLocaleString()} to ${MAX_SOURCE_CHARS.toLocaleString()} characters.`
255
- )
256
- );
257
- return {
258
- content: content.slice(0, MAX_SOURCE_CHARS),
259
- truncated: true,
260
- originalChars: content.length
261
- };
262
- }
263
- function enforceMinContent(content) {
264
- const length = content.trim().length;
265
- if (length === 0) {
266
- throw new Error(
267
- "No readable content could be extracted from the source."
268
- );
269
- }
270
- if (length < MIN_SOURCE_CHARS) {
271
- status(
272
- "!",
273
- warn(
274
- `Content seems very short (${length} chars, minimum recommended is ${MIN_SOURCE_CHARS}).`
275
- )
276
- );
277
- }
278
- }
279
- function buildDocument(title, source2, result) {
280
- const meta = {
281
- title,
282
- source: source2,
283
- ingestedAt: (/* @__PURE__ */ new Date()).toISOString()
284
- };
285
- if (result.truncated) {
286
- meta.truncated = true;
287
- meta.originalChars = result.originalChars;
277
+ // src/ingest/pdf.ts
278
+ import { readFile as readFile3 } from "fs/promises";
279
+ function resolveTitle(filePath, info2) {
280
+ if (info2 && typeof info2 === "object") {
281
+ const titleField = info2["Title"];
282
+ if (typeof titleField === "string" && titleField.trim().length > 0) {
283
+ return titleField.trim();
284
+ }
288
285
  }
289
- const frontmatter = buildFrontmatter(meta);
290
- return `${frontmatter}
291
-
292
- ${result.content}
293
- `;
294
- }
295
- async function saveSource(title, document) {
296
- const filename = `${slugify(title)}.md`;
297
- const destPath = path3.join(SOURCES_DIR, filename);
298
- await mkdir2(SOURCES_DIR, { recursive: true });
299
- await writeFile2(destPath, document, "utf-8");
300
- return destPath;
301
- }
302
- async function ingestSource(source2) {
303
- status("*", info(`Ingesting: ${source2}`));
304
- const { title, content } = isUrl(source2) ? await ingestWeb(source2) : await ingestFile(source2);
305
- const result = enforceCharLimit(content);
306
- enforceMinContent(result.content);
307
- const document = buildDocument(title, source2, result);
308
- const savedPath = await saveSource(title, document);
309
- return {
310
- filename: path3.basename(savedPath),
311
- charCount: result.content.length,
312
- truncated: result.truncated,
313
- source: source2
314
- };
315
- }
316
- async function ingest(source2) {
317
- const result = await ingestSource(source2);
318
- const savedPath = path3.join(SOURCES_DIR, result.filename);
319
- status(
320
- "+",
321
- success(`Saved ${bold(result.filename)} \u2192 ${source(savedPath)}`)
322
- );
323
- status("\u2192", dim("Next: llmwiki compile"));
324
- }
325
-
326
- // src/commands/compile.ts
327
- import { existsSync as existsSync5 } from "fs";
328
-
329
- // src/compiler/index.ts
330
- import { readFile as readFile8 } from "fs/promises";
331
- import path16 from "path";
332
-
333
- // src/utils/state.ts
334
- import { readFile as readFile3, writeFile as writeFile3, rename as rename2, mkdir as mkdir3, copyFile } from "fs/promises";
335
- import { existsSync } from "fs";
336
- import path4 from "path";
337
- function emptyState() {
338
- return { version: 1, indexHash: "", sources: {} };
286
+ return titleFromFilename(filePath);
339
287
  }
340
- async function readState(root) {
341
- const filePath = path4.join(root, STATE_FILE);
342
- if (!existsSync(filePath)) {
343
- return emptyState();
344
- }
288
+ async function ingestPdf(filePath) {
289
+ const { PDFParse } = await import("pdf-parse");
290
+ const buffer = await readFile3(filePath);
291
+ const parser = new PDFParse({ data: new Uint8Array(buffer) });
345
292
  try {
346
- const raw = await readFile3(filePath, "utf-8");
347
- return JSON.parse(raw);
348
- } catch {
349
- const bakPath = filePath + ".bak";
350
- console.warn(`\u26A0 Corrupt state.json \u2014 backed up to ${bakPath}, starting fresh.`);
351
- await copyFile(filePath, bakPath);
352
- return emptyState();
293
+ const textResult = await parser.getText();
294
+ const infoResult = await parser.getInfo();
295
+ const title = resolveTitle(filePath, infoResult.info);
296
+ const content = textResult.text.trim();
297
+ return { title, content };
298
+ } finally {
299
+ await parser.destroy();
353
300
  }
354
301
  }
355
- async function writeState(root, state) {
356
- const dir = path4.join(root, LLMWIKI_DIR);
357
- await mkdir3(dir, { recursive: true });
358
- const filePath = path4.join(root, STATE_FILE);
359
- const tmpPath = filePath + ".tmp";
360
- await writeFile3(tmpPath, JSON.stringify(state, null, 2), "utf-8");
361
- await rename2(tmpPath, filePath);
362
- }
363
- async function updateSourceState(root, sourceFile, entry) {
364
- const state = await readState(root);
365
- state.sources[sourceFile] = entry;
366
- await writeState(root, state);
367
- }
368
- async function removeSourceState(root, sourceFile) {
369
- const state = await readState(root);
370
- delete state.sources[sourceFile];
371
- await writeState(root, state);
372
- }
373
-
374
- // src/compiler/source-state.ts
375
- import path6 from "path";
376
302
 
377
- // src/compiler/hasher.ts
378
- import { createHash } from "crypto";
379
- import { readFile as readFile4, readdir } from "fs/promises";
303
+ // src/ingest/image.ts
304
+ import { readFile as readFile4 } from "fs/promises";
380
305
  import path5 from "path";
381
- async function hashFile(filePath) {
382
- const content = await readFile4(filePath, "utf-8");
383
- return createHash("sha256").update(content).digest("hex");
384
- }
385
- async function detectChanges(root, prevState) {
386
- const sourcesPath = path5.join(root, SOURCES_DIR);
387
- const currentFiles = await listSourceFiles(sourcesPath);
388
- const changes = [];
389
- for (const file of currentFiles) {
390
- const status2 = await classifyFile(root, file, prevState);
391
- changes.push({ file, status: status2 });
392
- }
393
- const deletedChanges = findDeletedFiles(currentFiles, prevState);
394
- changes.push(...deletedChanges);
395
- return changes;
396
- }
397
- async function listSourceFiles(sourcesPath) {
398
- try {
399
- const entries = await readdir(sourcesPath);
400
- return entries.filter((f) => f.endsWith(".md"));
401
- } catch {
402
- return [];
403
- }
404
- }
405
- async function classifyFile(root, file, prevState) {
406
- const filePath = path5.join(root, SOURCES_DIR, file);
407
- const hash = await hashFile(filePath);
408
- const prev = prevState.sources[file];
409
- if (!prev) return "new";
410
- if (prev.hash !== hash) return "changed";
411
- return "unchanged";
412
- }
413
- function findDeletedFiles(currentFiles, prevState) {
414
- const currentSet = new Set(currentFiles);
415
- return Object.keys(prevState.sources).filter((file) => !currentSet.has(file)).map((file) => ({ file, status: "deleted" }));
416
- }
417
-
418
- // src/compiler/source-state.ts
419
- async function buildExtractionSourceStates(root, extractions) {
420
- const snapshot = {};
421
- const compiledAt = (/* @__PURE__ */ new Date()).toISOString();
422
- for (const result of extractions) {
423
- if (result.concepts.length === 0) continue;
424
- snapshot[result.sourceFile] = await buildEntry(root, result, compiledAt);
425
- }
426
- return snapshot;
427
- }
428
- async function buildEntry(root, result, compiledAt) {
429
- const filePath = path6.join(root, SOURCES_DIR, result.sourceFile);
430
- const hash = await hashFile(filePath);
431
- return {
432
- hash,
433
- concepts: result.concepts.map((concept) => slugify(concept.concept)),
434
- compiledAt
435
- };
436
- }
437
- function pickStatesForSources(allStates, sourceFiles) {
438
- const picked = {};
439
- for (const file of sourceFiles) {
440
- const entry = allStates[file];
441
- if (entry) picked[file] = entry;
442
- }
443
- return picked;
444
- }
306
+ import Anthropic2 from "@anthropic-ai/sdk";
445
307
 
446
308
  // src/providers/anthropic.ts
447
309
  import Anthropic from "@anthropic-ai/sdk";
@@ -554,160 +416,46 @@ var AnthropicProvider = class {
554
416
  }
555
417
  };
556
418
 
557
- // src/providers/openai.ts
558
- import OpenAI from "openai";
559
- function translateToolToOpenAI(tool) {
560
- return {
561
- type: "function",
562
- function: {
563
- name: tool.name,
564
- description: tool.description,
565
- parameters: tool.input_schema
419
+ // src/utils/claude-settings.ts
420
+ import { readFileSync } from "fs";
421
+ import { homedir } from "os";
422
+ import path4 from "path";
423
+ var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
424
+ function isRecord(value) {
425
+ return typeof value === "object" && value !== null;
426
+ }
427
+ function normalize(value) {
428
+ if (typeof value !== "string") return void 0;
429
+ const trimmed = value.trim();
430
+ return trimmed.length > 0 ? trimmed : void 0;
431
+ }
432
+ function resolveClaudeSettingsPath(env) {
433
+ return env[CLAUDE_SETTINGS_PATH_ENV] ?? path4.join(homedir(), ".claude", "settings.json");
434
+ }
435
+ function readClaudeSettingsFile(settingsPath) {
436
+ try {
437
+ return readFileSync(settingsPath, "utf8");
438
+ } catch (err) {
439
+ if (isRecord(err) && err.code === "ENOENT") {
440
+ return void 0;
566
441
  }
567
- };
442
+ const message = err instanceof Error ? err.message : String(err);
443
+ throw new Error(`Failed to read Claude settings at "${settingsPath}": ${message}`);
444
+ }
568
445
  }
569
- var OpenAIProvider = class {
570
- client;
571
- embeddingsClient;
572
- model;
573
- configuredEmbeddingModel;
574
- constructor(model, options = {}) {
575
- this.model = model;
576
- this.configuredEmbeddingModel = options.embeddingModel;
577
- const resolvedKey = options.apiKey ?? process.env.OPENAI_API_KEY ?? "";
578
- this.client = new OpenAI({
579
- apiKey: resolvedKey,
580
- baseURL: options.baseURL ?? null
581
- });
582
- this.embeddingsClient = options.embeddingsBaseURL ? new OpenAI({ apiKey: resolvedKey, baseURL: options.embeddingsBaseURL }) : this.client;
446
+ function readClaudeSettingsEnv(env = process.env) {
447
+ const settingsPath = resolveClaudeSettingsPath(env);
448
+ const raw = readClaudeSettingsFile(settingsPath);
449
+ if (!raw) return void 0;
450
+ let parsed;
451
+ try {
452
+ parsed = JSON.parse(raw);
453
+ } catch (err) {
454
+ const message = err instanceof Error ? err.message : String(err);
455
+ throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${message}`);
583
456
  }
584
- /** Send a single non-streaming completion request. */
585
- async complete(system, messages, maxTokens) {
586
- const response = await this.client.chat.completions.create({
587
- model: this.model,
588
- max_tokens: maxTokens,
589
- messages: [{ role: "system", content: system }, ...messages]
590
- });
591
- return response.choices[0]?.message?.content ?? "";
592
- }
593
- /** Stream a completion, invoking onToken for each text chunk. */
594
- async stream(system, messages, maxTokens, onToken) {
595
- const stream = await this.client.chat.completions.create({
596
- model: this.model,
597
- max_tokens: maxTokens,
598
- messages: [{ role: "system", content: system }, ...messages],
599
- stream: true
600
- });
601
- let fullText = "";
602
- for await (const chunk of stream) {
603
- const delta = chunk.choices[0]?.delta?.content;
604
- if (delta) {
605
- fullText += delta;
606
- onToken?.(delta);
607
- }
608
- }
609
- return fullText;
610
- }
611
- /** Call the model with tool definitions and return the parsed tool input as JSON. */
612
- async toolCall(system, messages, tools, maxTokens) {
613
- const openaiTools = tools.map(translateToolToOpenAI);
614
- const response = await this.client.chat.completions.create({
615
- model: this.model,
616
- max_tokens: maxTokens,
617
- messages: [{ role: "system", content: system }, ...messages],
618
- tools: openaiTools
619
- });
620
- const toolCalls = response.choices[0]?.message?.tool_calls;
621
- if (toolCalls && toolCalls.length > 0) {
622
- return toolCalls[0].function.arguments;
623
- }
624
- return response.choices[0]?.message?.content ?? "";
625
- }
626
- /**
627
- * Produce a single embedding vector via the OpenAI embeddings API.
628
- * Subclasses (e.g. Ollama) override embeddingModel() to pick a different model.
629
- */
630
- async embed(text) {
631
- const response = await this.embeddingsClient.embeddings.create({
632
- model: this.embeddingModel(),
633
- input: text
634
- });
635
- const vector = response.data[0]?.embedding;
636
- if (!Array.isArray(vector)) {
637
- throw new Error("OpenAI embeddings response did not include a vector.");
638
- }
639
- return vector;
640
- }
641
- /** Default embedding model for this provider. Subclasses may override. */
642
- embeddingModel() {
643
- return this.configuredEmbeddingModel ?? EMBEDDING_MODELS.openai;
644
- }
645
- };
646
-
647
- // src/providers/ollama.ts
648
- var OllamaProvider = class extends OpenAIProvider {
649
- constructor(model, options) {
650
- super(model, {
651
- baseURL: options.baseURL,
652
- apiKey: "ollama",
653
- embeddingsBaseURL: options.embeddingsBaseURL,
654
- embeddingModel: options.embeddingModel
655
- });
656
- }
657
- /** Ollama ships a dedicated embedding model (nomic-embed-text). */
658
- embeddingModel() {
659
- return this.configuredEmbeddingModel ?? EMBEDDING_MODELS.ollama;
660
- }
661
- };
662
-
663
- // src/providers/minimax.ts
664
- var MINIMAX_BASE_URL = "https://api.minimax.io/v1";
665
- var MiniMaxProvider = class extends OpenAIProvider {
666
- constructor(model, apiKey) {
667
- super(model, { baseURL: MINIMAX_BASE_URL, apiKey });
668
- }
669
- };
670
-
671
- // src/utils/claude-settings.ts
672
- import { readFileSync } from "fs";
673
- import { homedir } from "os";
674
- import path7 from "path";
675
- var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
676
- function isRecord(value) {
677
- return typeof value === "object" && value !== null;
678
- }
679
- function normalize(value) {
680
- if (typeof value !== "string") return void 0;
681
- const trimmed = value.trim();
682
- return trimmed.length > 0 ? trimmed : void 0;
683
- }
684
- function resolveClaudeSettingsPath(env) {
685
- return env[CLAUDE_SETTINGS_PATH_ENV] ?? path7.join(homedir(), ".claude", "settings.json");
686
- }
687
- function readClaudeSettingsFile(settingsPath) {
688
- try {
689
- return readFileSync(settingsPath, "utf8");
690
- } catch (err) {
691
- if (isRecord(err) && err.code === "ENOENT") {
692
- return void 0;
693
- }
694
- const message = err instanceof Error ? err.message : String(err);
695
- throw new Error(`Failed to read Claude settings at "${settingsPath}": ${message}`);
696
- }
697
- }
698
- function readClaudeSettingsEnv(env = process.env) {
699
- const settingsPath = resolveClaudeSettingsPath(env);
700
- const raw = readClaudeSettingsFile(settingsPath);
701
- if (!raw) return void 0;
702
- let parsed;
703
- try {
704
- parsed = JSON.parse(raw);
705
- } catch (err) {
706
- const message = err instanceof Error ? err.message : String(err);
707
- throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${message}`);
708
- }
709
- if (!isRecord(parsed) || !isRecord(parsed.env)) {
710
- return void 0;
457
+ if (!isRecord(parsed) || !isRecord(parsed.env)) {
458
+ return void 0;
711
459
  }
712
460
  const values = {
713
461
  ANTHROPIC_API_KEY: normalize(parsed.env.ANTHROPIC_API_KEY),
@@ -763,312 +511,878 @@ function resolveAnthropicBaseURLFromEnv(env = process.env) {
763
511
  return validateAnthropicBaseURL(fallbackBaseURL);
764
512
  }
765
513
 
766
- // src/utils/provider.ts
767
- var SUPPORTED_PROVIDERS = /* @__PURE__ */ new Set(["anthropic", "openai", "ollama", "minimax"]);
768
- function getProvider() {
769
- const providerName = getProviderName();
770
- switch (providerName) {
771
- case "anthropic":
772
- return getAnthropicProvider();
773
- case "openai":
774
- return new OpenAIProvider(getModelForProvider("openai"), {
775
- baseURL: readOptionalEnv("OPENAI_BASE_URL"),
776
- embeddingsBaseURL: readOptionalEnv("OPENAI_EMBEDDINGS_BASE_URL"),
777
- embeddingModel: readOptionalEnv("LLMWIKI_EMBEDDING_MODEL")
778
- });
779
- case "ollama":
780
- return new OllamaProvider(getModelForProvider("ollama"), {
781
- baseURL: readOptionalEnv("OLLAMA_HOST") ?? OLLAMA_DEFAULT_HOST,
782
- embeddingsBaseURL: readOptionalEnv("OLLAMA_EMBEDDINGS_HOST"),
783
- embeddingModel: readOptionalEnv("LLMWIKI_EMBEDDING_MODEL")
784
- });
785
- case "minimax":
786
- return getMiniMaxProvider();
787
- default:
788
- throw new Error(`Unhandled provider: ${providerName}`);
789
- }
790
- }
791
- function readOptionalEnv(name) {
792
- const value = process.env[name]?.trim();
793
- return value ? value : void 0;
794
- }
795
- function getModelForProvider(providerName) {
796
- return process.env.LLMWIKI_MODEL ?? PROVIDER_MODELS[providerName];
797
- }
798
- function getMiniMaxProvider() {
799
- const apiKey = process.env.MINIMAX_API_KEY;
800
- if (!apiKey) {
514
+ // src/ingest/image.ts
515
+ var EXTENSION_TO_MIME = {
516
+ ".jpg": "image/jpeg",
517
+ ".jpeg": "image/jpeg",
518
+ ".png": "image/png",
519
+ ".gif": "image/gif",
520
+ ".webp": "image/webp"
521
+ };
522
+ function mimeTypeForExtension(ext) {
523
+ const mimeType = EXTENSION_TO_MIME[ext.toLowerCase()];
524
+ if (!mimeType) {
801
525
  throw new Error(
802
- "MiniMax provider requires MINIMAX_API_KEY environment variable.\n Set it with: export MINIMAX_API_KEY=your_key"
526
+ `Unsupported image extension "${ext}". Supported: ${Object.keys(EXTENSION_TO_MIME).join(", ")}`
803
527
  );
804
528
  }
805
- return new MiniMaxProvider(getModelForProvider("minimax"), apiKey);
529
+ return mimeType;
806
530
  }
807
- function getAnthropicProvider() {
808
- const model = resolveAnthropicModelFromEnv() ?? PROVIDER_MODELS.anthropic;
531
+ function buildClient() {
809
532
  const baseURL = resolveAnthropicBaseURLFromEnv();
810
533
  const auth = resolveAnthropicAuthFromEnv();
811
- return new AnthropicProvider(model, {
812
- baseURL,
813
- ...auth
534
+ return new Anthropic2(buildAnthropicClientOptions({ baseURL, ...auth }));
535
+ }
536
+ async function describeImageWithVision(client, model, imageData, mimeType) {
537
+ const response = await client.messages.create({
538
+ model,
539
+ max_tokens: IMAGE_DESCRIBE_MAX_TOKENS,
540
+ messages: [
541
+ {
542
+ role: "user",
543
+ content: [
544
+ {
545
+ type: "image",
546
+ source: { type: "base64", media_type: mimeType, data: imageData }
547
+ },
548
+ {
549
+ type: "text",
550
+ text: "Extract and transcribe all text visible in this image. Then provide a detailed description of any non-text visual content. Format your response as markdown."
551
+ }
552
+ ]
553
+ }
554
+ ]
814
555
  });
556
+ const textBlock = response.content.find((block) => block.type === "text");
557
+ return textBlock?.type === "text" ? textBlock.text : "";
815
558
  }
816
- function getProviderName() {
817
- const providerName = process.env.LLMWIKI_PROVIDER ?? DEFAULT_PROVIDER;
818
- if (!SUPPORTED_PROVIDERS.has(providerName)) {
559
+ async function ingestImage(filePath) {
560
+ const providerName = process.env.LLMWIKI_PROVIDER ?? "anthropic";
561
+ if (providerName !== "anthropic") {
819
562
  throw new Error(
820
- `Unknown provider "${providerName}". Supported: ${[...SUPPORTED_PROVIDERS].join(", ")}`
563
+ `Image ingest requires the Anthropic provider (vision). Current provider: "${providerName}". Set LLMWIKI_PROVIDER=anthropic and ANTHROPIC_API_KEY to use image ingest.`
821
564
  );
822
565
  }
823
- return providerName;
824
- }
825
- function getActiveProviderName() {
826
- return getProviderName();
566
+ const ext = path5.extname(filePath).toLowerCase();
567
+ const mimeType = mimeTypeForExtension(ext);
568
+ const imageBuffer = await readFile4(filePath);
569
+ const imageData = imageBuffer.toString("base64");
570
+ const client = buildClient();
571
+ const model = resolveAnthropicModelFromEnv() ?? PROVIDER_MODELS.anthropic;
572
+ const content = await describeImageWithVision(client, model, imageData, mimeType);
573
+ const title = titleFromFilename(filePath);
574
+ return { title, content };
827
575
  }
828
576
 
829
- // src/utils/llm.ts
830
- function sleep(ms) {
831
- return new Promise((resolve) => setTimeout(resolve, ms));
832
- }
833
- async function callClaude(options) {
834
- const { system, messages, tools, maxTokens = 4096, stream = false, onToken } = options;
835
- const provider = getProvider();
836
- for (let attempt = 0; attempt <= RETRY_COUNT; attempt++) {
837
- try {
838
- if (stream) {
839
- return await provider.stream(system, messages, maxTokens, onToken);
840
- }
841
- if (tools && tools.length > 0) {
842
- return await provider.toolCall(system, messages, tools, maxTokens);
843
- }
844
- return await provider.complete(system, messages, maxTokens);
845
- } catch (error2) {
846
- if (attempt === RETRY_COUNT) throw error2;
847
- const delayMs = RETRY_BASE_MS * Math.pow(RETRY_MULTIPLIER, attempt);
848
- const errMsg = error2 instanceof Error ? error2.message : String(error2);
849
- console.warn(`\u26A0 API call failed (attempt ${attempt + 1}/${RETRY_COUNT + 1}): ${errMsg}`);
850
- console.warn(` Retrying in ${delayMs / 1e3}s...`);
851
- await sleep(delayMs);
852
- }
577
+ // src/ingest/transcript.ts
578
+ import { readFile as readFile5 } from "fs/promises";
579
+ import path6 from "path";
580
+ import { YoutubeTranscript as YoutubeTranscriptUntyped } from "youtube-transcript/dist/youtube-transcript.esm.js";
581
+ var YoutubeTranscript = YoutubeTranscriptUntyped;
582
+ var YOUTUBE_URL_PATTERN = /^https?:\/\/(www\.)?(youtube\.com\/watch|youtu\.be\/)/;
583
+ var SRT_SEQUENCE_PATTERN = /^\d+$/;
584
+ var TIMESTAMP_PATTERN = /\d{2}:\d{2}[:.]\d{2}/;
585
+ var MS_PER_MINUTE = 6e4;
586
+ var MS_PER_SECOND = 1e3;
587
+ function isYoutubeUrl(source2) {
588
+ return YOUTUBE_URL_PATTERN.test(source2);
589
+ }
590
+ function extractVideoId(url) {
591
+ const match = url.match(/(?:v=|youtu\.be\/)([^&?/]+)/);
592
+ if (!match) {
593
+ throw new Error(`Could not extract video ID from YouTube URL: ${url}`);
853
594
  }
854
- throw new Error("Unreachable");
595
+ return match[1];
855
596
  }
856
-
857
- // src/utils/lock.ts
858
- import { open, readFile as readFile5, unlink, mkdir as mkdir4 } from "fs/promises";
859
- import path8 from "path";
860
- var RECLAIM_SUFFIX = ".reclaim";
861
- var MAX_ACQUIRE_ATTEMPTS = 2;
862
- function isProcessAlive(pid) {
863
- try {
864
- process.kill(pid, 0);
865
- return true;
866
- } catch {
867
- return false;
597
+ function formatOffset(offsetMs) {
598
+ const minutes = Math.floor(offsetMs / MS_PER_MINUTE);
599
+ const seconds = Math.floor(offsetMs % MS_PER_MINUTE / MS_PER_SECOND);
600
+ return `${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")}`;
601
+ }
602
+ async function fetchYoutubeTranscript(url) {
603
+ const videoId = extractVideoId(url);
604
+ const segments = await YoutubeTranscript.fetchTranscript(videoId);
605
+ if (!segments || segments.length === 0) {
606
+ throw new Error(`No transcript available for YouTube video: ${url}`);
868
607
  }
608
+ const lines = segments.map((seg) => `[${formatOffset(seg.offset)}] ${seg.text}`);
609
+ return {
610
+ title: `YouTube Transcript ${videoId}`,
611
+ content: lines.join("\n")
612
+ };
869
613
  }
870
- async function acquireLock(root) {
871
- const lockPath = path8.join(root, LOCK_FILE);
872
- await mkdir4(path8.join(root, LLMWIKI_DIR), { recursive: true });
873
- for (let attempt = 0; attempt < MAX_ACQUIRE_ATTEMPTS; attempt++) {
874
- const created = await tryCreateLock(lockPath);
875
- if (created) return true;
876
- const stale = await isLockStale(lockPath);
877
- if (!stale) {
878
- status("!", warn("Another compilation is running."));
879
- return false;
614
+ function isCueTimestamp(trimmed) {
615
+ return TIMESTAMP_PATTERN.test(trimmed) && trimmed.includes("-->");
616
+ }
617
+ function parseVtt(raw, filePath) {
618
+ const lines = raw.split("\n");
619
+ const output = [];
620
+ let inCue = false;
621
+ for (const line of lines) {
622
+ const trimmed = line.trim();
623
+ if (trimmed === "WEBVTT" || trimmed === "") {
624
+ inCue = false;
625
+ continue;
626
+ }
627
+ if (isCueTimestamp(trimmed)) {
628
+ output.push(`
629
+ **[${trimmed}]**`);
630
+ inCue = true;
631
+ continue;
632
+ }
633
+ if (inCue && trimmed.length > 0) {
634
+ output.push(trimmed);
880
635
  }
881
- const reclaimed = await reclaimStaleLock(root, lockPath);
882
- if (reclaimed) return true;
883
636
  }
884
- status("!", warn("Could not acquire lock after retrying."));
885
- return false;
637
+ return { title: titleFromFilename(filePath), content: output.join("\n").trim() };
886
638
  }
887
- async function reclaimStaleLock(root, lockPath) {
888
- const reclaimPath = lockPath + RECLAIM_SUFFIX;
889
- const gotReclaimLock = await acquireReclaimLock(reclaimPath);
890
- if (!gotReclaimLock) return false;
891
- try {
892
- if (!await isLockStale(lockPath)) {
893
- return false;
894
- }
895
- try {
896
- await unlink(lockPath);
897
- } catch {
639
+ function parseSrt(raw, filePath) {
640
+ const lines = raw.split("\n");
641
+ const output = [];
642
+ for (const line of lines) {
643
+ const trimmed = line.trim();
644
+ if (trimmed === "" || SRT_SEQUENCE_PATTERN.test(trimmed)) {
645
+ continue;
898
646
  }
899
- const acquired = await tryCreateLock(lockPath);
900
- if (acquired) {
901
- status("i", dim("Reclaimed stale lock from dead process."));
647
+ if (isCueTimestamp(trimmed)) {
648
+ output.push(`
649
+ **[${trimmed}]**`);
650
+ continue;
902
651
  }
903
- return acquired;
904
- } finally {
905
- try {
906
- await unlink(reclaimPath);
907
- } catch {
652
+ if (trimmed.length > 0) {
653
+ output.push(trimmed);
908
654
  }
909
655
  }
656
+ return { title: titleFromFilename(filePath), content: output.join("\n").trim() };
910
657
  }
911
- async function acquireReclaimLock(reclaimPath) {
912
- if (await tryCreateLock(reclaimPath)) return true;
913
- if (!await isLockStale(reclaimPath)) return false;
914
- try {
915
- await unlink(reclaimPath);
916
- } catch {
658
+ function parsePlainTranscript(raw, filePath) {
659
+ return { title: titleFromFilename(filePath), content: raw.trim() };
660
+ }
661
+ async function ingestTranscript(source2) {
662
+ if (isYoutubeUrl(source2)) {
663
+ return fetchYoutubeTranscript(source2);
917
664
  }
918
- return false;
665
+ const ext = path6.extname(source2).toLowerCase();
666
+ const raw = await readFile5(source2, "utf-8");
667
+ if (ext === ".vtt") return parseVtt(raw, source2);
668
+ if (ext === ".srt") return parseSrt(raw, source2);
669
+ if (ext === ".txt") return parsePlainTranscript(raw, source2);
670
+ throw new Error(
671
+ `Unsupported transcript file type "${ext}". Supported: .vtt, .srt, .txt`
672
+ );
919
673
  }
920
- async function tryCreateLock(lockPath) {
921
- try {
922
- const fd = await open(lockPath, "wx");
923
- await fd.writeFile(String(process.pid), "utf-8");
924
- await fd.close();
925
- return true;
926
- } catch (err) {
927
- if (err instanceof Error && "code" in err && err.code === "EEXIST") {
928
- return false;
929
- }
930
- throw err;
674
+
675
+ // src/commands/ingest.ts
676
/**
 * Report whether `source2` starts with an http:// or https:// scheme.
 * The comparison is case-sensitive.
 *
 * @param {string} source2 - User-supplied source (path or URL).
 * @returns {boolean}
 */
function isUrl(source2) {
  return ["http://", "https://"].some((scheme) => source2.startsWith(scheme));
}
679
// Tunables for the ".txt looks like a transcript" heuristic.

// Size of the leading sample that is inspected. The sample is cut with
// String.prototype.slice on decoded text, so this counts string units.
var TXT_SNIFF_BYTES = 2048;

// "Name: " at the start of a line; capture group 1 is the speaker name.
// Global + multiline, so callers must mind `lastIndex` when using exec().
var SPEAKER_TAG_PATTERN = /^([A-Z][a-zA-Z .'-]{0,40}):\s/gm;

// A line that begins (after optional whitespace) with m:ss, mm:ss or h:mm:ss.
var TIMESTAMP_PATTERN2 = /^\s*\d{1,2}:\d{2}(:\d{2})?/;

// Thresholds applied to the sample.
var MIN_TIMESTAMP_MATCHES = 3;
var MIN_SPEAKER_REPEAT_COUNT = 2;
var MIN_DISTINCT_SPEAKERS = 2;
685
/**
 * Tally how many lines in `sample` start with each "Speaker: " tag.
 *
 * @param {string} sample - Text to scan.
 * @returns {Map<string, number>} Trimmed speaker name -> number of tagged lines.
 */
function countSpeakerOccurrences(sample) {
  const tally = /* @__PURE__ */ new Map();
  // The pattern is a shared global regex; start scanning from the beginning.
  SPEAKER_TAG_PATTERN.lastIndex = 0;
  for (const [, rawName] of sample.matchAll(SPEAKER_TAG_PATTERN)) {
    const speaker = rawName.trim();
    tally.set(speaker, (tally.get(speaker) ?? 0) + 1);
  }
  return tally;
}
695
/**
 * Decide whether `sample` reads like a dialogue: it needs at least
 * MIN_DISTINCT_SPEAKERS different speaker tags, and at least one speaker
 * must appear MIN_SPEAKER_REPEAT_COUNT times or more.
 *
 * @param {string} sample - Text to inspect.
 * @returns {boolean}
 */
function hasSpeakerDialoguePattern(sample) {
  const tally = countSpeakerOccurrences(sample);
  if (tally.size < MIN_DISTINCT_SPEAKERS) return false;
  for (const occurrences of tally.values()) {
    if (occurrences >= MIN_SPEAKER_REPEAT_COUNT) return true;
  }
  return false;
}
704
/**
 * Sniff the start of a text file to guess whether it is a transcript.
 * The file is read in full; only the first TXT_SNIFF_BYTES units of the
 * decoded text are examined. It counts as a transcript when the sample has a
 * speaker-dialogue pattern or at least MIN_TIMESTAMP_MATCHES lines that
 * start with a timestamp.
 *
 * @param {string} filePath - Path of the .txt file.
 * @returns {Promise<boolean>}
 */
async function looksLikeTxtTranscript(filePath) {
  const text = await readFile6(filePath, "utf-8");
  const head = text.slice(0, TXT_SNIFF_BYTES);
  if (hasSpeakerDialoguePattern(head)) {
    return true;
  }
  // Rebuild the timestamp pattern with g+m so every line start is counted.
  const perLineTimestamp = new RegExp(TIMESTAMP_PATTERN2.source, "gm");
  const hits = head.match(perLineTimestamp) ?? [];
  return hits.length >= MIN_TIMESTAMP_MATCHES;
}
711
/**
 * Cap `content` at MAX_SOURCE_CHARS characters, emitting a warning through
 * `status` when anything is cut off.
 *
 * @param {string} content - Extracted source text.
 * @returns {{content: string, truncated: boolean, originalChars: number}}
 *   The possibly shortened text, whether it was shortened, and its length
 *   before shortening.
 */
function enforceCharLimit(content) {
  const originalChars = content.length;
  const truncated = originalChars > MAX_SOURCE_CHARS;
  if (truncated) {
    status(
      "!",
      warn(
        `Content truncated from ${originalChars.toLocaleString()} to ${MAX_SOURCE_CHARS.toLocaleString()} characters.`
      )
    );
  }
  return {
    content: truncated ? content.slice(0, MAX_SOURCE_CHARS) : content,
    truncated,
    originalChars
  };
}
933
- async function isLockStale(lockPath) {
934
- try {
935
- const content = await readFile5(lockPath, "utf-8");
936
- const pid = parseInt(content.trim(), 10);
937
- if (isNaN(pid)) return true;
938
- return !isProcessAlive(pid);
939
- } catch {
940
- return true;
727
/**
 * Validate that extraction produced something usable.
 * Throws when the trimmed content is empty; only warns (through `status`)
 * when it is shorter than MIN_SOURCE_CHARS.
 *
 * @param {string} content - Extracted source text.
 * @throws {Error} If nothing but whitespace was extracted.
 */
function enforceMinContent(content) {
  const usableChars = content.trim().length;
  if (usableChars === 0) {
    throw new Error("No readable content could be extracted from the source.");
  }
  if (usableChars < MIN_SOURCE_CHARS) {
    const notice = `Content seems very short (${usableChars} chars, minimum recommended is ${MIN_SOURCE_CHARS}).`;
    status("!", warn(notice));
  }
}
943
- async function releaseLock(root) {
944
- const lockPath = path8.join(root, LOCK_FILE);
945
- try {
946
- await unlink(lockPath);
947
- } catch {
743
/**
 * Classify a source as "web", "transcript", "pdf", "image" or "file".
 *
 * URLs are "transcript" when they are YouTube links and "web" otherwise.
 * Local paths are classified by lower-cased extension; a `.txt` file that is
 * not otherwise matched is sniffed to see whether it looks like a transcript.
 *
 * @param {string} source2 - Path or URL supplied by the user.
 * @returns {Promise<string>} One of the source-type labels above.
 */
async function detectSourceType(source2) {
  if (isUrl(source2)) {
    return isYoutubeUrl(source2) ? "transcript" : "web";
  }
  const extension = path7.extname(source2).toLowerCase();
  if (extension === ".pdf") return "pdf";
  if (IMAGE_EXTENSIONS.has(extension)) return "image";
  if (TRANSCRIPT_EXTENSIONS.has(extension)) return "transcript";
  if (extension !== ".txt") return "file";
  const isTranscript = await looksLikeTxtTranscript(source2);
  return isTranscript ? "transcript" : "file";
}
950
-
951
- // src/compiler/prompts.ts
952
- var PROVENANCE_STATE_VALUES = [
953
- "extracted",
954
- "merged",
955
- "inferred",
956
- "ambiguous"
957
- ];
958
- var CONCEPT_EXTRACTION_TOOL = {
959
- name: "extract_concepts",
960
- description: "Extract knowledge concepts from a source document",
961
- input_schema: {
962
- type: "object",
963
- properties: {
964
- concepts: {
965
- type: "array",
966
- items: {
967
- type: "object",
968
- properties: {
969
- concept: {
970
- type: "string",
971
- description: "Human-readable concept title"
972
- },
973
- summary: {
974
- type: "string",
975
- description: "One-line description"
976
- },
977
- is_new: {
978
- type: "boolean",
979
- description: "True if this is a new concept not in existing wiki"
980
- },
981
- tags: {
982
- type: "array",
983
- items: { type: "string" },
984
- description: "2-4 categorical tags for organizing this concept (e.g., 'machine-learning', 'optimization')"
985
- },
986
- confidence: {
987
- type: "number",
988
- description: "Confidence in this concept on a 0..1 scale (1 = directly stated, 0 = highly speculative)."
989
- },
990
- provenance_state: {
991
- type: "string",
992
- enum: PROVENANCE_STATE_VALUES,
993
- description: "How this concept was produced: 'extracted' (direct from source), 'merged' (synthesised across sources), 'inferred' (model deduction), or 'ambiguous' (sources disagree)."
994
- },
995
- contradicted_by: {
996
- type: "array",
997
- items: {
998
- type: "object",
999
- properties: {
1000
- slug: { type: "string", description: "Slug of the contradicting concept." },
1001
- reason: { type: "string", description: "Brief reason for the contradiction." }
1002
- },
1003
- required: ["slug"]
1004
- },
1005
- description: "Slugs of other concepts whose evidence contradicts this one."
1006
- },
1007
- inferred_paragraphs: {
1008
- type: "integer",
1009
- description: "Estimated number of paragraphs in the page that will be inferred rather than directly cited."
1010
- }
1011
- },
1012
- required: ["concept", "summary", "is_new"]
1013
- }
1014
- }
1015
- },
1016
- required: ["concepts"]
758
+ function buildDocument(title, source2, result, sourceType) {
759
+ const meta = {
760
+ title,
761
+ source: source2,
762
+ ingestedAt: (/* @__PURE__ */ new Date()).toISOString()
763
+ };
764
+ if (sourceType !== void 0) {
765
+ meta.sourceType = sourceType;
1017
766
  }
1018
- };
1019
- function buildExtractionPrompt(sourceContent, existingIndex) {
1020
- const indexSection = existingIndex ? `
1021
-
1022
- Here is the existing wiki index \u2014 avoid duplicating concepts already covered:
767
+ if (result.truncated) {
768
+ meta.truncated = true;
769
+ meta.originalChars = result.originalChars;
770
+ }
771
+ const frontmatter = buildFrontmatter(meta);
772
+ return `${frontmatter}
1023
773
 
1024
- ${existingIndex}` : "\n\nNo existing wiki pages yet.";
1025
- return [
1026
- "You are a knowledge extraction engine. Analyze the following source document",
1027
- "and identify 3-8 distinct, meaningful concepts worth documenting as wiki pages.",
1028
- "Each concept should be a standalone topic that someone might look up.",
1029
- "Focus on key ideas, techniques, patterns, or entities \u2014 not trivial details.",
1030
- "Use the extract_concepts tool to return your findings.",
1031
- "",
1032
- "For every concept, emit provenance metadata so downstream tools can reason",
1033
- "about reliability:",
1034
- " - confidence: 0..1 \u2014 how certain you are the source supports this concept.",
1035
- " - provenance_state: 'extracted' if directly stated, 'merged' if synthesised",
1036
- " from multiple parts of the source, 'inferred' if reasoned from context,",
1037
- " or 'ambiguous' if the source is contradictory or unclear.",
1038
- " - contradicted_by: slugs of other concepts (in this batch or the index)",
1039
- " whose evidence conflicts with this one.",
1040
- " - inferred_paragraphs: estimated number of paragraphs in the resulting",
1041
- " page that will be inferred rather than directly citable.",
1042
- indexSection,
1043
- "\n\n--- SOURCE DOCUMENT ---\n\n",
1044
- sourceContent
1045
- ].join("\n");
774
+ ${result.content}
775
+ `;
776
+ }
777
/**
 * Dispatch to the ingester that matches `sourceType`.
 *
 * @param {string} source2 - Path or URL to ingest.
 * @param {string} sourceType - "web", "pdf", "image", "transcript" or "file".
 * @returns {Promise<*>} Whatever the selected ingester resolves to.
 * @throws {Error} If `sourceType` is not one of the supported labels.
 *   Without this the function would resolve to `undefined` and the caller
 *   would fail later with an opaque destructuring TypeError.
 */
async function fetchContent(source2, sourceType) {
  switch (sourceType) {
    case "web":
      return ingestWeb(source2);
    case "pdf":
      return ingestPdf(source2);
    case "image":
      return ingestImage(source2);
    case "transcript":
      return ingestTranscript(source2);
    case "file":
      return ingestFile(source2);
    default:
      throw new Error(`Unsupported source type "${sourceType}" for source: ${source2}`);
  }
}
791
/**
 * Write an ingested document into SOURCES_DIR as `<slugified title>.md`,
 * creating the directory first if needed.
 *
 * @param {string} title - Document title; slugified to form the file name.
 * @param {string} document - Full markdown text to write.
 * @returns {Promise<string>} Path of the written file.
 */
async function saveSource(title, document) {
  const destPath = path7.join(SOURCES_DIR, `${slugify(title)}.md`);
  await mkdir2(SOURCES_DIR, { recursive: true });
  await writeFile2(destPath, document, "utf-8");
  return destPath;
}
798
/**
 * Run the ingest pipeline for one source: detect its type, fetch the content,
 * apply the length limits, render the document and save it.
 *
 * @param {string} source2 - Path or URL to ingest.
 * @returns {Promise<{filename: string, charCount: number, truncated: boolean,
 *   source: string, sourceType: string}>} Summary of what was saved.
 */
async function ingestSource(source2) {
  const sourceType = await detectSourceType(source2);
  status("*", info(`Ingesting [${sourceType}]: ${source2}`));

  const { title: docTitle, content: rawContent } = await fetchContent(source2, sourceType);
  const limited = enforceCharLimit(rawContent);
  enforceMinContent(limited.content);

  const rendered = buildDocument(docTitle, source2, limited, sourceType);
  const savedPath = await saveSource(docTitle, rendered);

  return {
    filename: path7.basename(savedPath),
    charCount: limited.content.length,
    truncated: limited.truncated,
    source: source2,
    sourceType
  };
}
814
/**
 * Ingest one source and print where it was saved plus the suggested next
 * command.
 *
 * @param {string} source2 - Path or URL to ingest.
 * @returns {Promise<void>}
 */
async function ingest(source2) {
  const { filename } = await ingestSource(source2);
  const savedPath = path7.join(SOURCES_DIR, filename);
  const savedNotice = success(`Saved ${bold(filename)} \u2192 ${source(savedPath)}`);
  status("+", savedNotice);
  status("\u2192", dim("Next: llmwiki compile"));
}
1047
- function buildPagePrompt(concept, sourceContent, existingPage, relatedPages) {
1048
- const existingSection = existingPage ? `
1049
823
 
1050
- Existing page to update:
824
+ // src/commands/compile.ts
825
+ import { existsSync as existsSync7 } from "fs";
1051
826
 
1052
- ${existingPage}` : "";
1053
- const relatedSection = relatedPages ? `
827
+ // src/compiler/index.ts
828
+ import { readFile as readFile14 } from "fs/promises";
829
+ import path21 from "path";
1054
830
 
1055
- Related wiki pages for cross-referencing:
831
+ // src/utils/state.ts
832
+ import { readFile as readFile7, writeFile as writeFile3, rename as rename2, mkdir as mkdir3, copyFile } from "fs/promises";
833
+ import { existsSync } from "fs";
834
+ import path8 from "path";
835
/**
 * Build a fresh, empty compiler state. A new object (with a new `sources`
 * map) is created on every call so callers can mutate it freely.
 *
 * @returns {{version: number, indexHash: string, sources: object}}
 */
function emptyState() {
  const blank = {
    version: 1,
    indexHash: "",
    sources: {}
  };
  return blank;
}
838
/**
 * Load the persisted compiler state for `root`.
 *
 * Returns an empty state when the state file does not exist. When the file
 * cannot be read, is not valid JSON, or parses to something that is not an
 * object with a `sources` object, the file is copied to `<file>.bak` and an
 * empty state is returned. The shape check matters because other helpers
 * index into `state.sources` and would otherwise crash on e.g. `null`.
 *
 * @param {string} root - Project root directory.
 * @returns {Promise<object>} The stored state, or a fresh empty state.
 * @throws If the backup copy of a corrupt file fails.
 */
async function readState(root) {
  const filePath = path8.join(root, STATE_FILE);
  if (!existsSync(filePath)) {
    return emptyState();
  }
  const isUsable = (value) =>
    value !== null &&
    typeof value === "object" &&
    !Array.isArray(value) &&
    value.sources !== null &&
    typeof value.sources === "object";
  try {
    const parsed = JSON.parse(await readFile7(filePath, "utf-8"));
    if (isUsable(parsed)) return parsed;
  } catch {
    // Unreadable or unparsable: handled below exactly like a wrong-shaped file.
  }
  const bakPath = filePath + ".bak";
  // Copy first so the warning only claims a backup that actually exists.
  await copyFile(filePath, bakPath);
  console.warn(`\u26A0 Corrupt state.json \u2014 backed up to ${bakPath}, starting fresh.`);
  return emptyState();
}
853
/**
 * Persist `state` as pretty-printed JSON. The data is written to a `.tmp`
 * sibling first and then renamed over the real file, so a reader never sees
 * a half-written state file.
 *
 * @param {string} root - Project root directory.
 * @param {object} state - State object to serialise.
 * @returns {Promise<void>}
 */
async function writeState(root, state) {
  await mkdir3(path8.join(root, LLMWIKI_DIR), { recursive: true });
  const target = path8.join(root, STATE_FILE);
  const staging = target + ".tmp";
  const serialized = JSON.stringify(state, null, 2);
  await writeFile3(staging, serialized, "utf-8");
  await rename2(staging, target);
}
861
/**
 * Read the state, set (or replace) the entry for one source file, and write
 * the state back.
 *
 * @param {string} root - Project root directory.
 * @param {string} sourceFile - Key under `state.sources`.
 * @param {object} entry - New value for that key.
 * @returns {Promise<void>}
 */
async function updateSourceState(root, sourceFile, entry) {
  const current = await readState(root);
  current.sources[sourceFile] = entry;
  return void (await writeState(root, current));
}
866
/**
 * Read the state, drop the entry for one source file (a no-op if it is not
 * present), and write the state back.
 *
 * @param {string} root - Project root directory.
 * @param {string} sourceFile - Key under `state.sources` to remove.
 * @returns {Promise<void>}
 */
async function removeSourceState(root, sourceFile) {
  const current = await readState(root);
  delete current.sources[sourceFile];
  return void (await writeState(root, current));
}
1056
871
 
1057
- ${relatedPages}` : "";
1058
- return [
1059
- `You are a wiki author. Write a clear, well-structured markdown page about "${concept}".`,
1060
- "Draw facts only from the provided source material.",
1061
- "Include a ## Sources section at the end listing the source document.",
1062
- "Suggest [[wikilinks]] to related concepts where appropriate.",
1063
- "Write in a neutral, informative tone. Be concise but thorough.",
1064
- "",
1065
- "Source attribution: at the end of each prose paragraph, append a citation",
1066
- "marker showing which source file(s) the paragraph drew from.",
1067
- "Format: ^[filename.md] for single-source, ^[source-a.md, source-b.md] for multi-source.",
1068
- "Place citations only at the end of prose paragraphs \u2014 not on headings, list items, or code blocks.",
1069
- "Source filenames are visible as `--- SOURCE: filename.md ---` headers in the content below.",
1070
- "",
1071
- "If a paragraph is your inference rather than a direct extraction, leave it",
872
+ // src/compiler/source-state.ts
873
+ import path10 from "path";
874
+
875
+ // src/compiler/hasher.ts
876
+ import { createHash } from "crypto";
877
+ import { readFile as readFile8, readdir } from "fs/promises";
878
+ import path9 from "path";
879
/**
 * Compute the SHA-256 hex digest of a file's contents, read as UTF-8 text.
 *
 * @param {string} filePath - File to hash.
 * @returns {Promise<string>} Lower-case hex digest.
 */
async function hashFile(filePath) {
  const text = await readFile8(filePath, "utf-8");
  const hasher = createHash("sha256");
  hasher.update(text);
  return hasher.digest("hex");
}
883
/**
 * Compare the files currently in the sources directory with the previous
 * state and report one change record per file.
 *
 * @param {string} root - Project root directory.
 * @param {object} prevState - Previously persisted state (uses `.sources`).
 * @returns {Promise<Array<{file: string, status: string}>>} Records for the
 *   current files (in listing order), followed by records for deleted files.
 */
async function detectChanges(root, prevState) {
  const currentFiles = await listSourceFiles(path9.join(root, SOURCES_DIR));
  const present = [];
  // Classified one at a time, in listing order.
  for (const file of currentFiles) {
    present.push({ file, status: await classifyFile(root, file, prevState) });
  }
  return [...present, ...findDeletedFiles(currentFiles, prevState)];
}
895
/**
 * List the `.md` entries directly inside `sourcesPath`.
 * Any failure to read the directory (for example, it does not exist) is
 * treated as "no sources" and yields an empty list.
 *
 * @param {string} sourcesPath - Directory to list.
 * @returns {Promise<string[]>} Entry names ending in ".md".
 */
async function listSourceFiles(sourcesPath) {
  let entries;
  try {
    entries = await readdir(sourcesPath);
  } catch {
    return [];
  }
  return entries.filter((name) => name.endsWith(".md"));
}
903
/**
 * Classify one source file against the previous state by content hash.
 *
 * @param {string} root - Project root directory.
 * @param {string} file - File name inside the sources directory.
 * @param {object} prevState - Previously persisted state (uses `.sources`).
 * @returns {Promise<string>} "new" when there is no previous entry, "changed"
 *   when the stored hash differs from the current one, otherwise "unchanged".
 */
async function classifyFile(root, file, prevState) {
  const currentHash = await hashFile(path9.join(root, SOURCES_DIR, file));
  const previous = prevState.sources[file];
  if (!previous) return "new";
  return previous.hash === currentHash ? "unchanged" : "changed";
}
911
/**
 * Find files recorded in the previous state that are no longer present.
 *
 * @param {string[]} currentFiles - File names that exist now.
 * @param {object} prevState - Previously persisted state (uses `.sources`).
 * @returns {Array<{file: string, status: string}>} One "deleted" record per
 *   missing file, in the key order of `prevState.sources`.
 */
function findDeletedFiles(currentFiles, prevState) {
  const present = new Set(currentFiles);
  const deleted = [];
  for (const file of Object.keys(prevState.sources)) {
    if (!present.has(file)) {
      deleted.push({ file, status: "deleted" });
    }
  }
  return deleted;
}
915
+
916
+ // src/compiler/source-state.ts
917
/**
 * Build state entries for every extraction that produced at least one
 * concept. All entries share a single `compiledAt` timestamp taken when the
 * function starts.
 *
 * @param {string} root - Project root directory.
 * @param {Array<{sourceFile: string, concepts: Array}>} extractions
 * @returns {Promise<object>} Map of source file name to its state entry.
 */
async function buildExtractionSourceStates(root, extractions) {
  const compiledAt = (/* @__PURE__ */ new Date()).toISOString();
  const snapshot = {};
  for (const extraction of extractions) {
    if (extraction.concepts.length > 0) {
      snapshot[extraction.sourceFile] = await buildEntry(root, extraction, compiledAt);
    }
  }
  return snapshot;
}
926
/**
 * Build the state entry for one extraction result: the source file's current
 * hash, the slugs of its concepts, and the given compile timestamp.
 *
 * @param {string} root - Project root directory.
 * @param {{sourceFile: string, concepts: Array<{concept: string}>}} result
 * @param {string} compiledAt - Timestamp to record.
 * @returns {Promise<{hash: string, concepts: string[], compiledAt: string}>}
 */
async function buildEntry(root, result, compiledAt) {
  const sourcePath = path10.join(root, SOURCES_DIR, result.sourceFile);
  const hash = await hashFile(sourcePath);
  const concepts = result.concepts.map(({ concept }) => slugify(concept));
  return { hash, concepts, compiledAt };
}
935
/**
 * Select the state entries that belong to the given source files.
 * Files with no (truthy) entry in `allStates` are skipped.
 *
 * @param {object} allStates - Map of source file name to state entry.
 * @param {Iterable<string>} sourceFiles - Names to pick.
 * @returns {object} New map containing only the picked entries.
 */
function pickStatesForSources(allStates, sourceFiles) {
  const picked = {};
  for (const file of sourceFiles) {
    const entry = allStates[file];
    if (!entry) continue;
    picked[file] = entry;
  }
  return picked;
}
943
+
944
+ // src/providers/openai.ts
945
+ import OpenAI from "openai";
946
/**
 * Read a timeout (in the unit the caller expects) from an environment
 * variable.
 *
 * @param {string} name - Environment variable name.
 * @returns {number|undefined} The value as a number when it is set to a
 *   finite number greater than zero; otherwise `undefined`.
 */
function readTimeoutEnv(name) {
  const raw = process.env[name]?.trim();
  if (!raw) return void 0;
  const value = Number(raw);
  if (!Number.isFinite(value) || value <= 0) return void 0;
  return value;
}
952
/**
 * Timeout override for OpenAI requests, taken from the
 * LLMWIKI_REQUEST_TIMEOUT_MS environment variable.
 *
 * @returns {number|undefined} The configured value, or `undefined` when the
 *   variable is unset or invalid.
 */
function resolveOpenAITimeoutMs() {
  const fromEnv = readTimeoutEnv("LLMWIKI_REQUEST_TIMEOUT_MS");
  return fromEnv;
}
955
/**
 * Convert a tool definition with `name`, `description` and `input_schema`
 * into the `{ type: "function", function: {...} }` shape passed to the
 * chat-completions `tools` option.
 *
 * @param {{name: string, description: string, input_schema: object}} tool
 * @returns {{type: string, function: {name: string, description: string, parameters: object}}}
 */
function translateToolToOpenAI(tool) {
  const { name, description, input_schema: parameters } = tool;
  return {
    type: "function",
    function: { name, description, parameters }
  };
}
965
/**
 * Provider that talks to an OpenAI-style chat-completions and embeddings API
 * through the `OpenAI` client class. Other providers in this file subclass it.
 */
var OpenAIProvider = class {
  client;
  embeddingsClient;
  model;
  configuredEmbeddingModel;
  /**
   * @param {string} model - Chat model name sent with every request.
   * @param {object} [options] - Optional `apiKey`, `baseURL`,
   *   `embeddingsBaseURL`, `embeddingModel` and `timeoutMs`.
   */
  constructor(model, options = {}) {
    const { apiKey, baseURL, embeddingsBaseURL, embeddingModel, timeoutMs } = options;
    this.model = model;
    this.configuredEmbeddingModel = embeddingModel;
    // Explicit key wins, then OPENAI_API_KEY, then an empty string.
    const resolvedKey = apiKey ?? process.env.OPENAI_API_KEY ?? "";
    const timeout = timeoutMs ?? resolveOpenAITimeoutMs() ?? OPENAI_DEFAULT_TIMEOUT_MS;
    this.client = new OpenAI({
      apiKey: resolvedKey,
      baseURL: baseURL ?? null,
      timeout
    });
    // A separate client is only created when embeddings live at another URL.
    if (embeddingsBaseURL) {
      this.embeddingsClient = new OpenAI({ apiKey: resolvedKey, baseURL: embeddingsBaseURL, timeout });
    } else {
      this.embeddingsClient = this.client;
    }
  }
  /** Send a single non-streaming completion request. */
  async complete(system, messages, maxTokens) {
    const request = {
      model: this.model,
      max_tokens: maxTokens,
      messages: [{ role: "system", content: system }, ...messages]
    };
    const response = await this.client.chat.completions.create(request);
    return response.choices[0]?.message?.content ?? "";
  }
  /** Stream a completion, invoking onToken for each text chunk. */
  async stream(system, messages, maxTokens, onToken) {
    const chunks = await this.client.chat.completions.create({
      model: this.model,
      max_tokens: maxTokens,
      messages: [{ role: "system", content: system }, ...messages],
      stream: true
    });
    const pieces = [];
    for await (const chunk of chunks) {
      const delta = chunk.choices[0]?.delta?.content;
      if (!delta) continue;
      pieces.push(delta);
      onToken?.(delta);
    }
    return pieces.join("");
  }
  /** Call the model with tool definitions and return the parsed tool input as JSON. */
  async toolCall(system, messages, tools, maxTokens) {
    const response = await this.client.chat.completions.create({
      model: this.model,
      max_tokens: maxTokens,
      messages: [{ role: "system", content: system }, ...messages],
      tools: tools.map(translateToolToOpenAI)
    });
    const message = response.choices[0]?.message;
    const calls = message?.tool_calls;
    if (calls && calls.length > 0) {
      // Only the first tool call's argument string is returned.
      return calls[0].function.arguments;
    }
    // No tool call: fall back to the plain text content.
    return message?.content ?? "";
  }
  /**
   * Produce a single embedding vector via the OpenAI embeddings API.
   * Subclasses (e.g. Ollama) override embeddingModel() to pick a different model.
   */
  async embed(text) {
    const { data } = await this.embeddingsClient.embeddings.create({
      model: this.embeddingModel(),
      input: text
    });
    const vector = data[0]?.embedding;
    if (Array.isArray(vector)) {
      return vector;
    }
    throw new Error("OpenAI embeddings response did not include a vector.");
  }
  /** Default embedding model for this provider. Subclasses may override. */
  embeddingModel() {
    return this.configuredEmbeddingModel ?? EMBEDDING_MODELS.openai;
  }
};
1044
+
1045
+ // src/providers/ollama.ts
1046
/**
 * Pick the request timeout for Ollama. Precedence: the explicit argument,
 * then OLLAMA_TIMEOUT_MS, then LLMWIKI_REQUEST_TIMEOUT_MS, then
 * OLLAMA_DEFAULT_TIMEOUT_MS. Environment variables are only consulted when
 * the earlier candidates are null/undefined.
 *
 * @param {number} [explicit] - Caller-supplied timeout.
 * @returns {number}
 */
function resolveOllamaTimeoutMs(explicit) {
  if (explicit !== null && explicit !== void 0) return explicit;
  const ollamaSpecific = readTimeoutEnv("OLLAMA_TIMEOUT_MS");
  if (ollamaSpecific !== null && ollamaSpecific !== void 0) return ollamaSpecific;
  return readTimeoutEnv("LLMWIKI_REQUEST_TIMEOUT_MS") ?? OLLAMA_DEFAULT_TIMEOUT_MS;
}
1049
/**
 * OpenAIProvider configured for an Ollama endpoint: the API key is the fixed
 * string "ollama" and the timeout is resolved by resolveOllamaTimeoutMs().
 */
var OllamaProvider = class extends OpenAIProvider {
  /**
   * @param {string} model - Chat model name.
   * @param {object} options - `baseURL`, plus optional `embeddingsBaseURL`,
   *   `embeddingModel` and `timeoutMs`.
   */
  constructor(model, options) {
    const { baseURL, embeddingsBaseURL, embeddingModel, timeoutMs } = options;
    super(model, {
      baseURL,
      apiKey: "ollama",
      embeddingsBaseURL,
      embeddingModel,
      timeoutMs: resolveOllamaTimeoutMs(timeoutMs)
    });
  }
  /** Ollama ships a dedicated embedding model (nomic-embed-text). */
  embeddingModel() {
    return this.configuredEmbeddingModel ?? EMBEDDING_MODELS.ollama;
  }
};
1064
+
1065
+ // src/providers/minimax.ts
1066
// Base URL passed to the OpenAI-style client for MiniMax.
var MINIMAX_BASE_URL = "https://api.minimax.io/v1";

/**
 * OpenAIProvider pointed at the MiniMax endpoint. Everything except the base
 * URL and API key is inherited unchanged.
 */
var MiniMaxProvider = class extends OpenAIProvider {
  /**
   * @param {string} model - Chat model name.
   * @param {string} apiKey - MiniMax API key.
   */
  constructor(model, apiKey) {
    const options = { baseURL: MINIMAX_BASE_URL, apiKey };
    super(model, options);
  }
};
1072
+
1073
+ // src/utils/provider.ts
1074
// Provider names accepted in LLMWIKI_PROVIDER; the order is the order shown
// in the "Unknown provider" error message.
var SUPPORTED_PROVIDERS = /* @__PURE__ */ new Set([
  "anthropic",
  "openai",
  "ollama",
  "minimax"
]);
1075
/**
 * Create the provider selected by getProviderName().
 *
 * OpenAI and Ollama providers are configured from optional environment
 * variables (base URLs and LLMWIKI_EMBEDDING_MODEL); Ollama falls back to
 * OLLAMA_DEFAULT_HOST when OLLAMA_HOST is not set. Anthropic and MiniMax are
 * delegated to their own factory functions.
 *
 * @returns {object} A provider instance.
 * @throws {Error} If the provider name has no branch here.
 */
function getProvider() {
  const providerName = getProviderName();
  if (providerName === "anthropic") {
    return getAnthropicProvider();
  }
  if (providerName === "openai") {
    return new OpenAIProvider(getModelForProvider("openai"), {
      baseURL: readOptionalEnv("OPENAI_BASE_URL"),
      embeddingsBaseURL: readOptionalEnv("OPENAI_EMBEDDINGS_BASE_URL"),
      embeddingModel: readOptionalEnv("LLMWIKI_EMBEDDING_MODEL")
    });
  }
  if (providerName === "ollama") {
    return new OllamaProvider(getModelForProvider("ollama"), {
      baseURL: readOptionalEnv("OLLAMA_HOST") ?? OLLAMA_DEFAULT_HOST,
      embeddingsBaseURL: readOptionalEnv("OLLAMA_EMBEDDINGS_HOST"),
      embeddingModel: readOptionalEnv("LLMWIKI_EMBEDDING_MODEL")
    });
  }
  if (providerName === "minimax") {
    return getMiniMaxProvider();
  }
  throw new Error(`Unhandled provider: ${providerName}`);
}
1098
/**
 * Read an environment variable, treating unset, empty and whitespace-only
 * values as "not provided".
 *
 * @param {string} name - Environment variable name.
 * @returns {string|undefined} The trimmed value, or `undefined`.
 */
function readOptionalEnv(name) {
  const trimmed = process.env[name]?.trim();
  if (!trimmed) return void 0;
  return trimmed;
}
1102
/**
 * Resolve the model name for a provider: LLMWIKI_MODEL when it is set to a
 * non-blank value, otherwise the provider's entry in PROVIDER_MODELS.
 *
 * A blank LLMWIKI_MODEL (empty or whitespace only, e.g. a bare `LLMWIKI_MODEL=`
 * line in an env file) is treated as unset so it cannot be sent to the API as
 * an empty model name. Surrounding whitespace is trimmed.
 *
 * @param {string} providerName - Key into PROVIDER_MODELS.
 * @returns {string} Model name to use.
 */
function getModelForProvider(providerName) {
  const override = process.env.LLMWIKI_MODEL?.trim();
  return override ? override : PROVIDER_MODELS[providerName];
}
1105
/**
 * Create a MiniMaxProvider using MINIMAX_API_KEY from the environment.
 *
 * @returns {object} A MiniMaxProvider instance.
 * @throws {Error} If MINIMAX_API_KEY is unset or empty.
 */
function getMiniMaxProvider() {
  const { MINIMAX_API_KEY: apiKey } = process.env;
  if (apiKey) {
    return new MiniMaxProvider(getModelForProvider("minimax"), apiKey);
  }
  throw new Error(
    "MiniMax provider requires MINIMAX_API_KEY environment variable.\n Set it with: export MINIMAX_API_KEY=your_key"
  );
}
1114
/**
 * Create an AnthropicProvider from environment-derived settings: the model
 * (falling back to PROVIDER_MODELS.anthropic), the base URL, and whatever
 * auth fields resolveAnthropicAuthFromEnv() returns.
 *
 * @returns {object} An AnthropicProvider instance.
 */
function getAnthropicProvider() {
  const model = resolveAnthropicModelFromEnv() ?? PROVIDER_MODELS.anthropic;
  const baseURL = resolveAnthropicBaseURLFromEnv();
  // Auth fields are spread last, so they sit alongside baseURL in one object.
  const providerOptions = { baseURL, ...resolveAnthropicAuthFromEnv() };
  return new AnthropicProvider(model, providerOptions);
}
1123
/**
 * Resolve which provider to use: LLMWIKI_PROVIDER when it is set to a
 * non-blank value, otherwise DEFAULT_PROVIDER.
 *
 * A blank LLMWIKI_PROVIDER (empty or whitespace only) is treated as unset so
 * it selects the default instead of failing with `Unknown provider ""`.
 * Surrounding whitespace is trimmed before validation.
 *
 * @returns {string} A name contained in SUPPORTED_PROVIDERS.
 * @throws {Error} If the resolved name is not a supported provider.
 */
function getProviderName() {
  const configured = process.env.LLMWIKI_PROVIDER?.trim();
  const providerName = configured ? configured : DEFAULT_PROVIDER;
  if (!SUPPORTED_PROVIDERS.has(providerName)) {
    throw new Error(
      `Unknown provider "${providerName}". Supported: ${[...SUPPORTED_PROVIDERS].join(", ")}`
    );
  }
  return providerName;
}
1132
/**
 * Name of the provider that would currently be used.
 * Thin wrapper around getProviderName(), so it throws on an unknown provider.
 *
 * @returns {string}
 */
function getActiveProviderName() {
  const active = getProviderName();
  return active;
}
1135
+
1136
+ // src/utils/llm.ts
1137
/**
 * Promise-based delay.
 *
 * @param {number} ms - Milliseconds passed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
1140
/**
 * Send one request to the active provider, retrying on any failure.
 *
 * Dispatch: `stream` takes priority, then a non-empty `tools` list, then a
 * plain completion. Up to RETRY_COUNT retries are made; the wait before
 * retry k (0-based) is RETRY_BASE_MS * RETRY_MULTIPLIER^k. After the last
 * attempt the original error is rethrown.
 *
 * @param {object} options - `system`, `messages`, optional `tools`,
 *   `maxTokens` (default 4096), `stream` (default false) and `onToken`.
 * @returns {Promise<string>} Whatever the provider method resolves to.
 */
async function callClaude(options) {
  const { system, messages, tools, maxTokens = 4096, stream = false, onToken } = options;
  const provider = getProvider();
  const send = () => {
    if (stream) {
      return provider.stream(system, messages, maxTokens, onToken);
    }
    if (tools && tools.length > 0) {
      return provider.toolCall(system, messages, tools, maxTokens);
    }
    return provider.complete(system, messages, maxTokens);
  };
  let attempt = 0;
  while (attempt <= RETRY_COUNT) {
    try {
      return await send();
    } catch (error2) {
      if (attempt === RETRY_COUNT) throw error2;
      const delayMs = RETRY_BASE_MS * RETRY_MULTIPLIER ** attempt;
      const errMsg = error2 instanceof Error ? error2.message : String(error2);
      console.warn(`\u26A0 API call failed (attempt ${attempt + 1}/${RETRY_COUNT + 1}): ${errMsg}`);
      console.warn(` Retrying in ${delayMs / 1e3}s...`);
      await sleep(delayMs);
    }
    attempt += 1;
  }
  throw new Error("Unreachable");
}
1163
+
1164
+ // src/utils/lock.ts
1165
+ import { open, readFile as readFile9, unlink, mkdir as mkdir4 } from "fs/promises";
1166
+ import path11 from "path";
1167
// Suffix appended to the lock path to name the short-lived marker file that
// guards reclaiming of a stale lock.
var RECLAIM_SUFFIX = ".reclaim";

// Number of create-or-reclaim rounds acquireLock() makes before giving up.
var MAX_ACQUIRE_ATTEMPTS = 2;
1169
/**
 * Check whether a process with the given pid currently exists.
 *
 * Uses `process.kill(pid, 0)`, which performs the existence/permission check
 * without delivering a signal. An EPERM failure means the process exists but
 * belongs to another user, so it is reported as alive; treating it as dead
 * would let a lock held by a live process be reclaimed. Every other failure
 * (e.g. ESRCH, or an invalid pid argument) is reported as not alive.
 *
 * @param {number} pid - Process id to probe.
 * @returns {boolean} True if the process exists.
 */
function isProcessAlive(pid) {
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    return err instanceof Error && "code" in err && err.code === "EPERM";
  }
}
1177
/**
 * Try to take the compilation lock for `root`.
 *
 * Each round first tries to create the lock file. If it already exists and
 * its owner looks alive, the function gives up immediately; if it looks
 * stale, a reclaim is attempted. At most MAX_ACQUIRE_ATTEMPTS rounds are
 * made. Failures are reported through `status`.
 *
 * @param {string} root - Project root directory.
 * @returns {Promise<boolean>} True when this process now holds the lock.
 */
async function acquireLock(root) {
  const lockPath = path11.join(root, LOCK_FILE);
  await mkdir4(path11.join(root, LLMWIKI_DIR), { recursive: true });
  for (let remaining = MAX_ACQUIRE_ATTEMPTS; remaining > 0; remaining--) {
    if (await tryCreateLock(lockPath)) return true;
    if (!(await isLockStale(lockPath))) {
      status("!", warn("Another compilation is running."));
      return false;
    }
    if (await reclaimStaleLock(root, lockPath)) return true;
  }
  status("!", warn("Could not acquire lock after retrying."));
  return false;
}
1194
/**
 * Replace a stale lock file with one owned by this process.
 *
 * A `.reclaim` marker is taken first so that only one process performs the
 * reclaim. Staleness is re-checked while holding the marker, the old lock is
 * removed (errors ignored), and a fresh lock is created. The marker is always
 * removed afterwards, whatever the outcome.
 *
 * @param {string} root - Project root directory (not used in the body).
 * @param {string} lockPath - Path of the lock file.
 * @returns {Promise<boolean>} True when the new lock was created.
 */
async function reclaimStaleLock(root, lockPath) {
  const reclaimPath = lockPath + RECLAIM_SUFFIX;
  if (!(await acquireReclaimLock(reclaimPath))) return false;
  try {
    const stillStale = await isLockStale(lockPath);
    if (!stillStale) return false;
    // Best effort: the stale file may already be gone.
    await unlink(lockPath).catch(() => {});
    const acquired = await tryCreateLock(lockPath);
    if (acquired) {
      status("i", dim("Reclaimed stale lock from dead process."));
    }
    return acquired;
  } finally {
    // Best effort: never let marker cleanup mask the result.
    await unlink(reclaimPath).catch(() => {});
  }
}
1218
/**
 * Try to take the `.reclaim` marker.
 *
 * When the marker already exists and looks stale it is deleted (errors
 * ignored), but the function still returns false for this call; the marker
 * can then be taken on a later attempt.
 *
 * @param {string} reclaimPath - Path of the reclaim marker file.
 * @returns {Promise<boolean>} True only when the marker was created now.
 */
async function acquireReclaimLock(reclaimPath) {
  const created = await tryCreateLock(reclaimPath);
  if (created) return true;
  const abandoned = await isLockStale(reclaimPath);
  if (abandoned) {
    await unlink(reclaimPath).catch(() => {});
  }
  return false;
}
1227
/**
 * Atomically create a lock file containing this process's pid.
 *
 * The file is opened with the "wx" flag, so creation fails if it already
 * exists. The handle is closed in a `finally` so it is not leaked when
 * writing the pid fails.
 *
 * @param {string} lockPath - Path of the lock file to create.
 * @returns {Promise<boolean>} True if the file was created, false if it
 *   already existed.
 * @throws Any error other than EEXIST from opening, or any error from
 *   writing/closing the file.
 */
async function tryCreateLock(lockPath) {
  let fd;
  try {
    fd = await open(lockPath, "wx");
  } catch (err) {
    if (err instanceof Error && "code" in err && err.code === "EEXIST") {
      return false;
    }
    throw err;
  }
  try {
    await fd.writeFile(String(process.pid), "utf-8");
  } finally {
    await fd.close();
  }
  return true;
}
1240
/**
 * Decide whether a lock file can be considered abandoned.
 *
 * The lock is stale when the file cannot be read, when its content is not a
 * plain positive decimal pid, or when no process with that pid exists.
 * Requiring a strictly positive, digits-only pid matters: `parseInt` would
 * accept trailing garbage, and pid 0 or a negative pid would make the
 * liveness probe address a process group instead of a single process, so a
 * corrupt lock could be seen as permanently "alive".
 *
 * @param {string} lockPath - Path of the lock file.
 * @returns {Promise<boolean>} True if the lock may be reclaimed.
 */
async function isLockStale(lockPath) {
  let content;
  try {
    content = await readFile9(lockPath, "utf-8");
  } catch {
    // Missing or unreadable lock file: nothing is holding it.
    return true;
  }
  const text = content.trim();
  if (!/^\d+$/.test(text)) return true;
  const pid = Number(text);
  if (!Number.isSafeInteger(pid) || pid <= 0) return true;
  return !isProcessAlive(pid);
}
1250
/**
 * Remove the compilation lock file for `root`.
 * Deletion is best effort: a failure to unlink (for instance because the file
 * is already gone) is ignored.
 *
 * @param {string} root - Project root directory.
 * @returns {Promise<void>}
 */
async function releaseLock(root) {
  const lockPath = path11.join(root, LOCK_FILE);
  await unlink(lockPath).catch(() => {});
}
1257
+
1258
+ // src/compiler/prompts.ts
1259
// Allowed values for a concept's `provenance_state`.
var PROVENANCE_STATE_VALUES = ["extracted", "merged", "inferred", "ambiguous"];

// Tool definition handed to the model for concept extraction. The model is
// expected to answer with `{ concepts: [...] }`; only `concept`, `summary`
// and `is_new` are required per concept, the provenance fields are optional.
var CONCEPT_EXTRACTION_TOOL = {
  name: "extract_concepts",
  description: "Extract knowledge concepts from a source document",
  input_schema: {
    type: "object",
    properties: {
      concepts: {
        type: "array",
        items: {
          type: "object",
          properties: {
            // Core fields.
            concept: { type: "string", description: "Human-readable concept title" },
            summary: { type: "string", description: "One-line description" },
            is_new: { type: "boolean", description: "True if this is a new concept not in existing wiki" },
            tags: {
              type: "array",
              items: { type: "string" },
              description: "2-4 categorical tags for organizing this concept (e.g., 'machine-learning', 'optimization')"
            },
            // Provenance metadata.
            confidence: {
              type: "number",
              description: "Confidence in this concept on a 0..1 scale (1 = directly stated, 0 = highly speculative)."
            },
            provenance_state: {
              type: "string",
              enum: PROVENANCE_STATE_VALUES,
              description: "How this concept was produced: 'extracted' (direct from source), 'merged' (synthesised across sources), 'inferred' (model deduction), or 'ambiguous' (sources disagree)."
            },
            contradicted_by: {
              type: "array",
              items: {
                type: "object",
                properties: {
                  slug: { type: "string", description: "Slug of the contradicting concept." },
                  reason: { type: "string", description: "Brief reason for the contradiction." }
                },
                required: ["slug"]
              },
              description: "Slugs of other concepts whose evidence contradicts this one."
            },
            inferred_paragraphs: {
              type: "integer",
              description: "Estimated number of paragraphs in the page that will be inferred rather than directly cited."
            }
          },
          required: ["concept", "summary", "is_new"]
        }
      }
    },
    required: ["concepts"]
  }
};
1326
/**
 * Build the prompt that asks the model to extract concepts from a source.
 *
 * The prompt consists of fixed instructions, then either the existing wiki
 * index or a note that no pages exist yet, then a source-document separator
 * and the source text. All parts are joined with newlines.
 *
 * @param {string} sourceContent - Text of the source document.
 * @param {string} [existingIndex] - Existing wiki index; any falsy value
 *   selects the "no pages yet" note.
 * @returns {string} The assembled prompt.
 */
function buildExtractionPrompt(sourceContent, existingIndex) {
  let indexSection = "\n\nNo existing wiki pages yet.";
  if (existingIndex) {
    indexSection = `\n\nHere is the existing wiki index \u2014 avoid duplicating concepts already covered:\n\n${existingIndex}`;
  }
  const instructions = [
    "You are a knowledge extraction engine. Analyze the following source document",
    "and identify 3-8 distinct, meaningful concepts worth documenting as wiki pages.",
    "Each concept should be a standalone topic that someone might look up.",
    "Focus on key ideas, techniques, patterns, or entities \u2014 not trivial details.",
    "Use the extract_concepts tool to return your findings.",
    "",
    "For every concept, emit provenance metadata so downstream tools can reason",
    "about reliability:",
    " - confidence: 0..1 \u2014 how certain you are the source supports this concept.",
    " - provenance_state: 'extracted' if directly stated, 'merged' if synthesised",
    " from multiple parts of the source, 'inferred' if reasoned from context,",
    " or 'ambiguous' if the source is contradictory or unclear.",
    " - contradicted_by: slugs of other concepts (in this batch or the index)",
    " whose evidence conflicts with this one.",
    " - inferred_paragraphs: estimated number of paragraphs in the resulting",
    " page that will be inferred rather than directly citable."
  ];
  const parts = [...instructions, indexSection, "\n\n--- SOURCE DOCUMENT ---\n\n", sourceContent];
  return parts.join("\n");
}
1354
+ function buildPagePrompt(concept, sourceContent, existingPage, relatedPages) {
1355
+ const existingSection = existingPage ? `
1356
+
1357
+ Existing page to update:
1358
+
1359
+ ${existingPage}` : "";
1360
+ const relatedSection = relatedPages ? `
1361
+
1362
+ Related wiki pages for cross-referencing:
1363
+
1364
+ ${relatedPages}` : "";
1365
+ return [
1366
+ `You are a wiki author. Write a clear, well-structured markdown page about "${concept}".`,
1367
+ "Draw facts only from the provided source material.",
1368
+ "Include a ## Sources section at the end listing the source document.",
1369
+ "Suggest [[wikilinks]] to related concepts where appropriate.",
1370
+ "Write in a neutral, informative tone. Be concise but thorough.",
1371
+ "",
1372
+ "Source attribution: at the end of each prose paragraph, append a citation",
1373
+ "marker showing which source file(s) the paragraph drew from.",
1374
+ "Format: ^[filename.md] for single-source, ^[source-a.md, source-b.md] for multi-source.",
1375
+ "When a single sentence makes a specific factual claim and you can identify the",
1376
+ "exact line range it came from, you may use the claim-level form",
1377
+ "^[filename.md:START-END] (or ^[filename.md#LSTART-LEND]) at the end of that",
1378
+ "sentence \u2014 START and END are 1-indexed line numbers in the source file.",
1379
+ "Paragraph-level citations remain the default; only switch to claim-level form",
1380
+ "when it materially improves verifiability and the line range is unambiguous.",
1381
+ "Place citations only at the end of prose paragraphs or sentences \u2014 not on",
1382
+ "headings, list items, or code blocks.",
1383
+ "Source filenames are visible as `--- SOURCE: filename.md ---` headers in the content below.",
1384
+ "",
1385
+ "If a paragraph is your inference rather than a direct extraction, leave it",
1072
1386
  "uncited \u2014 downstream lint rules will count uncited paragraphs as 'inferred'",
1073
1387
  "to compute the page's provenance metadata.",
1074
1388
  existingSection,
@@ -1077,774 +1391,1504 @@ ${relatedPages}` : "";
1077
1391
  sourceContent
1078
1392
  ].join("\n");
1079
1393
  }
1080
- function isValidRawConcept(c) {
1081
- return typeof c.concept === "string" && typeof c.summary === "string" && typeof c.is_new === "boolean" && (c.tags === void 0 || Array.isArray(c.tags));
1394
/**
 * Checks that a raw concept entry has the minimum required shape:
 * string `concept` and `summary`, boolean `is_new`, and `tags` either
 * absent or an array.
 *
 * @param {unknown} c - Candidate entry (any JSON value).
 * @returns {boolean} True when the entry has the required fields.
 */
function isValidRawConcept(c) {
  // JSON arrays may contain null or primitives; reject them up front instead
  // of throwing on property access, so a single malformed entry is filtered
  // out rather than raising a TypeError.
  if (c === null || typeof c !== "object") return false;
  return typeof c.concept === "string" && typeof c.summary === "string" && typeof c.is_new === "boolean" && (c.tags === void 0 || Array.isArray(c.tags));
}
1397
/**
 * Normalizes a raw `contradicted_by` value into a list of `{ slug, reason? }`
 * references. Entries that are not objects, or whose `slug` is not a
 * non-blank string, are dropped; slugs are trimmed.
 *
 * @param {unknown} raw - Value to coerce.
 * @returns {Array<{slug: string, reason?: string}>|undefined} The cleaned
 *   references, or undefined when `raw` is not an array or nothing survives.
 */
function coerceContradictedBy(raw) {
  if (!Array.isArray(raw)) {
    return void 0;
  }
  const cleaned = raw
    .filter((item) => item && typeof item === "object")
    .filter((item) => typeof item.slug === "string" && item.slug.trim().length > 0)
    .map((item) => {
      const slug = item.slug.trim();
      // Only carry `reason` over when it is actually a string.
      return typeof item.reason === "string" ? { slug, reason: item.reason } : { slug };
    });
  return cleaned.length === 0 ? void 0 : cleaned;
}
1410
/**
 * Converts a validated raw concept (snake_case tool output) into the
 * camelCase concept record. Optional fields that fail their type checks are
 * set to undefined.
 *
 * @param {object} c - Raw concept entry.
 * @returns {object} Record with concept, summary, is_new, tags, confidence,
 *   provenanceState, contradictedBy and inferredParagraphs.
 */
function mapRawConcept(c) {
  let provenanceState;
  if (typeof c.provenance_state === "string" && PROVENANCE_STATE_VALUES.includes(c.provenance_state)) {
    provenanceState = c.provenance_state;
  }

  // Accept only non-negative integer paragraph counts.
  const rawCount = c.inferred_paragraphs;
  const countIsValid = typeof rawCount === "number" && Number.isInteger(rawCount) && rawCount >= 0;

  return {
    concept: c.concept,
    summary: c.summary,
    is_new: c.is_new,
    tags: Array.isArray(c.tags) ? c.tags : void 0,
    confidence: typeof c.confidence === "number" ? c.confidence : void 0,
    provenanceState,
    contradictedBy: coerceContradictedBy(c.contradicted_by),
    inferredParagraphs: countIsValid ? rawCount : void 0
  };
}
1423
/**
 * Builds the LLM prompt for a schema-declared seed page.
 *
 * @param {{kind: string, title: string, summary: string}} seed - Seed page.
 * @param {{minWikilinks: number, description: string}} rule - Rule for the
 *   seed's page kind.
 * @param {string} relatedPagesContent - Text of the related pages, appended
 *   after the RELATED PAGES divider.
 * @returns {string} Newline-joined prompt.
 */
function buildSeedPagePrompt(seed, rule, relatedPagesContent) {
  const { minWikilinks } = rule;
  let linkExpectation = "Use [[wikilinks]] when referencing other pages.";
  if (minWikilinks > 0) {
    linkExpectation = `Include at least ${minWikilinks} [[wikilinks]] to related pages.`;
  }

  const promptLines = [];
  promptLines.push(`You are a wiki author. Write a ${seed.kind} page titled "${seed.title}".`);
  promptLines.push(`Page-kind guidance: ${rule.description}`);
  promptLines.push(`Summary line for context: ${seed.summary}`);
  promptLines.push("Draw facts only from the related wiki pages provided below.");
  promptLines.push(linkExpectation);
  promptLines.push("Write in a neutral, informative tone. Be concise but thorough.");
  promptLines.push("\n\n--- RELATED PAGES ---\n\n");
  promptLines.push(relatedPagesContent);
  return promptLines.join("\n");
}
1437
/**
 * Parses the JSON tool output and returns the concepts that pass
 * isValidRawConcept, mapped through mapRawConcept.
 *
 * @param {string} toolOutput - JSON text expected to hold a `concepts` array.
 * @returns {Array<object>} Mapped concepts; an empty array when the text is
 *   not valid JSON or anything throws while processing it.
 */
function parseConcepts(toolOutput) {
  const accepted = [];
  try {
    const payload = JSON.parse(toolOutput);
    for (const candidate of payload.concepts ?? []) {
      if (isValidRawConcept(candidate)) {
        accepted.push(mapRawConcept(candidate));
      }
    }
  } catch {
    // Best-effort parse: any failure yields "no concepts".
    return [];
  }
  return accepted;
}
1446
+
1447
+ // src/schema/types.ts
1448
// The page kinds a wiki page may declare, in canonical order.
var PAGE_KINDS = ["concept", "entity", "comparison", "overview"];
1454
+
1455
+ // src/schema/defaults.ts
1456
// Default minimum number of [[wikilinks]] per page kind.
var DEFAULT_MIN_LINKS = { concept: 0, entity: 1, comparison: 2, overview: 3 };

// Default human-readable description per page kind.
var DEFAULT_DESCRIPTIONS = {
  concept: "A standalone idea, technique, or pattern worth documenting.",
  entity: "A specific thing \u2014 a person, product, organization, or named artifact.",
  comparison: "A side-by-side analysis weighing two or more concepts or entities.",
  overview: "A top-down map page that situates several concepts within a domain."
};
1468
/**
 * Builds a fresh rule table (minWikilinks + description) for every page
 * kind from DEFAULT_MIN_LINKS and DEFAULT_DESCRIPTIONS.
 *
 * @returns {object} Map of kind name to `{ minWikilinks, description }`.
 */
function buildDefaultKindRules() {
  const ruleFor = (kind) => ({
    minWikilinks: DEFAULT_MIN_LINKS[kind],
    description: DEFAULT_DESCRIPTIONS[kind]
  });
  return {
    concept: ruleFor("concept"),
    entity: ruleFor("entity"),
    comparison: ruleFor("comparison"),
    overview: ruleFor("overview")
  };
}
1482
/**
 * Builds the schema used when no schema file overrides it: version 1,
 * "concept" as the default kind, the default kind rules, no seed pages and
 * no source path.
 *
 * @returns {object} A new default schema object.
 */
function buildDefaultSchema() {
  const kinds = buildDefaultKindRules();
  const schema = {
    version: 1,
    defaultKind: "concept",
    kinds,
    seedPages: [],
    loadedFrom: null
  };
  return schema;
}
1491
+
1492
+ // src/schema/loader.ts
1493
+ import { existsSync as existsSync2 } from "fs";
1494
+ import { readFile as readFile10 } from "fs/promises";
1495
+ import path12 from "path";
1496
+ import yaml2 from "js-yaml";
1497
// Schema file locations, relative to the project root, probed in this order.
// The first entry is also the path used when initializing a new schema.
var SCHEMA_CANDIDATE_PATHS = [
  ".llmwiki/schema.json", ".llmwiki/schema.yaml", ".llmwiki/schema.yml",
  "wiki/.schema.yaml", "wiki/.schema.yml"
];
1504
/**
 * Returns the first schema candidate path that exists under `root`.
 *
 * @param {string} root - Project root directory.
 * @returns {string|null} Joined path of the first existing candidate, or
 *   null when none exists.
 */
function findSchemaPath(root) {
  const firstExisting = SCHEMA_CANDIDATE_PATHS
    .map((relative) => path12.join(root, relative))
    .find((absolute) => existsSync2(absolute));
  return firstExisting ?? null;
}
1511
/**
 * Parses schema file content: JSON when the path ends in ".json", otherwise
 * via the YAML loader.
 *
 * @param {string} filePath - Path of the schema file (selects the parser).
 * @param {string} content - Raw file text.
 * @returns {object} The parsed value when it is a truthy object, else `{}`.
 */
function parseSchemaFile(filePath, content) {
  let data;
  if (filePath.endsWith(".json")) {
    data = JSON.parse(content);
  } else {
    data = yaml2.load(content);
  }
  const isObjectLike = Boolean(data) && typeof data === "object";
  return isObjectLike ? data : {};
}
1517
/**
 * Tells whether a value is one of the known page kinds.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True for a string listed in PAGE_KINDS.
 */
function isPageKind(value) {
  if (typeof value !== "string") {
    return false;
  }
  return PAGE_KINDS.includes(value);
}
1520
/**
 * Merges a user-supplied rule override onto the default rule for one kind.
 *
 * @param {{minWikilinks: number, description: string}} defaults - Default rule.
 * @param {object|undefined} override - Override from the schema file, if any.
 * @returns {{minWikilinks: number, description: string}} `defaults` itself
 *   when there is no override, otherwise a new merged rule.
 */
function mergeKindRule(defaults, override) {
  if (!override) return defaults;
  // A link minimum only makes sense as a non-negative integer. NaN, Infinity,
  // negatives and fractions (all `typeof "number"`) fall back to the default
  // instead of flowing into prompts and link-count comparisons.
  const hasValidMin = Number.isInteger(override.minWikilinks) && override.minWikilinks >= 0;
  const minWikilinks = hasValidMin ? override.minWikilinks : defaults.minWikilinks;
  const description = typeof override.description === "string" ? override.description : defaults.description;
  return { minWikilinks, description };
}
1526
/**
 * Produces the effective per-kind rule table by applying overrides onto a
 * shallow copy of the defaults, one known page kind at a time.
 *
 * @param {object} defaults - Default rule table.
 * @param {object|undefined} overrides - Per-kind overrides, if any.
 * @returns {object} A new rule table; `defaults` is not modified.
 */
function mergeKinds(defaults, overrides) {
  const result = { ...defaults };
  if (overrides) {
    PAGE_KINDS.forEach((kind) => {
      result[kind] = mergeKindRule(defaults[kind], overrides[kind]);
    });
  }
  return result;
}
1534
/**
 * Validates and normalizes one `seedPages` entry from the schema file.
 *
 * @param {unknown} entry - Raw list item.
 * @returns {{title: string, kind: string, summary: string,
 *   relatedSlugs: (string[]|undefined)}|null} The normalized seed page, or
 *   null when the entry is not an object, has a blank/non-string title, or
 *   has an unknown kind.
 */
function normalizeSeedPage(entry) {
  // A list may contain null (e.g. an empty YAML item) or a scalar; treat
  // those as invalid entries rather than throwing on property access.
  if (entry === null || typeof entry !== "object") return null;
  if (typeof entry.title !== "string" || entry.title.trim() === "") return null;
  if (!isPageKind(entry.kind)) return null;
  const summary = typeof entry.summary === "string" ? entry.summary : "";
  // Keep only string slugs; leave undefined when no list was given.
  const relatedSlugs = Array.isArray(entry.relatedSlugs) ? entry.relatedSlugs.filter((slug) => typeof slug === "string") : void 0;
  return { title: entry.title, kind: entry.kind, summary, relatedSlugs };
}
1541
/**
 * Normalizes the schema's `seedPages` list, discarding invalid entries.
 *
 * @param {unknown} entries - Raw `seedPages` value.
 * @returns {Array<object>} Normalized seed pages; empty when `entries` is
 *   not an array.
 */
function normalizeSeedPages(entries) {
  if (!Array.isArray(entries)) {
    return [];
  }
  return entries.reduce((pages, raw) => {
    const page = normalizeSeedPage(raw);
    if (page !== null) {
      pages.push(page);
    }
    return pages;
  }, []);
}
1545
/**
 * Combines the default schema with the values parsed from a schema file.
 *
 * @param {object} defaults - Default schema.
 * @param {object} overrides - Parsed schema file content.
 * @param {string|null} loadedFrom - Path the overrides were read from.
 * @returns {object} The effective schema (always version 1).
 */
function applyOverrides(defaults, overrides, loadedFrom) {
  // Fall back to the default kind when the override is missing or unknown.
  let defaultKind = defaults.defaultKind;
  if (isPageKind(overrides.defaultKind)) {
    defaultKind = overrides.defaultKind;
  }
  const kinds = mergeKinds(defaults.kinds, overrides.kinds);
  const seedPages = normalizeSeedPages(overrides.seedPages);
  return { version: 1, defaultKind, kinds, seedPages, loadedFrom };
}
1555
/**
 * Loads the effective schema for a project: the defaults, overridden by the
 * first schema file found under `root` (if any).
 *
 * @param {string} root - Project root directory.
 * @returns {Promise<object>} The effective schema. Read and parse errors
 *   propagate to the caller.
 */
async function loadSchema(root) {
  const defaults = buildDefaultSchema();
  const schemaPath = findSchemaPath(root);
  if (schemaPath) {
    const text = await readFile10(schemaPath, "utf-8");
    return applyOverrides(defaults, parseSchemaFile(schemaPath, text), schemaPath);
  }
  return defaults;
}
1563
/**
 * Returns where a new schema file should be created: the first candidate
 * path, resolved under `root`.
 *
 * @param {string} root - Project root directory.
 * @returns {string} Joined path.
 */
function defaultSchemaInitPath(root) {
  const [preferred] = SCHEMA_CANDIDATE_PATHS;
  return path12.join(root, preferred);
}
1566
+
1567
+ // src/schema/helpers.ts
1568
+ import yaml3 from "js-yaml";
1569
+ var WIKILINK_PATTERN = /\[\[([^\]]+)\]\]/g;
1570
/**
 * Resolves the kind of a page from its raw (frontmatter) value.
 *
 * @param {unknown} rawKind - Declared kind, possibly missing or invalid.
 * @param {{defaultKind: string}} schema - Effective schema.
 * @returns {string} `rawKind` when it is a known page kind, otherwise the
 *   schema's default kind.
 */
function resolvePageKind(rawKind, schema) {
  const isKnownKind = typeof rawKind === "string" && PAGE_KINDS.includes(rawKind);
  return isKnownKind ? rawKind : schema.defaultKind;
}
1576
/**
 * Counts the `[[...]]` wikilinks in a page body.
 *
 * @param {string} body - Markdown body.
 * @returns {number} Number of WIKILINK_PATTERN matches (0 when none).
 */
function countWikilinks(body) {
  const found = body.match(WIKILINK_PATTERN) ?? [];
  return found.length;
}
1580
/**
 * Serializes a schema to YAML, keeping only version, defaultKind, kinds and
 * seedPages (other properties such as `loadedFrom` are left out).
 *
 * @param {object} schema - Schema to serialize.
 * @returns {string} YAML text with no line wrapping and double-quote style.
 */
function serializeSchemaToYaml(schema) {
  const { version, defaultKind, kinds, seedPages } = schema;
  const dumpOptions = { lineWidth: -1, quotingType: '"' };
  return yaml3.dump({ version, defaultKind, kinds, seedPages }, dumpOptions);
}
1589
+
1590
+ // src/compiler/deps.ts
1591
/**
 * Inverts the per-source concept lists into a concept-slug → source-files
 * index.
 *
 * @param {Object<string, {concepts: string[]}>} sources - State entries
 *   keyed by source file.
 * @returns {Map<string, string[]>} For each slug, the files listing it, in
 *   source iteration order.
 */
function buildConceptToSourcesMap(sources) {
  const slugToFiles = /* @__PURE__ */ new Map();
  Object.keys(sources).forEach((file) => {
    for (const slug of sources[file].concepts) {
      const files = slugToFiles.get(slug);
      if (!files) {
        slugToFiles.set(slug, [file]);
        continue;
      }
      files.push(file);
    }
  });
  return slugToFiles;
}
1605
/**
 * Collects the files of all changes whose status is one of `statuses`.
 *
 * @param {Array<{file: string, status: string}>} changes - Detected changes.
 * @param {...string} statuses - Statuses to keep.
 * @returns {Set<string>} Matching file names.
 */
function filesByStatus(changes, ...statuses) {
  const wanted = new Set(statuses);
  const files = /* @__PURE__ */ new Set();
  for (const change of changes) {
    if (wanted.has(change.status)) {
      files.add(change.file);
    }
  }
  return files;
}
1611
/**
 * For every concept of `sourceFile` that has at least two contributing
 * sources, adds each contributor not present in any exclusion set to `out`.
 *
 * @param {string} sourceFile - Source whose concepts are inspected.
 * @param {{sources: object}} state - Compiler state.
 * @param {Map<string, string[]>} conceptMap - Slug → contributing files.
 * @param {Array<Set<string>>} excludeSets - Files to leave out.
 * @param {Set<string>} out - Accumulator, mutated in place.
 * @returns {void}
 */
function collectSharedContributors(sourceFile, state, conceptMap, excludeSets, out) {
  const entry = state.sources[sourceFile];
  if (!entry) return;
  const isExcluded = (file) => excludeSets.some((set) => set.has(file));
  for (const slug of entry.concepts) {
    const owners = conceptMap.get(slug);
    const isShared = Boolean(owners) && owners.length >= 2;
    if (!isShared) continue;
    owners.forEach((owner) => {
      if (!isExcluded(owner)) {
        out.add(owner);
      }
    });
  }
}
1623
/**
 * Finds sources that were not directly changed but share a concept with a
 * new or changed source.
 *
 * @param {{sources: object}} state - Compiler state.
 * @param {Array<{file: string, status: string}>} directChanges - Changes.
 * @returns {string[]} Affected source files, excluding changed and deleted
 *   ones.
 */
function findAffectedSources(state, directChanges) {
  const changed = filesByStatus(directChanges, "new", "changed");
  const deleted = filesByStatus(directChanges, "deleted");
  const conceptMap = buildConceptToSourcesMap(state.sources);
  const affected = /* @__PURE__ */ new Set();
  const exclusions = [changed, deleted, affected];
  changed.forEach((file) => {
    collectSharedContributors(file, state, conceptMap, exclusions, affected);
  });
  return [...affected];
}
1639
/**
 * Computes the frozen concept slugs: those already recorded in the state,
 * plus every concept of a deleted source that has more than one
 * contributing source.
 *
 * @param {{sources: object, frozenSlugs?: string[]}} state - Compiler state.
 * @param {Array<{file: string, status: string}>} changes - Detected changes.
 * @returns {Set<string>} Frozen slugs.
 */
function findFrozenSlugs(state, changes) {
  const frozen = new Set(state.frozenSlugs ?? []);
  const deletedFiles = changes.filter((c) => c.status === "deleted").map((c) => c.file);
  const conceptMap = buildConceptToSourcesMap(state.sources);
  deletedFiles.forEach((file) => {
    const entry = state.sources[file];
    if (!entry) return;
    for (const slug of entry.concepts) {
      const ownerCount = conceptMap.get(slug)?.length ?? 0;
      if (ownerCount > 1) {
        frozen.add(slug);
      }
    }
  });
  return frozen;
}
1655
/**
 * Writes the still-frozen slugs back to the state file. A slug is released
 * only when it has at least one owning source, every owner produced
 * concepts in this run, and the slug itself was extracted in this run.
 *
 * @param {string} root - Project root.
 * @param {Iterable<string>} frozenSlugs - Slugs frozen going into this run.
 * @param {Array<{sourceFile: string, concepts: Array<{concept: string}>}>}
 *   successfulExtractions - Extraction results of this run.
 * @returns {Promise<void>}
 */
async function persistFrozenSlugs(root, frozenSlugs, successfulExtractions) {
  const currentState = await readState(root);
  const conceptMap = buildConceptToSourcesMap(currentState.sources);

  // Results without concepts count neither as compiled nor as extractors.
  const extractedSlugs = /* @__PURE__ */ new Set();
  const compiledFiles = /* @__PURE__ */ new Set();
  for (const { sourceFile, concepts } of successfulExtractions) {
    if (concepts.length === 0) continue;
    compiledFiles.add(sourceFile);
    for (const c of concepts) {
      extractedSlugs.add(slugify(c.concept));
    }
  }

  const stillFrozen = /* @__PURE__ */ new Set();
  for (const slug of frozenSlugs) {
    const owners = conceptMap.get(slug) ?? [];
    const released = owners.length > 0 && owners.every((file) => compiledFiles.has(file)) && extractedSlugs.has(slug);
    if (!released) {
      stillFrozen.add(slug);
    }
  }
  await writeState(root, { ...currentState, frozenSlugs: [...stillFrozen] });
}
1677
/**
 * Collects the slugs extracted in this run that the same source did not
 * list in the previous state.
 *
 * @param {Array<{sourceFile: string, concepts: Array<{concept: string}>}>}
 *   extractions - Extraction results.
 * @param {{sources: object}} state - Previous compiler state.
 * @returns {Set<string>} Newly appearing slugs.
 */
function collectFreshSlugs(extractions, state) {
  const fresh = /* @__PURE__ */ new Set();
  for (const { sourceFile, concepts } of extractions) {
    const known = new Set(state.sources[sourceFile]?.concepts ?? []);
    for (const c of concepts) {
      const slug = slugify(c.concept);
      if (known.has(slug)) continue;
      fresh.add(slug);
    }
  }
  return fresh;
}
1688
/**
 * Lists the distinct source files owning any of the given slugs, leaving
 * out files present in any exclusion set.
 *
 * @param {Iterable<string>} slugs - Concept slugs to look up.
 * @param {Map<string, string[]>} conceptMap - Slug → owning files.
 * @param {Array<Set<string>>} excludeSets - Files to leave out.
 * @returns {string[]} Owning files in first-seen order.
 */
function findSlugOwners(slugs, conceptMap, excludeSets) {
  const owners = /* @__PURE__ */ new Set();
  for (const slug of slugs) {
    const files = conceptMap.get(slug);
    if (!files) continue;
    files
      .filter((file) => !excludeSets.some((set) => set.has(file)))
      .forEach((file) => owners.add(file));
  }
  return [...owners];
}
1700
/**
 * After extraction, finds existing sources that own a slug newly produced
 * in this run, excluding sources already being compiled or deleted.
 *
 * @param {Array<object>} extractions - Extraction results of this run.
 * @param {{sources: object}} state - Previous compiler state.
 * @param {Array<{file: string, status: string}>} allChanges - All changes.
 * @returns {string[]} Additional source files affected.
 */
function findLateAffectedSources(extractions, state, allChanges) {
  const excluded = [
    filesByStatus(allChanges, "new", "changed"),
    filesByStatus(allChanges, "deleted")
  ];
  const conceptMap = buildConceptToSourcesMap(state.sources);
  return findSlugOwners(collectFreshSlugs(extractions, state), conceptMap, excluded);
}
1707
/**
 * Returns the concepts of `sourceFile` that more than one source
 * contributes to.
 *
 * @param {string} sourceFile - Source to inspect.
 * @param {{sources: object}} state - Compiler state.
 * @returns {Set<string>} Shared slugs; empty when the source is unknown.
 */
function findSharedConcepts(sourceFile, state) {
  const shared = /* @__PURE__ */ new Set();
  const entry = state.sources[sourceFile];
  if (entry) {
    const conceptMap = buildConceptToSourcesMap(state.sources);
    for (const slug of entry.concepts) {
      const ownerCount = conceptMap.get(slug)?.length ?? 0;
      if (ownerCount > 1) {
        shared.add(slug);
      }
    }
  }
  return shared;
}
1720
/**
 * Handles extraction results that yielded no concepts: warns, adds the
 * source's previously recorded concepts to `frozenSlugs`, and rewrites the
 * source's state entry with an empty hash while keeping those concepts.
 *
 * @param {string} root - Project root.
 * @param {Array<{sourceFile: string, concepts: Array}>} results - Results.
 * @param {Set<string>} frozenSlugs - Accumulator, mutated in place.
 * @returns {Promise<void>}
 */
async function freezeFailedExtractions(root, results, frozenSlugs) {
  for (const result of results) {
    const extractionFailed = result.concepts.length === 0;
    if (extractionFailed) {
      status("!", warn(`${result.sourceFile}: no concepts \u2014 will retry.`));
      // Re-read state each time; earlier iterations may have updated it.
      const { sources } = await readState(root);
      const previousConcepts = sources[result.sourceFile]?.concepts ?? [];
      for (const slug of previousConcepts) {
        frozenSlugs.add(slug);
      }
      await updateSourceState(root, result.sourceFile, {
        hash: "",
        concepts: previousConcepts,
        compiledAt: (/* @__PURE__ */ new Date()).toISOString()
      });
    }
  }
}
1734
+
1735
+ // src/compiler/orphan.ts
1736
+ import path13 from "path";
1737
/**
 * Processes a deleted source: orphans each of its concept pages that no
 * other source shares, keeps (and reports) the shared ones, then removes
 * the source from the state.
 *
 * @param {string} root - Project root.
 * @param {string} sourceFile - Deleted source file.
 * @param {{sources: object}} state - Compiler state.
 * @returns {Promise<void>}
 */
async function markOrphaned(root, sourceFile, state) {
  const entry = state.sources[sourceFile];
  if (!entry) return;
  const shared = findSharedConcepts(sourceFile, state);
  for (const slug of entry.concepts) {
    if (!shared.has(slug)) {
      await orphanPage(root, slug, "source deleted");
    } else {
      status("i", dim(`Kept: ${slug}.md (shared with other sources)`));
    }
  }
  await removeSourceState(root, sourceFile);
}
1750
/**
 * Orphans every frozen slug that no source in the current state lists any
 * more.
 *
 * @param {string} root - Project root.
 * @param {Iterable<string>} frozenSlugs - Frozen slugs to check.
 * @returns {Promise<void>}
 */
async function orphanUnownedFrozenPages(root, frozenSlugs) {
  const { sources } = await readState(root);
  const owned = /* @__PURE__ */ new Set();
  Object.values(sources).forEach((entry) => {
    for (const slug of entry.concepts) {
      owned.add(slug);
    }
  });
  for (const slug of frozenSlugs) {
    if (!owned.has(slug)) {
      await orphanPage(root, slug, "no remaining sources");
    }
  }
}
1761
/**
 * Marks a concept page as orphaned by adding `orphaned: true` to its
 * frontmatter, then logs a warning. Does nothing when the page cannot be
 * read (empty content) or is already flagged.
 *
 * @param {string} root - Project root.
 * @param {string} slug - Page slug (file name without `.md`).
 * @param {string} reason - Text shown in the log message.
 * @returns {Promise<void>}
 */
async function orphanPage(root, slug, reason) {
  const pagePath = path13.join(root, CONCEPTS_DIR, `${slug}.md`);
  const content = await safeReadFile(pagePath);
  if (!content) return;
  const { meta } = parseFrontmatter(content);
  if (meta.orphaned === true) return;
  // Only the opening fence at the very start of the file is frontmatter. An
  // unanchored search could hit a `---` horizontal rule in the body, and a
  // CRLF file would not match "---\n" at all. `$1` keeps the file's own
  // line-ending style.
  const frontmatterOpen = /^---(\r?\n)/;
  const updated = frontmatterOpen.test(content)
    ? content.replace(frontmatterOpen, "---$1orphaned: true$1")
    // No frontmatter yet: add a minimal block so the flag is really stored.
    // NOTE(review): assumes parseFrontmatter accepts a leading `---` block.
    : `---\norphaned: true\n---\n\n${content}`;
  await atomicWrite(pagePath, updated);
  status("\u26A0", warn(`Orphaned: ${slug}.md (${reason})`));
}
1771
+
1772
+ // src/compiler/resolver.ts
1773
+ import { readdir as readdir2, readFile as readFile11 } from "fs/promises";
1774
+ import path14 from "path";
1775
+ import { existsSync as existsSync3 } from "fs";
1776
/**
 * Scans the concepts directory and lists every non-orphaned markdown page
 * that has a string title.
 *
 * @param {string} root - Project root.
 * @returns {Promise<Array<{slug: string, title: string, filePath: string}>>}
 *   Index entries; empty when the concepts directory does not exist.
 */
async function buildTitleIndex(root) {
  const conceptsDir = path14.join(root, CONCEPTS_DIR);
  if (!existsSync3(conceptsDir)) return [];
  const entries = await readdir2(conceptsDir);
  const index = [];
  for (const name of entries.filter((entry) => entry.endsWith(".md"))) {
    const filePath = path14.join(conceptsDir, name);
    const { meta } = parseFrontmatter(await readFile11(filePath, "utf-8"));
    const isIndexable = Boolean(meta.title) && typeof meta.title === "string" && !meta.orphaned;
    if (!isIndexable) continue;
    index.push({ slug: name.replace(/\.md$/, ""), title: meta.title, filePath });
  }
  return index;
}
1796
/**
 * Tells whether `position` falls inside a `[[...]]` wikilink.
 *
 * @param {string} text - Text to inspect.
 * @param {number} position - Character offset to test.
 * @returns {boolean} True when the nearest `[[` at or before `position` is
 *   not closed by a `]]` before `position`, and a `]]` exists at or after it.
 */
function isInsideWikilink(text, position) {
  const openAt = text.lastIndexOf("[[", position);
  if (openAt === -1) return false;
  if (text.indexOf("]]", position) === -1) return false;
  // The first close after the opener must not come before the position.
  return text.indexOf("]]", openAt) >= position;
}
1803
/**
 * Tells whether `position` falls inside a `^[...]` citation marker.
 *
 * @param {string} text - Text to inspect.
 * @param {number} position - Character offset to test.
 * @returns {boolean} True when the nearest `^[` at or before `position` is
 *   not closed by a `]` before `position`, and a `]` exists at or after it.
 */
function isInsideCitation(text, position) {
  const openAt = text.lastIndexOf("^[", position);
  if (openAt === -1) return false;
  if (text.indexOf("]", position) === -1) return false;
  // The first close after the opener must not come before the position.
  return text.indexOf("]", openAt) >= position;
}
1810
/**
 * Tells whether the span [start, end) is delimited on both sides by the
 * text edge or by whitespace/punctuation from a fixed delimiter set.
 *
 * @param {string} text - Text containing the span.
 * @param {number} start - Offset of the first character of the span.
 * @param {number} end - Offset just past the span.
 * @returns {boolean} True when both sides are boundaries.
 */
function isWordBoundary(text, start, end) {
  const delimiter = /[\s,.:;!?()\[\]{}/"']/;
  const leftOk = start === 0 || delimiter.test(text[start - 1]);
  if (!leftOk) return false;
  return end >= text.length || delimiter.test(text[end]);
}
1815
/**
 * Finds every case-insensitive, literal occurrence of `title` in `text`.
 *
 * @param {string} text - Text to search.
 * @param {string} title - Title to look for (regex metacharacters are
 *   escaped, so it is matched literally).
 * @returns {Array<{start: number, end: number}>} Match spans in ascending
 *   order, `end` exclusive. Empty when `title` is empty or not a string.
 */
function findTitleMatches(text, title) {
  // An empty pattern matches at every offset without consuming input; with a
  // manual global `exec` loop that never advances `lastIndex` and spins
  // forever. There is nothing meaningful to link for an empty title anyway.
  if (typeof title !== "string" || title.length === 0) return [];
  const escaped = title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const regex = new RegExp(escaped, "gi");
  return Array.from(text.matchAll(regex), (match) => ({
    start: match.index,
    end: match.index + match[0].length
  }));
}
1825
/**
 * Decides whether the span [start, end) may be turned into a wikilink: it
 * must not start inside an existing wikilink or citation and must sit on
 * word boundaries.
 *
 * @param {string} text - Text containing the span.
 * @param {number} start - Span start offset.
 * @param {number} end - Span end offset (exclusive).
 * @returns {boolean} True when the span can be linked.
 */
function isLinkablePosition(text, start, end) {
  const insideMarkup = isInsideWikilink(text, start) || isInsideCitation(text, start);
  return insideMarkup ? false : isWordBoundary(text, start, end);
}
1830
/**
 * Replaces linkable mentions of other pages' titles in `body` with
 * `[[slug|title]]` wikilinks. The page's own title is never linked.
 *
 * @param {string} body - Markdown body to process.
 * @param {Array<{slug: string, title: string}>} titles - Candidate pages.
 * @param {string} selfTitle - Title of the page being processed.
 * @returns {string} The body with links inserted (unchanged if none apply).
 */
function addWikilinks(body, titles, selfTitle) {
  const ownTitle = selfTitle.toLowerCase();
  let text = body;
  for (const page of titles) {
    if (page.title.toLowerCase() === ownTitle) continue;
    const hits = findTitleMatches(text, page.title);
    // Walk from the last match to the first so earlier offsets stay valid
    // while the text grows.
    for (let i = hits.length - 1; i >= 0; i--) {
      const { start, end } = hits[i];
      if (isLinkablePosition(text, start, end)) {
        text = text.slice(0, start) + `[[${page.slug}|${page.title}]]` + text.slice(end);
      }
    }
  }
  return text;
}
1843
/**
 * Runs wikilink resolution: outbound links for changed pages, then inbound
 * links to new pages, and logs the number of pages touched.
 *
 * @param {string} root - Project root.
 * @param {string[]} changedSlugs - Slugs of pages changed in this run.
 * @param {string[]} newSlugs - Slugs of pages created in this run.
 * @returns {Promise<number>} Number of page updates performed (0 when the
 *   title index is empty).
 */
async function resolveLinks(root, changedSlugs, newSlugs) {
  const titleIndex = await buildTitleIndex(root);
  if (titleIndex.length === 0) return 0;
  const outbound = await resolveOutboundLinks(titleIndex, changedSlugs);
  const inbound = await resolveInboundLinks(titleIndex, newSlugs);
  const linkCount = 0 + outbound + inbound;
  if (linkCount > 0) {
    status("\u{1F517}", dim(`Resolved links in ${linkCount} page(s)`));
  }
  return linkCount;
}
1854
/**
 * Adds wikilinks inside each changed page that is present in the title
 * index.
 *
 * @param {Array<{slug: string, title: string, filePath: string}>} titleIndex
 *   - All indexable pages.
 * @param {string[]} changedSlugs - Slugs of pages changed in this run.
 * @returns {Promise<number>} Number of pages that were rewritten.
 */
async function resolveOutboundLinks(titleIndex, changedSlugs) {
  let rewritten = 0;
  const targets = titleIndex.filter((page) => changedSlugs.includes(page.slug));
  for (const page of targets) {
    if (await linkPage(page, titleIndex)) {
      rewritten += 1;
    }
  }
  return rewritten;
}
1083
- function coerceContradictedBy(raw) {
1084
- if (!Array.isArray(raw)) return void 0;
1085
- const refs = [];
1086
- for (const entry of raw) {
1087
- if (!entry || typeof entry !== "object") continue;
1088
- const obj = entry;
1089
- if (typeof obj.slug !== "string" || obj.slug.trim().length === 0) continue;
1090
- const ref = { slug: obj.slug.trim() };
1091
- if (typeof obj.reason === "string") ref.reason = obj.reason;
1092
- refs.push(ref);
1863
/**
 * Adds wikilinks pointing at newly created pages into every other indexed
 * page that mentions their titles.
 *
 * @param {Array<{slug: string, title: string, filePath: string}>} titleIndex
 *   - All indexable pages.
 * @param {string[]} newSlugs - Slugs of pages created in this run.
 * @returns {Promise<number>} Number of pages that were rewritten.
 */
async function resolveInboundLinks(titleIndex, newSlugs) {
  if (newSlugs.length === 0) return 0;
  const newTitles = titleIndex.filter((p) => newSlugs.includes(p.slug));
  if (newTitles.length === 0) return 0;
  let count = 0;
  for (const page of titleIndex) {
    if (newSlugs.includes(page.slug)) continue;
    const content = await readFile11(page.filePath, "utf-8");
    const { body } = parseFrontmatter(content);
    const linked = addWikilinks(body, newTitles, page.title);
    if (linked !== body) {
      // Use a replacer function: a plain replacement string would have `$$`,
      // `$&`, `$'` etc. interpreted as substitution patterns, corrupting
      // pages whose text contains `$` (for example LaTeX math).
      const newContent = content.replace(body, () => linked);
      await atomicWrite(page.filePath, newContent);
      count++;
    }
  }
  return count;
}
1096
- function mapRawConcept(c) {
1097
- const provenance = typeof c.provenance_state === "string" && PROVENANCE_STATE_VALUES.includes(c.provenance_state) ? c.provenance_state : void 0;
1098
- return {
1099
- concept: c.concept,
1100
- summary: c.summary,
1101
- is_new: c.is_new,
1102
- tags: Array.isArray(c.tags) ? c.tags : void 0,
1103
- confidence: typeof c.confidence === "number" ? c.confidence : void 0,
1104
- provenanceState: provenance,
1105
- contradictedBy: coerceContradictedBy(c.contradicted_by),
1106
- inferredParagraphs: typeof c.inferred_paragraphs === "number" && Number.isInteger(c.inferred_paragraphs) && c.inferred_paragraphs >= 0 ? c.inferred_paragraphs : void 0
1107
- };
1881
/**
 * Adds wikilinks to one page's body and writes the file back when anything
 * changed.
 *
 * @param {{title: string, filePath: string}} page - Page to process.
 * @param {Array<{slug: string, title: string}>} titleIndex - Link targets.
 * @returns {Promise<boolean>} True when the file was rewritten.
 */
async function linkPage(page, titleIndex) {
  const content = await readFile11(page.filePath, "utf-8");
  const { body } = parseFrontmatter(content);
  const linked = addWikilinks(body, titleIndex, page.title);
  if (linked === body) return false;
  // Use a replacer function: a plain replacement string would have `$$`,
  // `$&`, `$'` etc. interpreted as substitution patterns, corrupting pages
  // whose text contains `$` (for example LaTeX math).
  const newContent = content.replace(body, () => linked);
  await atomicWrite(page.filePath, newContent);
  return true;
}
1109
- function parseConcepts(toolOutput) {
1890
+
1891
+ // src/compiler/indexgen.ts
1892
+ import { readdir as readdir3 } from "fs/promises";
1893
+ import path15 from "path";
1894
/**
 * Regenerates the wiki index file from the concept and query pages, each
 * sorted by title, and logs progress.
 *
 * @param {string} root - Project root.
 * @returns {Promise<void>}
 */
async function generateIndex(root) {
  status("*", info("Generating index..."));
  const byTitle = (a, b) => a.title.localeCompare(b.title);
  const concepts = await collectPageSummaries(path15.join(root, CONCEPTS_DIR));
  const queries = await collectPageSummaries(path15.join(root, QUERIES_DIR));
  concepts.sort(byTitle);
  queries.sort(byTitle);
  await atomicWrite(path15.join(root, INDEX_FILE), buildIndexContent(concepts, queries));
  status("+", success(`Index updated with ${concepts.length + queries.length} pages.`));
}
1908
/**
 * Reads every `.md` file in a directory and returns its slug together with
 * its parsed frontmatter.
 *
 * @param {string} dirPath - Directory to scan.
 * @returns {Promise<Array<{slug: string, meta: object}>>} One entry per
 *   markdown file; empty when the directory cannot be listed.
 */
async function scanWikiPages(dirPath) {
  let entries;
  try {
    entries = await readdir3(dirPath);
  } catch {
    // Unreadable or missing directory: treated as "no pages".
    return [];
  }
  const pages = [];
  for (const name of entries) {
    if (!name.endsWith(".md")) continue;
    const raw = await safeReadFile(path15.join(dirPath, name));
    const { meta } = parseFrontmatter(raw);
    pages.push({ slug: name.replace(/\.md$/, ""), meta });
  }
  return pages;
}
1134
- function filesByStatus(changes, ...statuses) {
1135
- const statusSet = new Set(statuses);
1136
- return new Set(
1137
- changes.filter((c) => statusSet.has(c.status)).map((c) => c.file)
1138
- );
1923
/**
 * Lists title, slug and summary of every non-orphaned page in a directory
 * that has a string title.
 *
 * @param {string} conceptsPath - Directory to scan.
 * @returns {Promise<Array<{title: string, slug: string, summary: string}>>}
 *   Page summaries; `summary` is "" when the frontmatter has no string one.
 */
async function collectPageSummaries(conceptsPath) {
  const summaries = [];
  for (const { slug, meta } of await scanWikiPages(conceptsPath)) {
    const isListable = meta.title && typeof meta.title === "string" && !meta.orphaned;
    if (!isListable) continue;
    const summary = typeof meta.summary === "string" ? meta.summary : "";
    summaries.push({ title: meta.title, slug, summary });
  }
  return summaries;
}
1140
- function collectSharedContributors(sourceFile, state, conceptMap, excludeSets, out) {
1141
- const sourceEntry = state.sources[sourceFile];
1142
- if (!sourceEntry) return;
1143
- for (const slug of sourceEntry.concepts) {
1144
- const contributors = conceptMap.get(slug);
1145
- if (!contributors || contributors.length < 2) continue;
1146
- for (const contributor of contributors) {
1147
- const isExcluded = excludeSets.some((s) => s.has(contributor));
1148
- if (!isExcluded) out.add(contributor);
1149
- }
1150
- }
1931
/**
 * Replaces `[[...]]` wikilinks with plain text.
 *
 * For piped links (`[[target|display]]`) the display text is kept, since
 * that is what a reader would see; otherwise the link target is kept.
 *
 * @param {string} text - Text possibly containing wikilinks.
 * @returns {string} Text with wikilink markup removed.
 */
function stripWikilinks(text) {
  return text.replace(/\[\[([^\]]+)\]\]/g, (_match, inner) => {
    const pipeAt = inner.indexOf("|");
    if (pipeAt === -1) return inner;
    const display = inner.slice(pipeAt + 1);
    // `[[target|]]` has no display text; fall back to the target.
    return display.length > 0 ? display : inner.slice(0, pipeAt);
  });
}
1152
- function findAffectedSources(state, directChanges) {
1153
- const changedFiles = filesByStatus(directChanges, "new", "changed");
1154
- const deletedFiles = filesByStatus(directChanges, "deleted");
1155
- const conceptMap = buildConceptToSourcesMap(state.sources);
1156
- const affected = /* @__PURE__ */ new Set();
1157
- for (const changedFile of changedFiles) {
1158
- collectSharedContributors(
1159
- changedFile,
1160
- state,
1161
- conceptMap,
1162
- [changedFiles, deletedFiles, affected],
1163
- affected
1164
- );
1934
/**
 * Renders the wiki index markdown: a Concepts list, an optional Saved
 * Queries list, and a footer with the page count and generation timestamp.
 *
 * @param {Array<{slug: string, title: string, summary: string}>} concepts
 * @param {Array<{slug: string, title: string, summary: string}>} queries
 * @returns {string} Index file content.
 */
function buildIndexContent(concepts, queries) {
  const renderEntry = (page) => `- **[[${page.slug}|${page.title}]]** \u2014 ${stripWikilinks(page.summary)}`;
  const out = ["# Knowledge Wiki", "", "## Concepts", ""];
  concepts.forEach((page) => out.push(renderEntry(page)));
  // The queries section is omitted entirely when there are none.
  if (queries.length > 0) {
    out.push("", "## Saved Queries", "");
    queries.forEach((page) => out.push(renderEntry(page)));
  }
  const pageCount = concepts.length + queries.length;
  out.push("", `_${pageCount} pages | Generated ${(/* @__PURE__ */ new Date()).toISOString()}_`, "");
  return out.join("\n");
}
1184
- async function persistFrozenSlugs(root, frozenSlugs, successfulExtractions) {
1185
- const currentState = await readState(root);
1186
- const conceptMap = buildConceptToSourcesMap(currentState.sources);
1187
- const extractedBy = /* @__PURE__ */ new Set();
1188
- for (const result of successfulExtractions) {
1189
- if (result.concepts.length === 0) continue;
1190
- for (const c of result.concepts) {
1191
- extractedBy.add(slugify(c.concept));
1192
- }
1951
+
1952
+ // src/compiler/obsidian.ts
1953
+ import { readdir as readdir4 } from "fs/promises";
1954
+ import path16 from "path";
1955
+ var ABBREVIATION_MIN_WORDS = 3;
1956
+ var SWAP_CONJUNCTIONS = [" and ", " or "];
1957
/**
 * Adds Obsidian-specific fields to a frontmatter object in place: the given
 * tags and the aliases generated from the concept title.
 *
 * @param {object} frontmatter - Frontmatter object to mutate.
 * @param {string} conceptTitle - Title used to derive aliases.
 * @param {string[]} tags - Tags to store.
 * @returns {void}
 */
function addObsidianMeta(frontmatter, conceptTitle, tags) {
  frontmatter.tags = tags;
  const aliases = generateAliases(conceptTitle);
  frontmatter.aliases = aliases;
}
1961
/**
 * Derives alternative names for a page title: its slug (when different
 * from the title), a conjunction-swapped form, and an abbreviation, in that
 * order. Variants that do not apply are left out.
 *
 * @param {string} title - Page title.
 * @returns {string[]} Generated aliases (possibly empty).
 */
function generateAliases(title) {
  const slug = slugify(title);
  const aliases = slug !== title ? [slug] : [];
  const swapped = generateSwapAlias(title);
  if (swapped) aliases.push(swapped);
  const abbreviation = generateAbbreviation(title);
  if (abbreviation) aliases.push(abbreviation);
  return aliases;
}
1206
- function collectFreshSlugs(extractions, state) {
1207
- const freshSlugs = /* @__PURE__ */ new Set();
1208
- for (const result of extractions) {
1209
- const oldConcepts = new Set(state.sources[result.sourceFile]?.concepts ?? []);
1210
- for (const c of result.concepts) {
1211
- const slug = slugify(c.concept);
1212
- if (!oldConcepts.has(slug)) freshSlugs.add(slug);
1213
- }
1977
/**
 * Builds an alias by swapping the two sides of the first conjunction found
 * in the title (conjunctions are tried in SWAP_CONJUNCTIONS order, matched
 * case-insensitively, and the original casing of the conjunction is kept).
 *
 * @param {string} title - Page title.
 * @returns {string|null} The swapped title, or null when the title contains
 *   none of the conjunctions.
 */
function generateSwapAlias(title) {
  const lowered = title.toLowerCase();
  const conjunction = SWAP_CONJUNCTIONS.find((candidate) => lowered.includes(candidate));
  if (conjunction === void 0) return null;
  const at = lowered.indexOf(conjunction);
  const left = title.slice(0, at);
  const joiner = title.slice(at, at + conjunction.length);
  const right = title.slice(at + conjunction.length);
  return `${right}${joiner}${left}`;
}
1988
/**
 * Builds an upper-case initialism from the words of a title.
 *
 * @param {string} title - Page title.
 * @returns {string|null} The initialism, or null when the title has fewer
 *   than ABBREVIATION_MIN_WORDS words or the result equals the title.
 */
function generateAbbreviation(title) {
  // Splitting on whitespace yields empty strings for leading/trailing
  // whitespace; drop them so they neither count as words nor crash the
  // first-character lookup below.
  const words = title.split(/\s+/).filter((word) => word.length > 0);
  if (words.length < ABBREVIATION_MIN_WORDS) return null;
  // Array.from iterates by code point, so a word starting with an astral
  // character contributes a whole character rather than half a surrogate pair.
  const abbreviation = words.map((word) => Array.from(word)[0].toUpperCase()).join("");
  if (abbreviation === title) return null;
  return abbreviation;
}
1995
/**
 * Regenerates the Map of Content file: concept pages grouped by tag.
 *
 * @param {string} root - Project root.
 * @returns {Promise<void>}
 */
async function generateMOC(root) {
  const pages = await loadConceptPages(path16.join(root, CONCEPTS_DIR));
  const mocBody = buildMOCContent(groupPagesByTag(pages));
  await atomicWrite(path16.join(root, MOC_FILE), mocBody);
}
2002
/**
 * Loads slug, title and tags of every readable, non-orphaned markdown page
 * in the concepts directory.
 *
 * @param {string} conceptsPath - Directory to scan.
 * @returns {Promise<Array<{slug: string, title: string, tags: Array}>>}
 *   Pages; the title falls back to the slug and tags to an empty list.
 *   Empty when the directory cannot be listed.
 */
async function loadConceptPages(conceptsPath) {
  let entries;
  try {
    entries = await readdir4(conceptsPath);
  } catch {
    // Unreadable or missing directory: treated as "no pages".
    return [];
  }
  const pages = [];
  for (const name of entries.filter((entry) => entry.endsWith(".md"))) {
    const raw = await safeReadFile(path16.join(conceptsPath, name));
    if (!raw) continue;
    const { meta } = parseFrontmatter(raw);
    if (meta.orphaned) continue;
    const slug = name.replace(/\.md$/, "");
    pages.push({
      slug,
      title: typeof meta.title === "string" ? meta.title : slug,
      tags: Array.isArray(meta.tags) ? meta.tags : []
    });
  }
  return pages;
}
1217
- function findSlugOwners(slugs, conceptMap, excludeSets) {
1218
- const affected = /* @__PURE__ */ new Set();
1219
- for (const slug of slugs) {
1220
- const owners = conceptMap.get(slug);
1221
- if (!owners) continue;
1222
- for (const owner of owners) {
1223
- const isExcluded = excludeSets.some((s) => s.has(owner));
1224
- if (!isExcluded) affected.add(owner);
2023
/**
 * Groups pages by tag. A page appears under each of its tags; pages without
 * tags go under "Uncategorized".
 *
 * @param {Array<{tags: string[]}>} pages - Pages to group.
 * @returns {Map<string, Array<object>>} Tag → pages, in first-seen order.
 */
function groupPagesByTag(pages) {
  const groups = /* @__PURE__ */ new Map();
  for (const page of pages) {
    const keys = page.tags.length === 0 ? ["Uncategorized"] : page.tags;
    for (const key of keys) {
      appendToGroup(groups, key, page);
    }
  }
  return groups;
}
1229
- function findLateAffectedSources(extractions, state, allChanges) {
1230
- const compilingFiles = filesByStatus(allChanges, "new", "changed");
1231
- const deletedFiles = filesByStatus(allChanges, "deleted");
1232
- const conceptMap = buildConceptToSourcesMap(state.sources);
1233
- const freshSlugs = collectFreshSlugs(extractions, state);
1234
- return findSlugOwners(freshSlugs, conceptMap, [compilingFiles, deletedFiles]);
2036
/**
 * Add `page` to the list stored under `key` in `groups`, creating the list
 * when the key is seen for the first time. Mutates `groups` in place.
 */
function appendToGroup(groups, key, page) {
  const bucket = groups.get(key);
  if (!bucket) {
    groups.set(key, [page]);
    return;
  }
  bucket.push(page);
}
1236
- function findSharedConcepts(sourceFile, state) {
1237
- const shared = /* @__PURE__ */ new Set();
1238
- const sourceEntry = state.sources[sourceFile];
1239
- if (!sourceEntry) return shared;
1240
- const conceptMap = buildConceptToSourcesMap(state.sources);
1241
- for (const slug of sourceEntry.concepts) {
1242
- const contributors = conceptMap.get(slug);
1243
- if (contributors && contributors.length > 1) {
1244
- shared.add(slug);
2044
/**
 * Render the Map of Content markdown from a tag -> pages map.
 *
 * Tags become `##` sections sorted with localeCompare, with "Uncategorized"
 * always last. Inside a section, pages are listed as `[[slug|title]]`
 * wikilinks sorted by title.
 *
 * The page lists are copied before sorting so the caller's arrays are not
 * reordered, and tag keys are compared as strings so a non-string key
 * (e.g. a number) cannot throw on `localeCompare`.
 *
 * @param {Map} tagGroups tag name -> array of `{ slug, title }` pages
 * @returns {string} markdown document
 */
function buildMOCContent(tagGroups) {
  const lines = ["# Map of Content", ""];
  const sortedTags = [...tagGroups.keys()].sort((a, b) => {
    if (a === "Uncategorized") return 1;
    if (b === "Uncategorized") return -1;
    return String(a).localeCompare(String(b));
  });
  for (const tag of sortedTags) {
    // Copy first: Array.prototype.sort works in place.
    const pages = [...(tagGroups.get(tag) ?? [])].sort((a, b) => a.title.localeCompare(b.title));
    lines.push(`## ${tag}`, "");
    for (const page of pages) {
      lines.push(`- [[${page.slug}|${page.title}]]`);
    }
    lines.push("");
  }
  return lines.join("\n");
}
1249
- async function freezeFailedExtractions(root, results, frozenSlugs) {
1250
- for (const result of results) {
1251
- if (result.concepts.length > 0) continue;
1252
- status("!", warn(`${result.sourceFile}: no concepts \u2014 will retry.`));
1253
- const currentState = await readState(root);
1254
- const oldConcepts = currentState.sources[result.sourceFile]?.concepts ?? [];
1255
- for (const slug of oldConcepts) frozenSlugs.add(slug);
1256
- await updateSourceState(root, result.sourceFile, {
1257
- hash: "",
1258
- concepts: oldConcepts,
1259
- compiledAt: (/* @__PURE__ */ new Date()).toISOString()
1260
- });
2061
+
2062
+ // src/utils/embeddings.ts
2063
+ import { readFile as readFile12, readdir as readdir5 } from "fs/promises";
2064
+ import { existsSync as existsSync4 } from "fs";
2065
+ import path17 from "path";
2066
+
2067
+ // src/utils/retrieval.ts
2068
+ import { createHash as createHash2 } from "crypto";
2069
/**
 * Fingerprint a chunk of text: the first 16 hex characters of the SHA-256
 * digest of its UTF-8 encoding.
 */
function hashChunkText(text) {
  const hasher = createHash2("sha256");
  hasher.update(text, "utf8");
  const hex = hasher.digest("hex");
  return hex.slice(0, 16);
}
2072
/**
 * Split a page body into chunks.
 *
 * The body is cut into paragraphs, oversized paragraphs are broken into
 * smaller pieces, and the pieces are packed into chunks by appendParagraph.
 * Whatever is left in the buffer becomes the final chunk, after which
 * mergeTrailingFragment gets a chance to fold a short tail into its
 * predecessor.
 *
 * @returns {string[]} chunk texts (empty when the body has no paragraphs)
 */
function splitIntoChunks(body) {
  const paragraphs = extractParagraphs(body);
  if (paragraphs.length === 0) return [];
  const chunks = [];
  const pieces = paragraphs.flatMap((paragraph) => splitOversizedParagraph(paragraph));
  let pending = "";
  for (const piece of pieces) {
    pending = appendParagraph(pending, piece, chunks);
  }
  if (pending.length > 0) {
    chunks.push(pending);
  }
  return mergeTrailingFragment(chunks);
}
2085
/**
 * Try to add `paragraph` to the current `buffer` (joined by a blank line).
 *
 * - If the combined text is at most CHUNK_TARGET_CHARS, it becomes the new
 *   buffer.
 * - Otherwise a non-empty buffer is flushed to `chunks` and the paragraph
 *   starts a new buffer.
 * - With an empty buffer, the paragraph alone is too long: it is pushed to
 *   `chunks` directly and the buffer stays empty.
 *
 * @returns {string} the buffer to carry into the next call
 */
function appendParagraph(buffer, paragraph, chunks) {
  const candidate = buffer ? `${buffer}\n\n${paragraph}` : paragraph;
  if (candidate.length <= CHUNK_TARGET_CHARS) return candidate;
  if (buffer.length === 0) {
    chunks.push(candidate);
    return "";
  }
  chunks.push(buffer);
  return paragraph;
}
2097
/**
 * Fold a too-short final chunk into the chunk before it.
 *
 * The input is returned untouched when there are fewer than two chunks, when
 * the last chunk already has CHUNK_MIN_CHARS characters, or when the merged
 * text (including the two-character blank-line separator) would exceed
 * CHUNK_MAX_CHARS. Otherwise a new array is returned with the last two
 * chunks joined.
 */
function mergeTrailingFragment(chunks) {
  const count = chunks.length;
  if (count < 2) return chunks;
  const tail = chunks[count - 1];
  if (tail.length >= CHUNK_MIN_CHARS) return chunks;
  const beforeTail = chunks[count - 2];
  const mergedLength = beforeTail.length + tail.length + 2;
  if (mergedLength > CHUNK_MAX_CHARS) return chunks;
  return [...chunks.slice(0, -2), `${beforeTail}\n\n${tail}`];
}
2109
/**
 * Split `body` on runs of two or more newlines and return the trimmed,
 * non-empty paragraphs in order.
 */
function extractParagraphs(body) {
  const paragraphs = [];
  for (const block of body.split(/\n{2,}/)) {
    const trimmed = block.trim();
    if (trimmed.length > 0) paragraphs.push(trimmed);
  }
  return paragraphs;
}
2112
/**
 * Break a paragraph longer than CHUNK_MAX_CHARS into smaller pieces.
 *
 * The paragraph is split after sentence-ending punctuation (`.`, `!`, `?`)
 * and sentences are greedily packed, space-separated, into pieces that stay
 * within CHUNK_MAX_CHARS. Any piece that is still too long (a single huge
 * sentence) is passed through hardCut.
 *
 * @returns {string[]} the paragraph itself when it already fits
 */
function splitOversizedParagraph(paragraph) {
  if (paragraph.length <= CHUNK_MAX_CHARS) return [paragraph];
  const pieces = [];
  let current = "";
  for (const sentence of paragraph.split(/(?<=[.!?])\s+/)) {
    // +1 accounts for the joining space.
    const overflows = current.length > 0 && current.length + 1 + sentence.length > CHUNK_MAX_CHARS;
    if (overflows) {
      pieces.push(current.trim());
      current = sentence;
      continue;
    }
    current = current ? `${current} ${sentence}` : sentence;
  }
  if (current.length > 0) {
    pieces.push(current.trim());
  }
  return pieces.flatMap(hardCut);
}
1279
- async function orphanUnownedFrozenPages(root, frozenSlugs) {
1280
- const currentState = await readState(root);
1281
- const ownedSlugs = /* @__PURE__ */ new Set();
1282
- for (const entry of Object.values(currentState.sources)) {
1283
- for (const slug of entry.concepts) ownedSlugs.add(slug);
2128
/**
 * Cut `text` into consecutive slices of at most CHUNK_MAX_CHARS UTF-16 code
 * units. Text that already fits is returned as a single-element array.
 *
 * A cut is moved one unit earlier when it would otherwise land between the
 * two halves of a surrogate pair, so no slice ends or starts with a lone
 * surrogate (which would be mangled when the slice is later UTF-8 encoded).
 *
 * Only the first argument is used; the function is passed directly to
 * `flatMap`, which also supplies an index and the array.
 *
 * @param {string} text
 * @returns {string[]}
 */
function hardCut(text) {
  if (text.length <= CHUNK_MAX_CHARS) return [text];
  const pieces = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + CHUNK_MAX_CHARS, text.length);
    if (end < text.length && end - start > 1) {
      const last = text.charCodeAt(end - 1);
      const next = text.charCodeAt(end);
      const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;
      if (splitsPair) end -= 1;
    }
    pieces.push(text.slice(start, end));
    start = end;
  }
  return pieces;
}
2136
/**
 * Re-rank retrieval candidates with a BM25 score computed over their text.
 *
 * Returns `[]` for no candidates. When the query yields no tokens, the
 * candidates are returned in their original order, each paired with its
 * own `baseScore`. Otherwise ranking is delegated to rankByBm25Score.
 *
 * @returns {Array<{candidate: object, score: number}>}
 */
function rerankWithBm25(query, candidates) {
  if (candidates.length === 0) return [];
  const queryTerms = tokenize(query);
  if (queryTerms.length > 0) {
    const docs = candidates.map((entry) => tokenize(entry.text));
    return rankByBm25Score(candidates, docs, queryTerms, buildCorpusStats(docs));
  }
  return candidates.map((candidate) => ({ candidate, score: candidate.baseScore }));
}
1290
- async function orphanPage(root, slug, reason) {
1291
- const pagePath = path9.join(root, CONCEPTS_DIR, `${slug}.md`);
1292
- const content = await safeReadFile(pagePath);
1293
- if (!content) return;
1294
- const { meta } = parseFrontmatter(content);
1295
- if (meta.orphaned === true) return;
1296
- const updated = content.replace("---\n", "---\norphaned: true\n");
1297
- await atomicWrite(pagePath, updated);
1298
- status("\u26A0", warn(`Orphaned: ${slug}.md (${reason})`));
2146
/**
 * Score each candidate as its BM25 score plus `baseScore` weighted by
 * BASE_SCORE_WEIGHT, and return the pairs sorted by descending score.
 * `docs[i]` holds the tokens of `candidates[i]`.
 */
function rankByBm25Score(candidates, docs, queryTerms, stats) {
  const ranked = [];
  candidates.forEach((candidate, position) => {
    const lexical = bm25Score(queryTerms, docs[position], stats);
    ranked.push({ candidate, score: lexical + candidate.baseScore * BASE_SCORE_WEIGHT });
  });
  return ranked.sort((left, right) => right.score - left.score);
}
2154
/**
 * Lower-case `text` and split it into word tokens.
 *
 * A token is a maximal run of Unicode letters, combining marks and digits,
 * so accented and non-Latin words stay intact instead of being broken apart
 * or dropped. For plain ASCII input the result is the same as matching
 * `[a-z0-9]+`.
 *
 * @param {string} text
 * @returns {string[]} tokens in order of appearance (empty when none)
 */
function tokenize(text) {
  return text.toLowerCase().match(/[\p{L}\p{M}\p{N}]+/gu) ?? [];
}
2157
/**
 * Compute corpus statistics for BM25 from tokenized documents.
 *
 * @param {string[][]} docs one token array per document
 * @returns {{docFreq: Map, avgDocLen: number, totalDocs: number}} `docFreq`
 *   maps a term to the number of documents containing it; `avgDocLen` is 0
 *   for an empty corpus.
 */
function buildCorpusStats(docs) {
  const docFreq = /* @__PURE__ */ new Map();
  let tokenTotal = 0;
  docs.forEach((tokens) => {
    tokenTotal += tokens.length;
    // Each document contributes at most once per distinct term.
    new Set(tokens).forEach((term) => {
      docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
    });
  });
  const totalDocs = docs.length;
  return {
    docFreq,
    avgDocLen: totalDocs > 0 ? tokenTotal / totalDocs : 0,
    totalDocs
  };
}
2169
// BM25 parameters: K1 is used in the term-frequency part, B in the
// document-length normalization.
var BM25_K1 = 1.5;
var BM25_B = 0.75;
// Weight applied to a candidate's baseScore when it is added to its BM25 score.
var BASE_SCORE_WEIGHT = 0.5;
/**
 * BM25 score of one tokenized document for the given query terms.
 *
 * Returns 0 for an empty document or an empty corpus. Query terms absent
 * from the document contribute nothing; a term repeated in the query is
 * counted once per occurrence.
 *
 * @param {string[]} queryTerms
 * @param {string[]} docTokens
 * @param {{docFreq: Map, avgDocLen: number, totalDocs: number}} stats
 * @returns {number}
 */
function bm25Score(queryTerms, docTokens, stats) {
  if (docTokens.length === 0 || stats.totalDocs === 0) return 0;
  const frequencies = countTerms(docTokens);
  // `|| 1` avoids dividing by a zero average length.
  const lengthRatio = docTokens.length / (stats.avgDocLen || 1);
  const lengthPenalty = BM25_K1 * (1 - BM25_B + BM25_B * lengthRatio);
  return queryTerms.reduce((sum, term) => {
    const tf = frequencies.get(term) ?? 0;
    if (tf === 0) return sum;
    const idf = idfWeight(stats.docFreq.get(term) ?? 0, stats.totalDocs);
    return sum + idf * ((tf * (BM25_K1 + 1)) / (tf + lengthPenalty));
  }, 0);
}
2187
/**
 * Inverse-document-frequency weight:
 * ln(1 + (totalDocs - docFrequency + 0.5) / (docFrequency + 0.5)).
 */
function idfWeight(docFrequency, totalDocs) {
  const ratio = (totalDocs - docFrequency + 0.5) / (docFrequency + 0.5);
  return Math.log(1 + ratio);
}
2192
/**
 * Count how many times each token occurs.
 *
 * @returns {Map} token -> occurrence count, keyed in first-seen order
 */
function countTerms(tokens) {
  return tokens.reduce((tally, token) => {
    const seen = tally.get(token) ?? 0;
    return tally.set(token, seen + 1);
  }, /* @__PURE__ */ new Map());
}
1300
2197
 
1301
- // src/compiler/resolver.ts
1302
- import { readdir as readdir2, readFile as readFile6 } from "fs/promises";
1303
- import path10 from "path";
1304
- import { existsSync as existsSync2 } from "fs";
1305
- async function buildTitleIndex(root) {
1306
- const conceptsDir = path10.join(root, CONCEPTS_DIR);
1307
- if (!existsSync2(conceptsDir)) return [];
1308
- const files = await readdir2(conceptsDir);
1309
- const pages = [];
1310
- for (const file of files) {
1311
- if (!file.endsWith(".md")) continue;
1312
- const filePath = path10.join(conceptsDir, file);
1313
- const content = await readFile6(filePath, "utf-8");
1314
- const { meta } = parseFrontmatter(content);
1315
- if (meta.title && typeof meta.title === "string" && !meta.orphaned) {
1316
- pages.push({
1317
- slug: file.replace(/\.md$/, ""),
1318
- title: meta.title,
1319
- filePath
1320
- });
1321
- }
2198
+ // src/utils/embeddings.ts
2199
+ var STORE_VERSION = 2;
2200
/**
 * Cosine similarity of two numeric vectors.
 *
 * Returns 0 when the vectors differ in length, are empty, or either one has
 * zero magnitude.
 */
function cosineSimilarity(a, b) {
  const size = a.length;
  if (size === 0 || size !== b.length) return 0;
  let dot = 0;
  let sumSquaresA = 0;
  let sumSquaresB = 0;
  let i = 0;
  while (i < size) {
    const x = a[i];
    const y = b[i];
    dot += x * y;
    sumSquaresA += x * x;
    sumSquaresB += y * y;
    i += 1;
  }
  if (sumSquaresA === 0 || sumSquaresB === 0) return 0;
  return dot / (Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB));
}
1325
- function isInsideWikilink(text, position) {
1326
- const before = text.lastIndexOf("[[", position);
1327
- const after = text.indexOf("]]", position);
1328
- if (before === -1 || after === -1) return false;
1329
- const closeBefore = text.indexOf("]]", before);
1330
- return closeBefore >= position;
2213
/**
 * Return the `k` entries of `store.entries` whose vectors are most similar
 * (by cosine similarity) to `queryVec`, best match first.
 */
function findTopK(queryVec, store, k) {
  const ranked = store.entries.map((entry) => {
    const score = cosineSimilarity(queryVec, entry.vector);
    return { entry, score };
  });
  ranked.sort((left, right) => right.score - left.score);
  const best = ranked.slice(0, k);
  return best.map(({ entry }) => entry);
}
1332
- function isInsideCitation(text, position) {
1333
- const before = text.lastIndexOf("^[", position);
1334
- const after = text.indexOf("]", position);
1335
- if (before === -1 || after === -1) return false;
1336
- const closeBefore = text.indexOf("]", before);
1337
- return closeBefore >= position;
2221
/**
 * Score every chunk against `queryVec` with cosine similarity and return the
 * top `k` as `{ chunk, score }` pairs, best match first.
 */
function findTopKChunks(queryVec, chunks, k) {
  const ranked = chunks.map((chunk) => {
    const score = cosineSimilarity(queryVec, chunk.vector);
    return { chunk, score };
  });
  ranked.sort((left, right) => right.score - left.score);
  return ranked.slice(0, k);
}
1339
- function isWordBoundary(text, start, end) {
1340
- const before = start === 0 || /[\s,.:;!?()\[\]{}/"']/.test(text[start - 1]);
1341
- const after = end >= text.length || /[\s,.:;!?()\[\]{}/"']/.test(text[end]);
1342
- return before && after;
2229
/**
 * Load and JSON-parse the embedding store at EMBEDDINGS_FILE under `root`.
 *
 * @returns {Promise<object|null>} the parsed store, or null when the file
 *   does not exist. Read or parse failures propagate to the caller.
 */
async function readEmbeddingStore(root) {
  const storePath = path17.join(root, EMBEDDINGS_FILE);
  if (existsSync4(storePath)) {
    const serialized = await readFile12(storePath, "utf-8");
    return JSON.parse(serialized);
  }
  return null;
}
1344
- function findTitleMatches(text, title) {
1345
- const escaped = title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
1346
- const regex = new RegExp(escaped, "gi");
1347
- const matches = [];
1348
- let match;
1349
- while ((match = regex.exec(text)) !== null) {
1350
- matches.push({ start: match.index, end: match.index + match[0].length });
1351
- }
1352
- return matches;
2235
/**
 * Serialize `store` as 2-space-indented JSON and write it to
 * EMBEDDINGS_FILE under `root` via atomicWrite.
 */
async function writeEmbeddingStore(root, store) {
  const storePath = path17.join(root, EMBEDDINGS_FILE);
  const serialized = JSON.stringify(store, null, 2);
  await atomicWrite(storePath, serialized);
}
1354
- function isLinkablePosition(text, start, end) {
1355
- if (isInsideWikilink(text, start)) return false;
1356
- if (isInsideCitation(text, start)) return false;
1357
- return isWordBoundary(text, start, end);
2239
/**
 * Find the pages most relevant to `question` using page-level embeddings.
 *
 * Returns `[]` when no usable store is available (see loadActiveStore).
 * Otherwise embeds the question and returns `{ slug, title, summary }` for
 * the EMBEDDING_TOP_K nearest entries.
 */
async function findRelevantPages(root, question) {
  const hasEntries = (candidate) => candidate.entries.length > 0;
  const store = await loadActiveStore(root, hasEntries);
  if (!store) return [];
  const queryVec = await getProvider().embed(question);
  const nearest = findTopK(queryVec, store, EMBEDDING_TOP_K);
  return nearest.map(({ slug, title, summary }) => ({ slug, title, summary }));
}
1359
- function addWikilinks(body, titles, selfTitle) {
1360
- let result = body;
1361
- const selfLower = selfTitle.toLowerCase();
1362
- for (const page of titles) {
1363
- if (page.title.toLowerCase() === selfLower) continue;
1364
- const matches = findTitleMatches(result, page.title);
1365
- for (const m of matches.reverse()) {
1366
- if (!isLinkablePosition(result, m.start, m.end)) continue;
1367
- result = result.slice(0, m.start) + `[[${page.title}]]` + result.slice(m.end);
2249
/**
 * Find the `k` stored chunks most similar to `question`.
 *
 * Returns `[]` when no usable store with chunks is available (see
 * loadActiveStore); otherwise `{ chunk, score }` pairs, best match first.
 */
async function findRelevantChunks(root, question, k) {
  const hasChunks = (candidate) => Boolean(candidate.chunks && candidate.chunks.length > 0);
  const store = await loadActiveStore(root, hasChunks);
  if (!store) return [];
  const provider = getProvider();
  const queryVec = await provider.embed(question);
  return findTopKChunks(queryVec, store.chunks ?? [], k);
}
2255
/**
 * Load the embedding store only if it is usable for querying.
 *
 * Returns null when the store is missing, when `hasContent(store)` is false,
 * or when the store was built with a different model than the one
 * resolveEmbeddingModel reports (in which case a warning is issued).
 */
async function loadActiveStore(root, hasContent) {
  const store = await readEmbeddingStore(root);
  if (!store) return null;
  if (!hasContent(store)) return null;
  const activeModel = resolveEmbeddingModel();
  if (store.model === activeModel) return store;
  warnStaleEmbeddingStore(store.model, activeModel);
  return null;
}
2265
/**
 * Gather page records from the `.md` files in CONCEPTS_DIR and QUERIES_DIR
 * under `root`. A directory that cannot be listed is skipped, as are files
 * for which readPageRecord returns nothing.
 */
async function collectPageRecords(root) {
  const records = [];
  for (const dir of [CONCEPTS_DIR, QUERIES_DIR]) {
    const absDir = path17.join(root, dir);
    let names;
    try {
      names = await readdir5(absDir);
    } catch {
      continue;
    }
    const markdownNames = names.filter((name) => name.endsWith(".md"));
    for (const name of markdownNames) {
      const record = await readPageRecord(absDir, name);
      if (record) {
        records.push(record);
      }
    }
  }
  return records;
}
1372
- async function resolveLinks(root, changedSlugs, newSlugs) {
1373
- const titleIndex = await buildTitleIndex(root);
1374
- if (titleIndex.length === 0) return 0;
1375
- let linkCount = 0;
1376
- linkCount += await resolveOutboundLinks(titleIndex, changedSlugs);
1377
- linkCount += await resolveInboundLinks(titleIndex, newSlugs);
1378
- if (linkCount > 0) {
1379
- status("\u{1F517}", dim(`Resolved links in ${linkCount} page(s)`));
2282
/**
 * Read one wiki page and return `{ slug, title, summary, body }`.
 *
 * Returns null when the file yields no content, when the page is flagged
 * `orphaned`, or when its frontmatter has no string `title`. `summary`
 * defaults to "" when it is not a string.
 *
 * The empty-content guard mirrors the other safeReadFile callers in this
 * file, so parseFrontmatter is never handed an empty or missing value.
 *
 * @param {string} absDir directory containing the file
 * @param {string} file file name ending in `.md`
 */
async function readPageRecord(absDir, file) {
  const content = await safeReadFile(path17.join(absDir, file));
  if (!content) return null;
  const { meta, body } = parseFrontmatter(content);
  if (meta.orphaned || typeof meta.title !== "string") return null;
  return {
    slug: file.replace(/\.md$/, ""),
    title: meta.title,
    summary: typeof meta.summary === "string" ? meta.summary : "",
    body
  };
}
2293
/**
 * Text that represents a page for embedding: the title, followed by a blank
 * line and the summary when the summary is non-empty.
 */
function buildEmbeddingText(record) {
  if (record.summary) {
    return `${record.title}\n\n${record.summary}`;
  }
  return record.title;
}
2298
/**
 * Embed the records whose slug is in `slugsToEmbed`, one provider call at a
 * time, and return fresh store entries stamped with a shared timestamp.
 *
 * @returns {Promise<Array<{slug, title, summary, vector, updatedAt}>>}
 */
async function embedPages(records, slugsToEmbed) {
  const provider = getProvider();
  const updatedAt = (/* @__PURE__ */ new Date()).toISOString();
  const selected = records.filter((record) => slugsToEmbed.has(record.slug));
  const fresh = [];
  for (const record of selected) {
    const vector = await provider.embed(buildEmbeddingText(record));
    const { slug, title, summary } = record;
    fresh.push({ slug, title, summary, vector, updatedAt });
  }
  return fresh;
}
2315
// Stored/active model pairs that have already triggered a warning.
var warnedStaleModels = /* @__PURE__ */ new Set();
/**
 * Warn that the embedding store was built with a different model than the
 * active one. Each stored/active pair is reported only once.
 */
function warnStaleEmbeddingStore(storedModel, activeModel) {
  const pairKey = `${storedModel}\u2192${activeModel}`;
  if (warnedStaleModels.has(pairKey)) return;
  warnedStaleModels.add(pairKey);
  const message = `Embedding store was built with "${storedModel}" but active embedding model is "${activeModel}". Falling back to full-index selection. Run 'llmwiki compile' to rebuild embeddings.`;
  status("!", warn(message));
}
2327
/**
 * Determine the embedding model name for the active provider.
 *
 * A non-empty LLMWIKI_EMBEDDING_MODEL environment variable wins, but only
 * for the "openai" and "ollama" providers. Otherwise the provider's entry in
 * EMBEDDING_MODELS is used, falling back to the "anthropic" entry.
 */
function resolveEmbeddingModel() {
  const providerName = getActiveProviderName();
  const override = process.env.LLMWIKI_EMBEDDING_MODEL?.trim();
  const supportsOverride = providerName === "openai" || providerName === "ollama";
  if (override && supportsOverride) {
    return override;
  }
  return EMBEDDING_MODELS[providerName] ?? EMBEDDING_MODELS.anthropic;
}
1383
- async function resolveOutboundLinks(titleIndex, changedSlugs) {
1384
- let count = 0;
1385
- for (const page of titleIndex) {
1386
- if (!changedSlugs.includes(page.slug)) continue;
1387
- const didLink = await linkPage(page, titleIndex);
1388
- if (didLink) count++;
2335
/**
 * Combine previously stored entries with freshly embedded ones.
 *
 * Existing entries are kept only if their slug is still live; a fresh entry
 * replaces an existing one with the same slug (keeping its position), and
 * new slugs are appended.
 */
function mergeEntries(existing, fresh, liveSlugs) {
  const survivors = existing.filter((entry) => liveSlugs.has(entry.slug));
  const bySlug = new Map(
    [...survivors, ...fresh].map((entry) => [entry.slug, entry])
  );
  return [...bySlug.values()];
}
1392
- async function resolveInboundLinks(titleIndex, newSlugs) {
1393
- if (newSlugs.length === 0) return 0;
1394
- const newTitles = titleIndex.filter((p) => newSlugs.includes(p.slug));
1395
- if (newTitles.length === 0) return 0;
1396
- let count = 0;
1397
- for (const page of titleIndex) {
1398
- if (newSlugs.includes(page.slug)) continue;
1399
- const content = await readFile6(page.filePath, "utf-8");
1400
- const { body } = parseFrontmatter(content);
1401
- const linked = addWikilinks(body, newTitles, page.title);
1402
- if (linked !== body) {
1403
- const newContent = content.replace(body, linked);
1404
- await atomicWrite(page.filePath, newContent);
1405
- count++;
2345
/**
 * Rebuild the chunk list for all `records`.
 *
 * Existing chunks belonging to live slugs are indexed so unchanged chunks
 * can be reused; embedRecordChunks decides per chunk whether to reuse or
 * re-embed (`forceAll` disables reuse). Records are processed sequentially.
 */
async function refreshChunkEmbeddings(records, existing, forceAll) {
  const liveSlugs = new Set(records.map((record) => record.slug));
  const liveChunks = existing.filter((chunk) => liveSlugs.has(chunk.slug));
  const reusable = indexChunksByKey(liveChunks);
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const refreshed = [];
  for (const record of records) {
    const pageChunks = await embedRecordChunks(record, reusable, forceAll, timestamp);
    for (const chunk of pageChunks) {
      refreshed.push(chunk);
    }
  }
  return refreshed;
}
2356
/**
 * Produce the chunk entries for one page record.
 *
 * The body is split into chunks; a chunk whose stored counterpart is
 * reusable (see pickReusableChunk) is carried over with the current title,
 * every other chunk is embedded anew and stamped with `now`.
 */
async function embedRecordChunks(record, existingByKey, forceAll, now) {
  const provider = getProvider();
  const out = [];
  for (const [chunkIndex, text] of splitIntoChunks(record.body).entries()) {
    const contentHash = hashChunkText(text);
    const cached = pickReusableChunk(existingByKey, record.slug, chunkIndex, contentHash, forceAll);
    if (cached) {
      out.push({ ...cached, title: record.title });
      continue;
    }
    const vector = await provider.embed(text);
    out.push({
      slug: record.slug,
      title: record.title,
      chunkIndex,
      contentHash,
      text,
      vector,
      updatedAt: now
    });
  }
  return out;
}
1410
- async function linkPage(page, titleIndex) {
1411
- const content = await readFile6(page.filePath, "utf-8");
1412
- const { body } = parseFrontmatter(content);
1413
- const linked = addWikilinks(body, titleIndex, page.title);
1414
- if (linked === body) return false;
1415
- const newContent = content.replace(body, linked);
1416
- await atomicWrite(page.filePath, newContent);
1417
- return true;
2381
/**
 * Index chunks by their chunkKey (slug plus chunk index). When two chunks
 * share a key, the later one wins.
 */
function indexChunksByKey(chunks) {
  return new Map(
    chunks.map((chunk) => [chunkKey(chunk.slug, chunk.chunkIndex), chunk])
  );
}
1419
-
1420
- // src/compiler/indexgen.ts
1421
- import { readdir as readdir3 } from "fs/promises";
1422
- import path11 from "path";
1423
- async function generateIndex(root) {
1424
- status("*", info("Generating index..."));
1425
- const conceptsPath = path11.join(root, CONCEPTS_DIR);
1426
- const queriesPath = path11.join(root, QUERIES_DIR);
1427
- const concepts = await collectPageSummaries(conceptsPath);
1428
- const queries = await collectPageSummaries(queriesPath);
1429
- concepts.sort((a, b) => a.title.localeCompare(b.title));
1430
- queries.sort((a, b) => a.title.localeCompare(b.title));
1431
- const indexContent = buildIndexContent(concepts, queries);
1432
- const indexPath = path11.join(root, INDEX_FILE);
1433
- await atomicWrite(indexPath, indexContent);
1434
- const total = concepts.length + queries.length;
1435
- status("+", success(`Index updated with ${total} pages.`));
2386
/** Lookup key for a chunk: `<slug>#<chunkIndex>`. */
function chunkKey(slug, chunkIndex) {
  const key = `${slug}#${chunkIndex}`;
  return key;
}
1437
- async function scanWikiPages(dirPath) {
1438
- let files;
1439
- try {
1440
- files = await readdir3(dirPath);
1441
- } catch {
1442
- return [];
2389
/**
 * Return the stored chunk for `slug`/`chunkIndex` if it may be reused, i.e.
 * `forceAll` is off, a stored chunk exists, and its content hash matches
 * `contentHash`. Returns null otherwise.
 */
function pickReusableChunk(byKey, slug, chunkIndex, contentHash, forceAll) {
  if (forceAll) return null;
  const cached = byKey.get(chunkKey(slug, chunkIndex));
  if (cached && cached.contentHash === contentHash) {
    return cached;
  }
  return null;
}
2395
/**
 * Bring the embedding store in line with the pages on disk.
 *
 * Changed slugs that still exist are re-embedded. Every page is re-embedded
 * when there is no store yet, when the store was built with a different
 * model, or when the store is empty while pages exist. If
 * shouldRunEmbedding finds nothing to do, the store is left untouched;
 * otherwise entries and chunks are refreshed and persisted.
 */
async function updateEmbeddings(root, changedSlugs) {
  const records = await collectPageRecords(root);
  const liveSlugs = new Set(records.map((record) => record.slug));
  const embeddingModel = resolveEmbeddingModel();
  const existingStore = await readEmbeddingStore(root);
  const modelChanged = Boolean(existingStore && existingStore.model !== embeddingModel);

  const toEmbed = /* @__PURE__ */ new Set();
  for (const slug of changedSlugs) {
    if (liveSlugs.has(slug)) toEmbed.add(slug);
  }

  // Vectors from another model are not carried over.
  const previousEntries = modelChanged ? [] : existingStore?.entries ?? [];
  const previousChunks = modelChanged ? [] : existingStore?.chunks ?? [];

  const isEmptyStore = isStoreEmpty(existingStore);
  const needsFullRebuild = !existingStore || modelChanged || (isEmptyStore && liveSlugs.size > 0);
  if (needsFullRebuild) {
    records.forEach((record) => toEmbed.add(record.slug));
  }

  const hasWork = shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs);
  if (!hasWork) {
    return;
  }

  const freshEntries = await embedPages(records, toEmbed);
  const mergedEntries = mergeEntries(previousEntries, freshEntries, liveSlugs);
  const mergedChunks = await refreshChunkEmbeddings(records, previousChunks, modelChanged);
  await persistRefreshedStore(root, embeddingModel, mergedEntries, mergedChunks);
}
1452
- async function collectPageSummaries(conceptsPath) {
1453
- const scanned = await scanWikiPages(conceptsPath);
1454
- return scanned.filter(({ meta }) => meta.title && typeof meta.title === "string" && !meta.orphaned).map(({ slug, meta }) => ({
1455
- title: meta.title,
1456
- slug,
1457
- summary: typeof meta.summary === "string" ? meta.summary : ""
1458
- }));
2416
/**
 * Write a new embedding store and report the page and chunk counts.
 *
 * `dimensions` is taken from the first entry's vector, else the first
 * chunk's vector, else 0.
 */
async function persistRefreshedStore(root, embeddingModel, entries, chunks) {
  const dimensions = entries[0]?.vector.length ?? chunks[0]?.vector.length ?? 0;
  await writeEmbeddingStore(root, {
    version: STORE_VERSION,
    model: embeddingModel,
    dimensions,
    entries,
    chunks
  });
  const summary = `Embeddings updated (${entries.length} pages, ${chunks.length} chunks).`;
  status("*", dim(summary));
}
1460
- function stripWikilinks(text) {
1461
- return text.replace(/\[\[([^\]]+)\]\]/g, "$1");
2431
/**
 * True when a store exists but holds neither entries nor chunks.
 * A missing store (null/undefined) is reported as not empty.
 */
function isStoreEmpty(store) {
  if (!store) return false;
  if (store.entries.length !== 0) return false;
  const chunks = store.chunks;
  return !chunks || chunks.length === 0;
}
1463
- function buildIndexContent(concepts, queries) {
1464
- const lines = ["# Knowledge Wiki", "", "## Concepts", ""];
1465
- for (const page of concepts) {
1466
- lines.push(`- **[[${page.title}]]** \u2014 ${stripWikilinks(page.summary)}`);
1467
- }
1468
- if (queries.length > 0) {
1469
- lines.push("", "## Saved Queries", "");
1470
- for (const page of queries) {
1471
- lines.push(`- **[[${page.title}]]** \u2014 ${stripWikilinks(page.summary)}`);
1472
- }
1473
- }
1474
- const total = concepts.length + queries.length;
1475
- lines.push("");
1476
- lines.push(`_${total} pages | Generated ${(/* @__PURE__ */ new Date()).toISOString()}_`);
1477
- lines.push("");
1478
- return lines.join("\n");
2435
/**
 * Decide whether the embedding store needs to be rewritten.
 *
 * True when the model changed, when there are slugs to embed, when any
 * stored entry or chunk belongs to a slug that is no longer live, or when
 * entries exist without any chunks while pages exist.
 */
function shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs) {
  if (modelChanged || toEmbed.size > 0) return true;
  const isStale = (item) => !liveSlugs.has(item.slug);
  if (previousEntries.some(isStale) || previousChunks.some(isStale)) return true;
  const chunksMissing = previousEntries.length > 0 && previousChunks.length === 0 && liveSlugs.size > 0;
  return chunksMissing;
}
1480
2443
 
1481
- // src/compiler/obsidian.ts
1482
- import { readdir as readdir4 } from "fs/promises";
1483
- import path12 from "path";
1484
- var ABBREVIATION_MIN_WORDS = 3;
1485
- var SWAP_CONJUNCTIONS = [" and ", " or "];
1486
- function addObsidianMeta(frontmatter, conceptTitle, tags) {
1487
- frontmatter.tags = tags;
1488
- frontmatter.aliases = generateAliases(conceptTitle);
2444
+ // src/compiler/candidates.ts
2445
+ import { readdir as readdir6, rename as rename3, unlink as unlink2, writeFile as writeFile4, mkdir as mkdir5 } from "fs/promises";
2446
+ import { existsSync as existsSync5 } from "fs";
2447
+ import path18 from "path";
2448
+ import { randomBytes } from "crypto";
2449
// Number of random bytes in a candidate id suffix (hex-encoded, so twice as
// many characters).
var ID_SUFFIX_BYTES = 4;
// File extension of candidate files.
var CANDIDATE_EXT = ".json";
/**
 * Build a candidate id: the slug, a dash, and a random hex suffix.
 */
function buildCandidateId(slug) {
  const randomHex = randomBytes(ID_SUFFIX_BYTES).toString("hex");
  return [slug, randomHex].map((part) => `${part}`).join("-");
}
1490
- function generateAliases(title) {
1491
- const aliases = [];
1492
- const slug = slugify(title);
1493
- if (slug !== title) {
1494
- aliases.push(slug);
2455
/** Path of the pending candidate file for `id` inside CANDIDATES_DIR. */
function candidatePath(root, id) {
  const fileName = `${id}${CANDIDATE_EXT}`;
  return path18.join(root, CANDIDATES_DIR, fileName);
}
2458
/** Path of the archived candidate file for `id` inside CANDIDATES_ARCHIVE_DIR. */
function archivePath(root, id) {
  const fileName = `${id}${CANDIDATE_EXT}`;
  return path18.join(root, CANDIDATES_ARCHIVE_DIR, fileName);
}
2461
/**
 * Persist a draft as a new candidate file and return the stored candidate.
 *
 * The candidate gets a fresh id and a `generatedAt` timestamp;
 * `sourceStates` and `schemaViolations` are included only when the draft
 * provides them.
 */
async function writeCandidate(root, draft) {
  const { title, slug, summary, sources, body } = draft;
  const candidate = {
    id: buildCandidateId(slug),
    title,
    slug,
    summary,
    sources,
    body,
    generatedAt: (/* @__PURE__ */ new Date()).toISOString()
  };
  if (draft.sourceStates) {
    candidate.sourceStates = draft.sourceStates;
  }
  if (draft.schemaViolations) {
    candidate.schemaViolations = draft.schemaViolations;
  }
  const serialized = JSON.stringify(candidate, null, 2);
  await atomicWrite(candidatePath(root, candidate.id), serialized);
  return candidate;
}
2476
/**
 * Report `message` as an error, set the process exit code to 1 and return
 * null so callers can `return failWithError(...)`.
 */
function failWithError(message) {
  const formatted = error(message);
  status("!", formatted);
  process.exitCode = 1;
  return null;
}
2481
/**
 * Read the candidate `id`; when it cannot be loaded, report
 * "Candidate not found" through failWithError and return null.
 */
async function loadCandidateOrFail(root, id) {
  const candidate = await readCandidate(root, id);
  return candidate ? candidate : failWithError(`Candidate not found: ${id}`);
}
2486
/**
 * Read the candidate `id`; when it cannot be loaded, report through
 * failWithError that it was removed by another process and return null.
 */
async function loadCandidateUnderLockOrFail(root, id) {
  const candidate = await readCandidate(root, id);
  if (candidate) return candidate;
  return failWithError(`Candidate ${id} was removed by another process during review.`);
}
2493
/**
 * Load and validate the candidate file for `id`.
 *
 * @returns {Promise<object|null>} the parsed candidate, or null when the
 *   file yields no content, is not valid JSON, or fails isValidCandidate.
 */
async function readCandidate(root, id) {
  const raw = await safeReadFile(candidatePath(root, id));
  if (!raw) return null;
  try {
    const parsed = JSON.parse(raw);
    return isValidCandidate(parsed) ? parsed : null;
  } catch {
    // Unparseable candidate files are treated as absent.
    return null;
  }
}
2504
/**
 * Shape check for a parsed candidate: must be an object with string `id`,
 * `title`, `slug` and `body`, and an array `sources`.
 */
function isValidCandidate(value) {
  if (!value || typeof value !== "object") return false;
  const requiredStrings = ["id", "title", "slug", "body"];
  const hasStrings = requiredStrings.every((field) => typeof value[field] === "string");
  return hasStrings && Array.isArray(value.sources);
}
2509
/**
 * List all valid pending candidates under `root`, oldest first.
 *
 * Only regular files ending in CANDIDATE_EXT are considered; files that
 * readCandidate rejects are skipped. Returns `[]` when the candidates
 * directory does not exist.
 *
 * Candidates are ordered by `generatedAt`. isValidCandidate does not require
 * that field, so a candidate without a string timestamp is compared as ""
 * (sorting first) instead of throwing on `localeCompare`.
 *
 * @param {string} root
 * @returns {Promise<object[]>}
 */
async function listCandidates(root) {
  const dir = path18.join(root, CANDIDATES_DIR);
  if (!existsSync5(dir)) return [];
  const entries = await readdir6(dir, { withFileTypes: true });
  const candidates = [];
  for (const entry of entries) {
    if (!entry.isFile() || !entry.name.endsWith(CANDIDATE_EXT)) continue;
    const id = entry.name.slice(0, -CANDIDATE_EXT.length);
    const candidate = await readCandidate(root, id);
    if (candidate) candidates.push(candidate);
  }
  const timestampOf = (candidate) =>
    typeof candidate.generatedAt === "string" ? candidate.generatedAt : "";
  candidates.sort((a, b) => timestampOf(a).localeCompare(timestampOf(b)));
  return candidates;
}
1506
- function generateSwapAlias(title) {
1507
- for (const conjunction of SWAP_CONJUNCTIONS) {
1508
- const index = title.toLowerCase().indexOf(conjunction);
1509
- if (index === -1) continue;
1510
- const before = title.slice(0, index);
1511
- const after = title.slice(index + conjunction.length);
1512
- const originalConjunction = title.slice(index, index + conjunction.length);
1513
- return `${after}${originalConjunction}${before}`;
2523
/** Number of valid pending candidates under `root`. */
async function countCandidates(root) {
  const { length } = await listCandidates(root);
  return length;
}
2527
/**
 * Delete the candidate file for `id`.
 *
 * The file is unlinked directly rather than checked for existence first, so
 * a file removed by another process between a check and the unlink cannot
 * make this call throw.
 *
 * @param {string} root
 * @param {string} id
 * @returns {Promise<boolean>} true when a file was removed, false when there
 *   was no such file. Other filesystem errors propagate.
 */
async function deleteCandidate(root, id) {
  const filePath = candidatePath(root, id);
  try {
    await unlink2(filePath);
    return true;
  } catch (err) {
    if (err && err.code === "ENOENT") return false;
    throw err;
  }
}
2533
/**
 * Move the candidate file for `id` into the archive directory, creating the
 * directory if needed.
 *
 * A plain rename is tried first. If it fails (for example because source and
 * target are on different filesystems), the file is copied and the source
 * removed. When the source cannot be read for that fallback, the original
 * rename error is rethrown instead of writing an empty archive file and
 * deleting the source.
 *
 * @param {string} root
 * @param {string} id
 * @returns {Promise<boolean>} false when there is no such candidate file,
 *   true once it has been archived
 */
async function archiveCandidate(root, id) {
  const sourcePath = candidatePath(root, id);
  if (!existsSync5(sourcePath)) return false;
  const target = archivePath(root, id);
  await mkdir5(path18.dirname(target), { recursive: true });
  try {
    await rename3(sourcePath, target);
  } catch (renameError) {
    const raw = await safeReadFile(sourcePath);
    // Nothing readable to copy: surface the real failure, keep the source.
    if (!raw) throw renameError;
    await writeFile4(target, raw, "utf-8");
    await unlink2(sourcePath);
  }
  return true;
}
1517
- function generateAbbreviation(title) {
1518
- const words = title.split(/\s+/);
1519
- if (words.length < ABBREVIATION_MIN_WORDS) return null;
1520
- const abbreviation = words.map((w) => w[0].toUpperCase()).join("");
1521
- if (abbreviation === title) return null;
1522
- return abbreviation;
2547
+
2548
+ // src/linter/rules.ts
2549
+ import { readdir as readdir7, readFile as readFile13 } from "fs/promises";
2550
+ import { existsSync as existsSync6 } from "fs";
2551
+ import path19 from "path";
2552
+ var MIN_BODY_LENGTH = 50;
2553
+ var WIKILINK_PATTERN2 = /\[\[([^\]]+)\]\]/g;
2554
+ var CITATION_PATTERN = /\^\[([^\]]+)\]/g;
2555
+ function findMatchesInContent(content, pattern) {
2556
+ const results = [];
2557
+ const lines = content.split("\n");
2558
+ for (let i = 0; i < lines.length; i++) {
2559
+ const matches = lines[i].matchAll(pattern);
2560
+ for (const match of matches) {
2561
+ results.push({ captured: match[1], line: i + 1 });
2562
+ }
2563
+ }
2564
+ return results;
1523
2565
  }
1524
- async function generateMOC(root) {
1525
- const conceptsPath = path12.join(root, CONCEPTS_DIR);
1526
- const pages = await loadConceptPages(conceptsPath);
1527
- const tagGroups = groupPagesByTag(pages);
1528
- const content = buildMOCContent(tagGroups);
1529
- await atomicWrite(path12.join(root, MOC_FILE), content);
2566
+ async function readMarkdownFiles(dirPath) {
2567
+ if (!existsSync6(dirPath)) return [];
2568
+ const entries = await readdir7(dirPath);
2569
+ const mdFiles = entries.filter((f) => f.endsWith(".md"));
2570
+ const results = await Promise.all(
2571
+ mdFiles.map(async (fileName) => {
2572
+ const filePath = path19.join(dirPath, fileName);
2573
+ const content = await readFile13(filePath, "utf-8");
2574
+ return { filePath, content };
2575
+ })
2576
+ );
2577
+ return results;
1530
2578
  }
1531
- async function loadConceptPages(conceptsPath) {
1532
- let files;
1533
- try {
1534
- files = await readdir4(conceptsPath);
1535
- } catch {
1536
- return [];
1537
- }
1538
- const pages = [];
1539
- for (const file of files) {
1540
- if (!file.endsWith(".md")) continue;
1541
- const content = await safeReadFile(path12.join(conceptsPath, file));
1542
- if (!content) continue;
1543
- const { meta } = parseFrontmatter(content);
1544
- if (meta.orphaned) continue;
1545
- const title = typeof meta.title === "string" ? meta.title : file.replace(/\.md$/, "");
1546
- const tags = Array.isArray(meta.tags) ? meta.tags : [];
1547
- pages.push({ title, tags });
2579
+ async function collectAllPages(root) {
2580
+ const conceptPages = await readMarkdownFiles(path19.join(root, CONCEPTS_DIR));
2581
+ const queryPages = await readMarkdownFiles(path19.join(root, QUERIES_DIR));
2582
+ return [...conceptPages, ...queryPages];
2583
+ }
2584
+ function buildPageSlugSet(pages) {
2585
+ const slugs = /* @__PURE__ */ new Set();
2586
+ for (const page of pages) {
2587
+ const baseName = path19.basename(page.filePath, ".md");
2588
+ slugs.add(baseName.toLowerCase());
1548
2589
  }
1549
- return pages;
2590
+ return slugs;
1550
2591
  }
1551
- function groupPagesByTag(pages) {
1552
- const groups = /* @__PURE__ */ new Map();
2592
+ async function checkBrokenWikilinks(root) {
2593
+ const pages = await collectAllPages(root);
2594
+ const existingSlugs = buildPageSlugSet(pages);
2595
+ const results = [];
1553
2596
  for (const page of pages) {
1554
- if (page.tags.length === 0) {
1555
- appendToGroup(groups, "Uncategorized", page.title);
1556
- continue;
1557
- }
1558
- for (const tag of page.tags) {
1559
- appendToGroup(groups, tag, page.title);
2597
+ for (const { captured, line } of findMatchesInContent(page.content, WIKILINK_PATTERN2)) {
2598
+ const linkSlug = slugify(captured);
2599
+ if (!existingSlugs.has(linkSlug)) {
2600
+ results.push({
2601
+ rule: "broken-wikilink",
2602
+ severity: "error",
2603
+ file: page.filePath,
2604
+ message: `Broken wikilink [[${captured}]] \u2014 no matching page found`,
2605
+ line
2606
+ });
2607
+ }
1560
2608
  }
1561
2609
  }
1562
- return groups;
2610
+ return results;
1563
2611
  }
1564
- function appendToGroup(groups, key, title) {
1565
- const existing = groups.get(key);
1566
- if (existing) {
1567
- existing.push(title);
1568
- } else {
1569
- groups.set(key, [title]);
2612
+ async function checkOrphanedPages(root) {
2613
+ const pages = await collectAllPages(root);
2614
+ const results = [];
2615
+ for (const page of pages) {
2616
+ const { meta } = parseFrontmatter(page.content);
2617
+ if (meta.orphaned === true) {
2618
+ results.push({
2619
+ rule: "orphaned-page",
2620
+ severity: "warning",
2621
+ file: page.filePath,
2622
+ message: `Page is marked as orphaned`
2623
+ });
2624
+ }
1570
2625
  }
2626
+ return results;
1571
2627
  }
1572
- function buildMOCContent(tagGroups) {
1573
- const lines = ["# Map of Content", ""];
1574
- const sortedTags = [...tagGroups.keys()].sort((a, b) => {
1575
- if (a === "Uncategorized") return 1;
1576
- if (b === "Uncategorized") return -1;
1577
- return a.localeCompare(b);
1578
- });
1579
- for (const tag of sortedTags) {
1580
- const titles = tagGroups.get(tag) ?? [];
1581
- lines.push(`## ${tag}`, "");
1582
- for (const title of titles.sort()) {
1583
- lines.push(`- [[${title}]]`);
2628
+ async function checkMissingSummaries(root) {
2629
+ const pages = await collectAllPages(root);
2630
+ const results = [];
2631
+ for (const page of pages) {
2632
+ const { meta } = parseFrontmatter(page.content);
2633
+ const summary = meta.summary;
2634
+ const isMissing = !summary || typeof summary === "string" && summary.trim() === "";
2635
+ if (isMissing) {
2636
+ results.push({
2637
+ rule: "missing-summary",
2638
+ severity: "warning",
2639
+ file: page.filePath,
2640
+ message: `Page has no summary in frontmatter`
2641
+ });
1584
2642
  }
1585
- lines.push("");
1586
2643
  }
1587
- return lines.join("\n");
2644
+ return results;
1588
2645
  }
1589
-
1590
- // src/utils/embeddings.ts
1591
- import { readFile as readFile7, readdir as readdir5 } from "fs/promises";
1592
- import { existsSync as existsSync3 } from "fs";
1593
- import path13 from "path";
1594
- function cosineSimilarity(a, b) {
1595
- if (a.length !== b.length || a.length === 0) return 0;
1596
- let dot = 0;
1597
- let magA = 0;
1598
- let magB = 0;
1599
- for (let i = 0; i < a.length; i++) {
1600
- dot += a[i] * b[i];
1601
- magA += a[i] * a[i];
1602
- magB += b[i] * b[i];
2646
+ async function checkDuplicateConcepts(root) {
2647
+ const pages = await collectAllPages(root);
2648
+ const titleMap = /* @__PURE__ */ new Map();
2649
+ for (const page of pages) {
2650
+ const { meta } = parseFrontmatter(page.content);
2651
+ const title = typeof meta.title === "string" ? meta.title : "";
2652
+ if (!title) continue;
2653
+ const normalizedTitle = title.toLowerCase().trim();
2654
+ const existing = titleMap.get(normalizedTitle) ?? [];
2655
+ existing.push(page.filePath);
2656
+ titleMap.set(normalizedTitle, existing);
1603
2657
  }
1604
- if (magA === 0 || magB === 0) return 0;
1605
- return dot / (Math.sqrt(magA) * Math.sqrt(magB));
1606
- }
1607
- function findTopK(queryVec, store, k) {
1608
- const scored = store.entries.map((entry) => ({
1609
- entry,
1610
- score: cosineSimilarity(queryVec, entry.vector)
1611
- }));
1612
- scored.sort((left, right) => right.score - left.score);
1613
- return scored.slice(0, k).map((item) => item.entry);
1614
- }
1615
- async function readEmbeddingStore(root) {
1616
- const filePath = path13.join(root, EMBEDDINGS_FILE);
1617
- if (!existsSync3(filePath)) return null;
1618
- const raw = await readFile7(filePath, "utf-8");
1619
- return JSON.parse(raw);
1620
- }
1621
- async function writeEmbeddingStore(root, store) {
1622
- const filePath = path13.join(root, EMBEDDINGS_FILE);
1623
- await atomicWrite(filePath, JSON.stringify(store, null, 2));
1624
- }
1625
- async function findRelevantPages(root, question) {
1626
- const store = await readEmbeddingStore(root);
1627
- if (!store || store.entries.length === 0) return [];
1628
- const activeModel = resolveEmbeddingModel();
1629
- if (store.model !== activeModel) {
1630
- warnStaleEmbeddingStore(store.model, activeModel);
1631
- return [];
2658
+ const results = [];
2659
+ for (const [title, files] of titleMap) {
2660
+ if (files.length <= 1) continue;
2661
+ for (const file of files) {
2662
+ results.push({
2663
+ rule: "duplicate-concept",
2664
+ severity: "error",
2665
+ file,
2666
+ message: `Duplicate title "${title}" \u2014 also in ${files.filter((f) => f !== file).join(", ")}`
2667
+ });
2668
+ }
1632
2669
  }
1633
- const queryVec = await getProvider().embed(question);
1634
- return findTopK(queryVec, store, EMBEDDING_TOP_K).map((entry) => ({
1635
- slug: entry.slug,
1636
- title: entry.title,
1637
- summary: entry.summary
1638
- }));
2670
+ return results;
1639
2671
  }
1640
- async function collectPageRecords(root) {
1641
- const records = [];
1642
- for (const dir of [CONCEPTS_DIR, QUERIES_DIR]) {
1643
- const absDir = path13.join(root, dir);
1644
- let files;
1645
- try {
1646
- files = await readdir5(absDir);
1647
- } catch {
1648
- continue;
1649
- }
1650
- for (const file of files.filter((f) => f.endsWith(".md"))) {
1651
- const content = await safeReadFile(path13.join(absDir, file));
1652
- const { meta } = parseFrontmatter(content);
1653
- if (meta.orphaned || typeof meta.title !== "string") continue;
1654
- records.push({
1655
- slug: file.replace(/\.md$/, ""),
1656
- title: meta.title,
1657
- summary: typeof meta.summary === "string" ? meta.summary : ""
2672
+ async function checkEmptyPages(root) {
2673
+ const pages = await collectAllPages(root);
2674
+ const results = [];
2675
+ for (const page of pages) {
2676
+ const { meta, body } = parseFrontmatter(page.content);
2677
+ const hasTitle = typeof meta.title === "string" && meta.title.trim() !== "";
2678
+ const isBodyEmpty = body.trim().length < MIN_BODY_LENGTH;
2679
+ if (hasTitle && isBodyEmpty) {
2680
+ results.push({
2681
+ rule: "empty-page",
2682
+ severity: "warning",
2683
+ file: page.filePath,
2684
+ message: `Page body is empty or too short (< ${MIN_BODY_LENGTH} chars)`
1658
2685
  });
1659
2686
  }
1660
2687
  }
1661
- return records;
2688
+ return results;
1662
2689
  }
1663
- function buildEmbeddingText(record) {
1664
- return record.summary ? `${record.title}
1665
-
1666
- ${record.summary}` : record.title;
2690
+ function stripSpanSuffix(entry) {
2691
+ const colonIdx = entry.indexOf(":");
2692
+ const hashIdx = entry.indexOf("#");
2693
+ const cuts = [colonIdx, hashIdx].filter((i) => i >= 0);
2694
+ if (cuts.length === 0) return entry;
2695
+ return entry.slice(0, Math.min(...cuts));
1667
2696
  }
1668
- async function embedPages(records, slugsToEmbed) {
1669
- const provider = getProvider();
1670
- const now = (/* @__PURE__ */ new Date()).toISOString();
1671
- const fresh = [];
1672
- for (const record of records) {
1673
- if (!slugsToEmbed.has(record.slug)) continue;
1674
- const vector = await provider.embed(buildEmbeddingText(record));
1675
- fresh.push({
1676
- slug: record.slug,
1677
- title: record.title,
1678
- summary: record.summary,
1679
- vector,
1680
- updatedAt: now
2697
+ async function checkLowConfidencePages(root) {
2698
+ const pages = await collectAllPages(root);
2699
+ const results = [];
2700
+ for (const page of pages) {
2701
+ const { meta } = parseFrontmatter(page.content);
2702
+ const { confidence } = parseProvenanceMetadata(meta);
2703
+ if (confidence === void 0 || confidence >= LOW_CONFIDENCE_THRESHOLD) continue;
2704
+ results.push({
2705
+ rule: "low-confidence",
2706
+ severity: "warning",
2707
+ file: page.filePath,
2708
+ message: `Page confidence ${confidence.toFixed(2)} is below ${LOW_CONFIDENCE_THRESHOLD}`
2709
+ });
2710
+ }
2711
+ return results;
2712
+ }
2713
+ async function checkContradictedPages(root) {
2714
+ const pages = await collectAllPages(root);
2715
+ const results = [];
2716
+ for (const page of pages) {
2717
+ const { meta } = parseFrontmatter(page.content);
2718
+ const { contradictedBy } = parseProvenanceMetadata(meta);
2719
+ if (!contradictedBy || contradictedBy.length === 0) continue;
2720
+ const slugs = contradictedBy.map((r) => r.slug).join(", ");
2721
+ results.push({
2722
+ rule: "contradicted-page",
2723
+ severity: "warning",
2724
+ file: page.filePath,
2725
+ message: `Page contradicts: ${slugs}`
2726
+ });
2727
+ }
2728
+ return results;
2729
+ }
2730
+ async function checkInferredWithoutCitations(root) {
2731
+ const pages = await collectAllPages(root);
2732
+ const results = [];
2733
+ for (const page of pages) {
2734
+ const { meta, body } = parseFrontmatter(page.content);
2735
+ const provenance = parseProvenanceMetadata(meta);
2736
+ const inferred = provenance.inferredParagraphs ?? countUncitedProseParagraphs(body);
2737
+ if (inferred <= MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS) continue;
2738
+ results.push({
2739
+ rule: "excess-inferred-paragraphs",
2740
+ severity: "warning",
2741
+ file: page.filePath,
2742
+ message: `Page has ${inferred} inferred paragraphs without citations (max ${MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS})`
1681
2743
  });
1682
2744
  }
1683
- return fresh;
1684
- }
1685
- var warnedStaleModels = /* @__PURE__ */ new Set();
1686
- function warnStaleEmbeddingStore(storedModel, activeModel) {
1687
- const key = `${storedModel}\u2192${activeModel}`;
1688
- if (warnedStaleModels.has(key)) return;
1689
- warnedStaleModels.add(key);
1690
- status(
1691
- "!",
1692
- warn(
1693
- `Embedding store was built with "${storedModel}" but active embedding model is "${activeModel}". Falling back to full-index selection. Run 'llmwiki compile' to rebuild embeddings.`
1694
- )
1695
- );
2745
+ return results;
1696
2746
  }
1697
- function resolveEmbeddingModel() {
1698
- const providerName = getActiveProviderName();
1699
- const configuredModel = process.env.LLMWIKI_EMBEDDING_MODEL?.trim();
1700
- if (configuredModel && (providerName === "openai" || providerName === "ollama")) {
1701
- return configuredModel;
2747
+ var PROSE_PARAGRAPH_LEAD = /^[A-Za-z]/;
2748
+ function countUncitedProseParagraphs(body) {
2749
+ const paragraphs = body.split(/\n\s*\n/);
2750
+ let count = 0;
2751
+ for (const block of paragraphs) {
2752
+ const trimmed = block.trim();
2753
+ if (trimmed.length === 0) continue;
2754
+ if (!PROSE_PARAGRAPH_LEAD.test(trimmed)) continue;
2755
+ if (CITATION_PATTERN.test(trimmed)) {
2756
+ CITATION_PATTERN.lastIndex = 0;
2757
+ continue;
2758
+ }
2759
+ CITATION_PATTERN.lastIndex = 0;
2760
+ count += 1;
1702
2761
  }
1703
- return EMBEDDING_MODELS[providerName] ?? EMBEDDING_MODELS.anthropic;
2762
+ return count;
1704
2763
  }
1705
- function mergeEntries(existing, fresh, liveSlugs) {
1706
- const bySlug = /* @__PURE__ */ new Map();
1707
- for (const entry of existing) {
1708
- if (liveSlugs.has(entry.slug)) bySlug.set(entry.slug, entry);
1709
- }
1710
- for (const entry of fresh) {
1711
- bySlug.set(entry.slug, entry);
2764
+ var COLON_SPAN_PATTERN = /^[^:#]+:(\d+)(?:-(\d+))?$/;
2765
+ var HASH_SPAN_PATTERN = /^[^:#]+#L(\d+)(?:-L(\d+))?$/;
2766
+ async function checkSchemaCrossLinks(root, schema) {
2767
+ const pages = await collectAllPages(root);
2768
+ const results = [];
2769
+ for (const page of pages) {
2770
+ const { meta, body } = parseFrontmatter(page.content);
2771
+ const kind = resolvePageKind(meta.kind, schema);
2772
+ const rule = schema.kinds[kind];
2773
+ if (rule.minWikilinks <= 0) continue;
2774
+ const linkCount = countWikilinks(body);
2775
+ if (linkCount >= rule.minWikilinks) continue;
2776
+ results.push({
2777
+ rule: "schema-cross-link-minimum",
2778
+ severity: "warning",
2779
+ file: page.filePath,
2780
+ message: `Page kind "${kind}" requires at least ${rule.minWikilinks} [[wikilinks]] but only ${linkCount} found.`
2781
+ });
1712
2782
  }
1713
- return Array.from(bySlug.values());
2783
+ return results;
1714
2784
  }
1715
- async function updateEmbeddings(root, changedSlugs) {
1716
- const records = await collectPageRecords(root);
1717
- const liveSlugs = new Set(records.map((r) => r.slug));
1718
- const embeddingModel = resolveEmbeddingModel();
1719
- const existingStore = await readEmbeddingStore(root);
1720
- const modelChanged = Boolean(existingStore && existingStore.model !== embeddingModel);
1721
- const toEmbed = new Set(changedSlugs.filter((slug) => liveSlugs.has(slug)));
1722
- const previousEntries = modelChanged ? [] : existingStore?.entries ?? [];
1723
- if (!existingStore || modelChanged) {
1724
- for (const record of records) toEmbed.add(record.slug);
2785
+ function checkPageCrossLinks(content, filePath, schema) {
2786
+ const { meta, body } = parseFrontmatter(content);
2787
+ const kind = resolvePageKind(meta.kind, schema);
2788
+ const rule = schema.kinds[kind];
2789
+ if (rule.minWikilinks <= 0) return [];
2790
+ const linkCount = countWikilinks(body);
2791
+ if (linkCount >= rule.minWikilinks) return [];
2792
+ return [
2793
+ {
2794
+ rule: "schema-cross-link-minimum",
2795
+ severity: "warning",
2796
+ file: filePath,
2797
+ message: `Page kind "${kind}" requires at least ${rule.minWikilinks} [[wikilinks]] but only ${linkCount} found.`
2798
+ }
2799
+ ];
2800
+ }
2801
+ function parseLineRange(entry) {
2802
+ const colonMatch = COLON_SPAN_PATTERN.exec(entry);
2803
+ if (colonMatch) {
2804
+ const start = Number(colonMatch[1]);
2805
+ const end = colonMatch[2] !== void 0 ? Number(colonMatch[2]) : start;
2806
+ return { start, end };
1725
2807
  }
1726
- if (!modelChanged && toEmbed.size === 0 && previousEntries.every((e) => liveSlugs.has(e.slug))) {
1727
- return;
2808
+ const hashMatch = HASH_SPAN_PATTERN.exec(entry);
2809
+ if (hashMatch) {
2810
+ const start = Number(hashMatch[1]);
2811
+ const end = hashMatch[2] !== void 0 ? Number(hashMatch[2]) : start;
2812
+ return { start, end };
1728
2813
  }
1729
- const freshEntries = await embedPages(records, toEmbed);
1730
- const mergedEntries = mergeEntries(previousEntries, freshEntries, liveSlugs);
1731
- const dimensions = mergedEntries[0]?.vector.length ?? 0;
1732
- const store = {
1733
- version: 1,
1734
- model: embeddingModel,
1735
- dimensions,
1736
- entries: mergedEntries
1737
- };
1738
- await writeEmbeddingStore(root, store);
1739
- status("*", dim(`Embeddings updated (${mergedEntries.length} pages).`));
1740
- }
1741
-
1742
- // src/compiler/candidates.ts
1743
- import { readdir as readdir6, rename as rename3, unlink as unlink2, writeFile as writeFile4, mkdir as mkdir5 } from "fs/promises";
1744
- import { existsSync as existsSync4 } from "fs";
1745
- import path14 from "path";
1746
- import { randomBytes } from "crypto";
1747
- var ID_SUFFIX_BYTES = 4;
1748
- var CANDIDATE_EXT = ".json";
1749
- function buildCandidateId(slug) {
1750
- const suffix = randomBytes(ID_SUFFIX_BYTES).toString("hex");
1751
- return `${slug}-${suffix}`;
1752
- }
1753
- function candidatePath(root, id) {
1754
- return path14.join(root, CANDIDATES_DIR, `${id}${CANDIDATE_EXT}`);
1755
- }
1756
- function archivePath(root, id) {
1757
- return path14.join(root, CANDIDATES_ARCHIVE_DIR, `${id}${CANDIDATE_EXT}`);
1758
- }
1759
- async function writeCandidate(root, draft) {
1760
- const candidate = {
1761
- id: buildCandidateId(draft.slug),
1762
- title: draft.title,
1763
- slug: draft.slug,
1764
- summary: draft.summary,
1765
- sources: draft.sources,
1766
- body: draft.body,
1767
- generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
1768
- ...draft.sourceStates ? { sourceStates: draft.sourceStates } : {}
1769
- };
1770
- await atomicWrite(candidatePath(root, candidate.id), JSON.stringify(candidate, null, 2));
1771
- return candidate;
1772
- }
1773
- function failWithError(message) {
1774
- status("!", error(message));
1775
- process.exitCode = 1;
1776
2814
  return null;
1777
2815
  }
1778
- async function loadCandidateOrFail(root, id) {
1779
- const candidate = await readCandidate(root, id);
1780
- if (!candidate) return failWithError(`Candidate not found: ${id}`);
1781
- return candidate;
1782
- }
1783
- async function loadCandidateUnderLockOrFail(root, id) {
1784
- const candidate = await readCandidate(root, id);
1785
- if (!candidate) {
1786
- return failWithError(`Candidate ${id} was removed by another process during review.`);
1787
- }
1788
- return candidate;
2816
+ function countLines(content) {
2817
+ if (content.length === 0) return 0;
2818
+ return content.split("\n").length;
1789
2819
  }
1790
- async function readCandidate(root, id) {
1791
- const raw = await safeReadFile(candidatePath(root, id));
1792
- if (!raw) return null;
1793
- try {
1794
- const parsed = JSON.parse(raw);
1795
- if (!isValidCandidate(parsed)) return null;
1796
- return parsed;
1797
- } catch {
1798
- return null;
2820
+ async function checkBrokenCitations(root) {
2821
+ const pages = await collectAllPages(root);
2822
+ const sourcesDir = path19.join(root, SOURCES_DIR);
2823
+ const results = [];
2824
+ const lineCountCache = /* @__PURE__ */ new Map();
2825
+ for (const page of pages) {
2826
+ for (const { captured, line } of findMatchesInContent(page.content, CITATION_PATTERN)) {
2827
+ await collectBrokenForMarker(captured, line, page.filePath, sourcesDir, lineCountCache, results);
2828
+ }
1799
2829
  }
2830
+ return results;
1800
2831
  }
1801
- function isValidCandidate(value) {
1802
- if (!value || typeof value !== "object") return false;
1803
- const candidate = value;
1804
- return typeof candidate.id === "string" && typeof candidate.title === "string" && typeof candidate.slug === "string" && typeof candidate.body === "string" && Array.isArray(candidate.sources);
1805
- }
1806
- async function listCandidates(root) {
1807
- const dir = path14.join(root, CANDIDATES_DIR);
1808
- if (!existsSync4(dir)) return [];
1809
- const entries = await readdir6(dir, { withFileTypes: true });
1810
- const candidates = [];
1811
- for (const entry of entries) {
1812
- if (!entry.isFile() || !entry.name.endsWith(CANDIDATE_EXT)) continue;
1813
- const id = entry.name.slice(0, -CANDIDATE_EXT.length);
1814
- const candidate = await readCandidate(root, id);
1815
- if (candidate) candidates.push(candidate);
2832
+ async function collectBrokenForMarker(captured, line, pageFile, sourcesDir, lineCountCache, out) {
2833
+ for (const part of captured.split(",")) {
2834
+ const trimmed = part.trim();
2835
+ if (trimmed.length === 0) continue;
2836
+ const filename = stripSpanSuffix(trimmed);
2837
+ const citedPath = path19.join(sourcesDir, filename);
2838
+ if (!existsSync6(citedPath)) {
2839
+ out.push({
2840
+ rule: "broken-citation",
2841
+ severity: "error",
2842
+ file: pageFile,
2843
+ message: `Broken citation ^[${filename}] \u2014 source file not found`,
2844
+ line
2845
+ });
2846
+ continue;
2847
+ }
2848
+ const range = parseLineRange(trimmed);
2849
+ if (range === null) continue;
2850
+ const lineCount = await resolveLineCount(citedPath, filename, lineCountCache);
2851
+ if (range.end <= lineCount) continue;
2852
+ out.push({
2853
+ rule: "broken-citation",
2854
+ severity: "error",
2855
+ file: pageFile,
2856
+ message: `Claim-level span ^[${trimmed}] is out of bounds (source has only ${lineCount} lines)`,
2857
+ line
2858
+ });
1816
2859
  }
1817
- candidates.sort((a, b) => a.generatedAt.localeCompare(b.generatedAt));
1818
- return candidates;
1819
- }
1820
- async function countCandidates(root) {
1821
- const candidates = await listCandidates(root);
1822
- return candidates.length;
1823
2860
  }
1824
- async function deleteCandidate(root, id) {
1825
- const filePath = candidatePath(root, id);
1826
- if (!existsSync4(filePath)) return false;
1827
- await unlink2(filePath);
1828
- return true;
2861
+ async function resolveLineCount(citedPath, filename, cache) {
2862
+ const cached = cache.get(filename);
2863
+ if (cached !== void 0) return cached;
2864
+ const content = await safeReadFile(citedPath);
2865
+ const lineCount = countLines(content);
2866
+ cache.set(filename, lineCount);
2867
+ return lineCount;
1829
2868
  }
1830
- async function archiveCandidate(root, id) {
1831
- const sourcePath = candidatePath(root, id);
1832
- if (!existsSync4(sourcePath)) return false;
1833
- const target = archivePath(root, id);
1834
- await mkdir5(path14.dirname(target), { recursive: true });
1835
- try {
1836
- await rename3(sourcePath, target);
1837
- } catch {
1838
- const raw = await safeReadFile(sourcePath);
1839
- await writeFile4(target, raw, "utf-8");
1840
- await unlink2(sourcePath);
2869
+ async function checkMalformedClaimCitations(root) {
2870
+ const pages = await collectAllPages(root);
2871
+ const results = [];
2872
+ for (const page of pages) {
2873
+ for (const { captured, line } of findMatchesInContent(page.content, CITATION_PATTERN)) {
2874
+ for (const part of captured.split(",")) {
2875
+ if (!isMalformedCitationEntry(part)) continue;
2876
+ results.push({
2877
+ rule: "malformed-claim-citation",
2878
+ severity: "error",
2879
+ file: page.filePath,
2880
+ message: `Malformed claim citation ^[${captured}] \u2014 expected file.md, file.md:N-N, or file.md#LN-LN`,
2881
+ line
2882
+ });
2883
+ }
2884
+ }
1841
2885
  }
1842
- return true;
2886
+ return results;
1843
2887
  }
1844
2888
 
1845
2889
  // src/compiler/page-renderer.ts
1846
- import { readdir as readdir7 } from "fs/promises";
1847
- import path15 from "path";
2890
+ import { readdir as readdir8 } from "fs/promises";
2891
+ import path20 from "path";
1848
2892
 
1849
2893
  // src/compiler/provenance.ts
1850
2894
  function addProvenanceMeta(fields, concept) {
@@ -1873,8 +2917,8 @@ function reportContradictionWarnings(conceptTitle, concept) {
1873
2917
 
1874
2918
  // src/compiler/page-renderer.ts
1875
2919
  var RELATED_PAGE_CONTEXT_LIMIT = 5;
1876
- async function renderMergedPageContent(root, entry) {
1877
- const pagePath = path15.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
2920
+ async function renderMergedPageContent(root, entry, schema) {
2921
+ const pagePath = path20.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
1878
2922
  const existingPage = await safeReadFile(pagePath);
1879
2923
  const relatedPages = await loadRelatedPages(root, entry.slug);
1880
2924
  const system = buildPagePrompt(
@@ -1889,14 +2933,14 @@ async function renderMergedPageContent(root, entry) {
1889
2933
  { role: "user", content: `Write the wiki page for "${entry.concept.concept}".` }
1890
2934
  ]
1891
2935
  });
1892
- const frontmatter = buildMergedFrontmatter(entry, existingPage);
2936
+ const frontmatter = buildMergedFrontmatter(entry, existingPage, schema);
1893
2937
  reportContradictionWarnings(entry.concept.concept, entry.concept);
1894
2938
  return `${frontmatter}
1895
2939
 
1896
2940
  ${pageBody}
1897
2941
  `;
1898
2942
  }
1899
- function buildMergedFrontmatter(entry, existingPage) {
2943
+ function buildMergedFrontmatter(entry, existingPage, schema) {
1900
2944
  const now = (/* @__PURE__ */ new Date()).toISOString();
1901
2945
  const existing = existingPage ? parseFrontmatter(existingPage) : null;
1902
2946
  const createdAt = existing?.meta.createdAt && typeof existing.meta.createdAt === "string" ? existing.meta.createdAt : now;
@@ -1904,6 +2948,7 @@ function buildMergedFrontmatter(entry, existingPage) {
1904
2948
  title: entry.concept.concept,
1905
2949
  summary: entry.concept.summary,
1906
2950
  sources: entry.sourceFiles,
2951
+ kind: schema.defaultKind,
1907
2952
  createdAt,
1908
2953
  updatedAt: now
1909
2954
  };
@@ -1912,17 +2957,17 @@ function buildMergedFrontmatter(entry, existingPage) {
1912
2957
  return buildFrontmatter(frontmatterFields);
1913
2958
  }
1914
2959
  async function loadRelatedPages(root, excludeSlug) {
1915
- const conceptsPath = path15.join(root, CONCEPTS_DIR);
2960
+ const conceptsPath = path20.join(root, CONCEPTS_DIR);
1916
2961
  let files;
1917
2962
  try {
1918
- files = await readdir7(conceptsPath);
2963
+ files = await readdir8(conceptsPath);
1919
2964
  } catch {
1920
2965
  return "";
1921
2966
  }
1922
2967
  const related = files.filter((f) => f.endsWith(".md") && f !== `${excludeSlug}.md`).slice(0, RELATED_PAGE_CONTEXT_LIMIT);
1923
2968
  const contents = [];
1924
2969
  for (const f of related) {
1925
- const content = await safeReadFile(path15.join(conceptsPath, f));
2970
+ const content = await safeReadFile(path20.join(conceptsPath, f));
1926
2971
  if (!content) continue;
1927
2972
  const { meta } = parseFrontmatter(content);
1928
2973
  if (meta.orphaned) continue;
@@ -1962,7 +3007,7 @@ function bucketChanges(changes) {
1962
3007
  unchanged: changes.filter((c) => c.status === "unchanged")
1963
3008
  };
1964
3009
  }
1965
- async function generatePagesPhase(root, extractions, frozenSlugs, options) {
3010
+ async function generatePagesPhase(root, extractions, frozenSlugs, schema, options) {
1966
3011
  const merged = mergeExtractions(extractions, frozenSlugs);
1967
3012
  const sourceStates = options.review ? await buildExtractionSourceStates(root, extractions) : {};
1968
3013
  const limit = pLimit(COMPILE_CONCURRENCY);
@@ -1970,7 +3015,7 @@ async function generatePagesPhase(root, extractions, frozenSlugs, options) {
1970
3015
  const candidates = [];
1971
3016
  const pages = await Promise.all(
1972
3017
  merged.map((entry) => limit(async () => {
1973
- const result = await generateMergedPage(root, entry, options, sourceStates);
3018
+ const result = await generateMergedPage(root, entry, schema, options, sourceStates);
1974
3019
  if (result.error) errors.push(result.error);
1975
3020
  if (result.candidateId) candidates.push(result.candidateId);
1976
3021
  return entry;
@@ -2016,12 +3061,24 @@ function summarizeCompile(buckets, generation, extractions, options) {
2016
3061
  return baseResult;
2017
3062
  }
2018
3063
  async function runCompilePipeline(root, options) {
3064
+ const schema = await loadSchema(root);
3065
+ reportSchemaStatus(schema);
2019
3066
  const state = await readState(root);
2020
3067
  const changes = await detectChanges(root, state);
2021
3068
  augmentWithAffectedSources(changes, findAffectedSources(state, changes));
2022
3069
  const buckets = bucketChanges(changes);
2023
3070
  if (buckets.toCompile.length === 0 && buckets.deleted.length === 0) {
2024
3071
  status("\u2713", success("Nothing to compile \u2014 all sources up to date."));
3072
+ if (!options.review) {
3073
+ const emptyGeneration = { pages: [], errors: [], candidates: [] };
3074
+ await generateSeedPages(root, schema, emptyGeneration);
3075
+ await finalizeWiki(root, emptyGeneration.pages);
3076
+ return {
3077
+ ...emptyCompileResult(),
3078
+ skipped: buckets.unchanged.length,
3079
+ errors: emptyGeneration.errors
3080
+ };
3081
+ }
2025
3082
  return { ...emptyCompileResult(), skipped: buckets.unchanged.length };
2026
3083
  }
2027
3084
  printChangesSummary(changes);
@@ -2034,17 +3091,23 @@ async function runCompilePipeline(root, options) {
2034
3091
  if (!options.review) {
2035
3092
  await freezeFailedExtractions(root, extractions, frozenSlugs);
2036
3093
  }
2037
- const generation = await generatePagesPhase(root, extractions, frozenSlugs, options);
3094
+ const generation = await generatePagesPhase(root, extractions, frozenSlugs, schema, options);
2038
3095
  if (!options.review) {
2039
3096
  await persistExtractionStates(root, extractions);
2040
3097
  if (frozenSlugs.size > 0) {
2041
3098
  await orphanUnownedFrozenPages(root, frozenSlugs);
2042
3099
  }
2043
3100
  await persistFrozenSlugs(root, frozenSlugs, extractions);
3101
+ await generateSeedPages(root, schema, generation);
2044
3102
  await finalizeWiki(root, generation.pages);
2045
3103
  }
2046
3104
  return summarizeCompile(buckets, generation, extractions, options);
2047
3105
  }
3106
+ function reportSchemaStatus(schema) {
3107
+ if (schema.loadedFrom) {
3108
+ status("i", dim(`Schema: ${schema.loadedFrom}`));
3109
+ }
3110
+ }
2048
3111
  function augmentWithAffectedSources(changes, affected) {
2049
3112
  for (const file of affected) {
2050
3113
  status("~", info(`${file} [affected by shared concept]`));
@@ -2105,9 +3168,9 @@ function printChangesSummary(changes) {
2105
3168
  }
2106
3169
  async function extractForSource(root, sourceFile) {
2107
3170
  status("*", info(`Extracting: ${sourceFile}`));
2108
- const sourcePath = path16.join(root, SOURCES_DIR, sourceFile);
2109
- const sourceContent = await readFile8(sourcePath, "utf-8");
2110
- const existingIndex = await safeReadFile(path16.join(root, INDEX_FILE));
3171
+ const sourcePath = path21.join(root, SOURCES_DIR, sourceFile);
3172
+ const sourceContent = await readFile14(sourcePath, "utf-8");
3173
+ const existingIndex = await safeReadFile(path21.join(root, INDEX_FILE));
2111
3174
  const concepts = await extractConcepts(sourceContent, existingIndex);
2112
3175
  if (concepts.length > 0) {
2113
3176
  const names = concepts.map((c) => c.concept).join(", ");
@@ -2165,27 +3228,77 @@ ${result.sourceContent}`
2165
3228
  }
2166
3229
  return Array.from(bySlug.values());
2167
3230
  }
2168
/**
 * Render the merged page for a concept entry, then either queue it as a
 * review candidate (`options.review`) or write it to the concepts directory.
 *
 * @returns {Promise<object>} the review-candidate result in review mode,
 *   otherwise `{ error }` where `error` is undefined when the write succeeded.
 */
async function generateMergedPage(root, entry, schema, options, sourceStates) {
  const renderedPage = await renderMergedPageContent(root, entry, schema);
  if (!options.review) {
    const targetPath = path21.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
    const writeError = await writePageIfValid(targetPath, renderedPage, entry.concept.concept);
    return { error: writeError ?? void 0 };
  }
  return await persistReviewCandidate(root, entry, renderedPage, sourceStates, schema);
}
2177
/**
 * Store a generated page as a pending review candidate instead of writing it
 * into the wiki. Cross-link schema violations found for the page are attached
 * to the candidate (omitted when there are none).
 *
 * @returns {Promise<{ candidateId: string }>}
 */
async function persistReviewCandidate(root, entry, fullPage, sourceStates, schema) {
  // The page is not on disk yet, so the checker is given the path it would have.
  const violations = checkPageCrossLinks(fullPage, `wiki/concepts/${entry.slug}.md`, schema);
  const payload = {
    title: entry.concept.concept,
    slug: entry.slug,
    summary: entry.concept.summary,
    sources: entry.sourceFiles,
    body: fullPage,
    sourceStates: pickStatesForSources(sourceStates, entry.sourceFiles),
    schemaViolations: violations.length > 0 ? violations : void 0
  };
  const { id } = await writeCandidate(root, payload);
  status("?", info(`Candidate ready: ${id} (${entry.slug})`));
  return { candidateId: id };
}
3255
/**
 * Generate every seed page declared in the schema, one after another.
 * Any error returned for a seed page is appended to `generation.errors`.
 */
async function generateSeedPages(root, schema, generation) {
  const { seedPages } = schema;
  for (let index = 0; index < seedPages.length; index += 1) {
    const failure = await generateSingleSeedPage(root, schema, seedPages[index]);
    if (failure) generation.errors.push(failure);
  }
}
3262
/**
 * Generate one schema-declared seed page with the LLM and write it to the
 * concepts directory.
 *
 * An existing page's string `createdAt` is carried over; otherwise the
 * current time is used. `updatedAt` is always the current time.
 *
 * @returns {Promise<*>} whatever `writePageIfValid` returns for the page.
 */
async function generateSingleSeedPage(root, schema, seed) {
  const pagePath = path21.join(root, CONCEPTS_DIR, `${slugify(seed.title)}.md`);
  const relatedContent = await loadSeedRelatedPages(root, seed.relatedSlugs ?? []);
  const system = buildSeedPagePrompt(seed, schema.kinds[seed.kind], relatedContent);
  const pageBody = await callClaude({
    system,
    messages: [{ role: "user", content: `Write the ${seed.kind} page titled "${seed.title}".` }]
  });

  const now = (/* @__PURE__ */ new Date()).toISOString();
  const existing = await safeReadFile(pagePath);
  const priorMeta = existing ? parseFrontmatter(existing).meta : null;
  const priorCreatedAt = priorMeta?.createdAt;

  const frontmatterFields = {
    title: seed.title,
    summary: seed.summary,
    sources: [],
    kind: seed.kind,
    createdAt: typeof priorCreatedAt === "string" ? priorCreatedAt : now,
    updatedAt: now
  };
  addObsidianMeta(frontmatterFields, seed.title, []);
  const frontmatter = buildFrontmatter(frontmatterFields);

  const pageText = `${frontmatter}\n\n${pageBody}\n`;
  return await writePageIfValid(pagePath, pageText, seed.title);
}
3292
/**
 * Load the concept pages named by `slugs` and join their contents with a
 * `---` separator, for use as context when generating a seed page.
 *
 * The reads are independent, so they run concurrently. `Promise.all` keeps
 * the results in `slugs` order, and pages with no content are dropped.
 *
 * @param {string} root - wiki root directory.
 * @param {string[]} slugs - slugs of the related concept pages.
 * @returns {Promise<string>} joined page contents, or "" when nothing was loaded.
 */
async function loadSeedRelatedPages(root, slugs) {
  if (slugs.length === 0) return "";
  const pages = await Promise.all(
    slugs.map((slug) => safeReadFile(path21.join(root, CONCEPTS_DIR, `${slug}.md`)))
  );
  return pages.filter((content) => Boolean(content)).join("\n\n---\n\n");
}
2189
3302
  async function extractConcepts(sourceContent, existingIndex) {
2190
3303
  const system = buildExtractionPrompt(sourceContent, existingIndex);
2191
3304
  const rawOutput = await callClaude({
@@ -2223,7 +3336,7 @@ async function persistSourceState(root, sourcePath, sourceFile, concepts) {
2223
3336
 
2224
3337
  // src/commands/compile.ts
2225
3338
  async function compileCommand(options = {}) {
2226
- if (!existsSync5(SOURCES_DIR)) {
3339
+ if (!existsSync7(SOURCES_DIR)) {
2227
3340
  status(
2228
3341
  "!",
2229
3342
  warn("No sources found. Run `llmwiki ingest <url>` first.")
@@ -2234,8 +3347,8 @@ async function compileCommand(options = {}) {
2234
3347
  }
2235
3348
 
2236
3349
  // src/commands/query.ts
2237
- import { existsSync as existsSync6 } from "fs";
2238
- import path17 from "path";
3350
+ import { existsSync as existsSync8 } from "fs";
3351
+ import path22 from "path";
2239
3352
  var PAGE_DIRS = [CONCEPTS_DIR, QUERIES_DIR];
2240
3353
  var PAGE_SELECTION_TOOL = {
2241
3354
  name: "select_pages",
@@ -2283,16 +3396,92 @@ ${indexContent}`;
2283
3396
/**
 * Format candidate pages as a markdown bullet list, one
 * `- **slug**: title — summary` line per candidate.
 *
 * @param {{ slug: string, title: string, summary: string }[]} candidates
 * @returns {string}
 */
function buildFilteredIndex(candidates) {
  const lines = [];
  for (const { slug, title, summary } of candidates) {
    lines.push(`- **${slug}**: ${title} \u2014 ${summary}`);
  }
  return lines.join("\n");
}
2286
/**
 * Choose the wiki pages used to answer `question`.
 *
 * Strategies are tried in order:
 *   1. chunk-level retrieval (`trySelectViaChunks`);
 *   2. LLM selection over an index filtered to pre-ranked candidate pages;
 *   3. LLM selection over the full index file, slugifying the returned names.
 *
 * The non-chunk paths always return an empty `chunks` array.
 */
async function selectRelevantPages(root, question, debug) {
  const viaChunks = await trySelectViaChunks(root, question, debug);
  if (viaChunks) return viaChunks;

  const candidates = await tryFindRelevantPages(root, question);
  if (candidates.length === 0) {
    const indexContent = await safeReadFile(path22.join(root, INDEX_FILE));
    const fromIndex = await selectPages(question, indexContent);
    return {
      pages: fromIndex.pages.map((p) => slugify(p)),
      rawPages: fromIndex.pages,
      reasoning: fromIndex.reasoning,
      chunks: []
    };
  }

  const fromCandidates = await selectPages(question, buildFilteredIndex(candidates));
  return {
    pages: fromCandidates.pages,
    rawPages: fromCandidates.pages,
    reasoning: fromCandidates.reasoning,
    chunks: []
  };
}
3412
/**
 * Select pages through chunk-level retrieval: fetch candidate chunks, rerank
 * them with BM25, keep the top `CHUNK_RERANK_KEEP`, and collapse them into at
 * most `QUERY_PAGE_LIMIT` distinct page slugs.
 *
 * @returns {Promise<object|null>} the selection, or null when no chunks were
 *   found. `debug` is populated only when the `debug` flag is truthy.
 */
async function trySelectViaChunks(root, question, debug) {
  const ranked = await tryFindRelevantChunks(root, question);
  if (ranked.length === 0) return null;

  const rerankInput = ranked.map(({ chunk, score }) => ({ text: chunk.text, baseScore: score, chunk }));
  const kept = rerankWithBm25(question, rerankInput).slice(0, CHUNK_RERANK_KEEP);
  const keptChunks = kept.map((k) => k.candidate.chunk);
  const reorderingHappened = wasReordered(ranked, keptChunks);

  const chunkCitations = toChunkCitations(kept);
  const pageSlugs = collapseToPages(chunkCitations, QUERY_PAGE_LIMIT);
  const reasoning = buildChunkReasoning(chunkCitations, pageSlugs);
  const debugInfo = debug ? buildDebug(chunkCitations, pageSlugs, reorderingHappened) : void 0;

  return {
    pages: pageSlugs,
    rawPages: pageSlugs,
    reasoning,
    chunks: chunkCitations,
    debug: debugInfo
  };
}
3432
/**
 * Report whether the chunk order changed between the original ranking and the
 * kept list. Only the overlapping prefix is compared, by object identity.
 *
 * @param {{ chunk: object }[]} before - ranked entries wrapping a chunk.
 * @param {object[]} after - chunk objects in their new order.
 * @returns {boolean}
 */
function wasReordered(before, after) {
  const overlap = Math.min(before.length, after.length);
  return before.slice(0, overlap).some((entry, position) => entry.chunk !== after[position]);
}
3439
/**
 * Flatten reranked entries (`{ candidate: { chunk }, score }`) into citation
 * records carrying the chunk's slug, title, index and text plus the entry's score.
 */
function toChunkCitations(ranked) {
  const citations = [];
  for (const entry of ranked) {
    const { slug, title, chunkIndex, text } = entry.candidate.chunk;
    citations.push({ slug, title, chunkIndex, score: entry.score, text });
  }
  return citations;
}
3448
/**
 * Collapse chunk citations into a list of distinct page slugs, keeping the
 * order in which each slug first appears and stopping at `limit` slugs.
 *
 * The limit is checked before a slug is added, so `limit <= 0` yields an
 * empty list (checking after the push always let one slug through).
 *
 * @param {{ slug: string }[]} chunks - citations in ranked order.
 * @param {number} limit - maximum number of slugs to return.
 * @returns {string[]} distinct slugs, at most `limit` of them.
 */
function collapseToPages(chunks, limit) {
  // A Set iterates in insertion order, so it dedupes and keeps ranking order.
  const slugs = /* @__PURE__ */ new Set();
  for (const { slug } of chunks) {
    if (slugs.size >= limit) break;
    slugs.add(slug);
  }
  return [...slugs];
}
3459
/**
 * Build the human-readable explanation of a chunk-based selection. It lists
 * the first `pages.length` chunks as `slug#chunkIndex (score)`, with the score
 * to three decimals.
 */
function buildChunkReasoning(chunks, pages) {
  const parts = [];
  for (let i = 0; i < pages.length && i < chunks.length; i += 1) {
    const { slug, chunkIndex, score } = chunks[i];
    parts.push(`${slug}#${chunkIndex} (${score.toFixed(3)})`);
  }
  return `Selected ${pages.length} page(s) from ${chunks.length} reranked chunks: ${parts.join(", ")}`;
}
3464
/**
 * Assemble the retrieval debug snapshot: each selected page with the highest
 * score among its chunks (0 when it has none), the chunk list itself, and
 * whether reranking changed the order.
 */
function buildDebug(chunks, pageSlugs, reranked) {
  const topScoreBySlug = chunks.reduce((best, { slug, score }) => {
    const current = best.get(slug);
    if (current === void 0 || score > current) best.set(slug, score);
    return best;
  }, /* @__PURE__ */ new Map());
  const pages = pageSlugs.map((slug) => ({ slug, score: topScoreBySlug.get(slug) ?? 0 }));
  return { pages, chunks, usedChunks: true, reranked };
}
3477
/**
 * Best-effort chunk retrieval. Any failure is reported as a dim status line
 * and converted to an empty result so the caller can fall back.
 *
 * @returns {Promise<Array>} ranked chunks, or [] when retrieval failed.
 */
async function tryFindRelevantChunks(root, question) {
  let hits;
  try {
    hits = await findRelevantChunks(root, question, CHUNK_TOP_K);
  } catch (failure) {
    const reason = failure instanceof Error ? failure.message : String(failure);
    status("!", dim(`Chunk pre-filter unavailable (${reason}); falling back.`));
    return [];
  }
  return hits;
}
2297
3486
  async function tryFindRelevantPages(root, question) {
2298
3487
  try {
@@ -2308,7 +3497,7 @@ async function loadSelectedPages(root, slugs) {
2308
3497
  for (const slug of slugs) {
2309
3498
  let content = "";
2310
3499
  for (const dir of PAGE_DIRS) {
2311
- const candidate = await safeReadFile(path17.join(root, dir, `${slug}.md`));
3500
+ const candidate = await safeReadFile(path22.join(root, dir, `${slug}.md`));
2312
3501
  if (!candidate) continue;
2313
3502
  const { meta } = parseFrontmatter(candidate);
2314
3503
  if (meta.orphaned) continue;
@@ -2325,11 +3514,12 @@ ${content}`);
2325
3514
  return sections.join("\n\n");
2326
3515
  }
2327
3516
  var ANSWER_SYSTEM_PROMPT = "You are a knowledge assistant. Answer the question using ONLY the wiki content provided. Cite specific pages using [[Page Title]] wikilinks. If the wiki doesn't contain enough information, say so.";
2328
- async function callAnswerLLM(question, pagesContent, onToken) {
3517
+ async function callAnswerLLM(question, pagesContent, chunks, onToken) {
3518
+ const provenance = chunks.length > 0 ? buildChunkProvenance(chunks) : "";
2329
3519
  const userMessage = `Question: ${question}
2330
3520
 
2331
3521
  Relevant wiki pages:
2332
- ${pagesContent}`;
3522
+ ${pagesContent}${provenance}`;
2333
3523
  return callClaude({
2334
3524
  system: ANSWER_SYSTEM_PROMPT,
2335
3525
  messages: [{ role: "user", content: userMessage }],
@@ -2337,6 +3527,16 @@ ${pagesContent}`;
2337
3527
  onToken
2338
3528
  });
2339
3529
  }
3530
/**
 * Render the chunk excerpts appended to the answer prompt: a heading followed
 * by one `--- slug (chunk N) ---` section per chunk, separated by blank lines.
 */
function buildChunkProvenance(chunks) {
  const excerpts = [];
  for (const { slug, chunkIndex, text } of chunks) {
    excerpts.push(`--- ${slug} (chunk ${chunkIndex}) ---\n${text}`);
  }
  return `\n\nMost relevant excerpts (from chunk-level retrieval):\n${excerpts.join("\n\n")}`;
}
2340
3540
  function summarizeAnswer(answer) {
2341
3541
  const firstLine = answer.trim().split(/\n/)[0] ?? "";
2342
3542
  const firstSentence = firstLine.split(/(?<=[.!?])\s/)[0] ?? firstLine;
@@ -2344,7 +3544,7 @@ function summarizeAnswer(answer) {
2344
3544
  }
2345
3545
  async function saveQueryPage(root, question, answer) {
2346
3546
  const slug = slugify(question);
2347
- const filePath = path17.join(root, QUERIES_DIR, `${slug}.md`);
3547
+ const filePath = path22.join(root, QUERIES_DIR, `${slug}.md`);
2348
3548
  const frontmatter = buildFrontmatter({
2349
3549
  title: question,
2350
3550
  summary: summarizeAnswer(answer),
@@ -2370,30 +3570,42 @@ ${answer}
2370
3570
  return slug;
2371
3571
  }
2372
3572
/**
 * Answer `question` from the wiki: select pages, load them, ask the LLM, and
 * optionally save the answer as a query page.
 *
 * @param {string} root - wiki root directory.
 * @param {string} question
 * @param {object} [options] - `save`, `debug`, `onToken`, `onPageSelection`.
 * @returns {Promise<object>} `{ answer, selectedPages, reasoning, saved, debug }`;
 *   the empty-answer result when no page content could be loaded.
 * @throws {Error} when the wiki index file does not exist.
 */
async function generateAnswer(root, question, options = {}) {
  const indexPath = path22.join(root, INDEX_FILE);
  if (!existsSync8(indexPath)) {
    throw new Error("Wiki index not found. Run `llmwiki compile` first.");
  }

  const selection = await selectRelevantPages(root, question, Boolean(options.debug));
  options.onPageSelection?.(selection.pages, selection.reasoning);

  const pagesContent = await loadSelectedPages(root, selection.pages);
  if (!pagesContent) return buildEmptyResult(selection);

  const answer = await callAnswerLLM(question, pagesContent, selection.chunks, options.onToken);
  let saved;
  if (options.save) {
    saved = await saveQueryPage(root, question, answer);
  }
  const { pages: selectedPages, reasoning, debug } = selection;
  return { answer, selectedPages, reasoning, saved, debug };
}
3592
/**
 * Result returned when no page content could be loaded: an empty answer that
 * still carries the selection's pages, reasoning and debug data.
 */
function buildEmptyResult(selection) {
  const { pages: selectedPages, reasoning, debug } = selection;
  return { answer: "", selectedPages, reasoning, debug };
}
2389
3600
  async function queryCommand(root, question, options) {
2390
- if (!existsSync6(path17.join(root, INDEX_FILE))) {
3601
+ if (!existsSync8(path22.join(root, INDEX_FILE))) {
2391
3602
  status("!", error("Wiki index not found. Run `llmwiki compile` first."));
2392
3603
  return;
2393
3604
  }
2394
3605
  header("Selecting relevant pages");
2395
3606
  const result = await generateAnswer(root, question, {
2396
3607
  save: options.save,
3608
+ debug: options.debug,
2397
3609
  onToken: (text) => process.stdout.write(text),
2398
3610
  onPageSelection: (pages, reasoning) => {
2399
3611
  status("i", dim(`Reasoning: ${reasoning}`));
@@ -2402,6 +3614,7 @@ async function queryCommand(root, question, options) {
2402
3614
  }
2403
3615
  });
2404
3616
  process.stdout.write("\n");
3617
+ if (result.debug) printDebugSnapshot(result.debug);
2405
3618
  if (!result.answer) {
2406
3619
  status("!", error("No matching pages found. Try refining your question."));
2407
3620
  return;
@@ -2412,15 +3625,35 @@ async function queryCommand(root, question, options) {
2412
3625
  status("\u2192", dim("Tip: use --save to add this answer to your wiki"));
2413
3626
  }
2414
3627
  }
3628
/**
 * Print the retrieval debug snapshot: the retrieval source, whether reranking
 * changed the order, each selected page with its best chunk score, and a
 * one-line preview of every chunk.
 *
 * Whitespace is collapsed before truncating, so the preview uses the full
 * character budget, and the ellipsis is appended only when text was cut.
 *
 * @param {{ usedChunks: boolean, reranked: boolean,
 *           pages: { slug: string, score: number }[],
 *           chunks: { slug: string, chunkIndex: number, score: number, text: string }[] }} debug
 */
function printDebugSnapshot(debug) {
  header("Retrieval debug");
  status(
    "i",
    dim(
      `Source: ${debug.usedChunks ? "chunk-level" : "page-level"}; reranked: ${debug.reranked ? "yes" : "no"}`
    )
  );
  for (const page of debug.pages) {
    status("\u2022", `${page.slug} (best chunk score ${page.score.toFixed(3)})`);
  }
  for (const chunk of debug.chunks) {
    const flattened = chunk.text.replace(/\s+/g, " ").trim();
    const isTruncated = flattened.length > DEBUG_CHUNK_PREVIEW_CHARS;
    const preview = isTruncated
      ? `${flattened.slice(0, DEBUG_CHUNK_PREVIEW_CHARS).trimEnd()}\u2026`
      : flattened;
    status(
      "\xB7",
      dim(`${chunk.slug}#${chunk.chunkIndex} score=${chunk.score.toFixed(3)} :: ${preview}`)
    );
  }
}
// Maximum number of characters of chunk text shown per debug preview line.
var DEBUG_CHUNK_PREVIEW_CHARS = 120;
2415
3648
 
2416
3649
  // src/commands/watch.ts
2417
3650
  import { watch as chokidarWatch } from "chokidar";
2418
- import { existsSync as existsSync7 } from "fs";
2419
- import path18 from "path";
3651
+ import { existsSync as existsSync9 } from "fs";
3652
+ import path23 from "path";
2420
3653
  var DEBOUNCE_MS = 500;
2421
3654
  async function watchCommand() {
2422
- const sourcesPath = path18.resolve(SOURCES_DIR);
2423
- if (!existsSync7(sourcesPath)) {
3655
+ const sourcesPath = path23.resolve(SOURCES_DIR);
3656
+ if (!existsSync9(sourcesPath)) {
2424
3657
  status(
2425
3658
  "!",
2426
3659
  warn("No sources/ directory found. Run `llmwiki ingest <url>` first.")
@@ -2454,7 +3687,7 @@ async function watchCommand() {
2454
3687
  const scheduleCompile = (eventPath, event) => {
2455
3688
  status(
2456
3689
  "~",
2457
- dim(`${event}: ${path18.basename(eventPath)}`)
3690
+ dim(`${event}: ${path23.basename(eventPath)}`)
2458
3691
  );
2459
3692
  if (debounceTimer) clearTimeout(debounceTimer);
2460
3693
  debounceTimer = setTimeout(triggerCompile, DEBOUNCE_MS);
@@ -2468,261 +3701,30 @@ async function watchCommand() {
2468
3701
  });
2469
3702
  }
2470
3703
 
2471
- // src/linter/rules.ts
2472
- import { readdir as readdir8, readFile as readFile9 } from "fs/promises";
2473
- import { existsSync as existsSync8 } from "fs";
2474
- import path19 from "path";
2475
- var MIN_BODY_LENGTH = 50;
2476
- var WIKILINK_PATTERN = /\[\[([^\]]+)\]\]/g;
2477
- var CITATION_PATTERN = /\^\[([^\]]+)\]/g;
2478
/**
 * Find every match of `pattern` in `content`, line by line.
 *
 * @param {string} content
 * @param {RegExp} pattern - global regex with one capture group
 *   (`matchAll` requires the `g` flag).
 * @returns {{ captured: string, line: number }[]} first capture group and
 *   1-based line number of each match.
 */
function findMatchesInContent(content, pattern) {
  return content.split("\n").flatMap((text, index) =>
    Array.from(text.matchAll(pattern), (match) => ({ captured: match[1], line: index + 1 }))
  );
}
2489
/**
 * Read every `.md` file directly inside `dirPath`.
 *
 * @param {string} dirPath
 * @returns {Promise<{ filePath: string, content: string }[]>} one entry per
 *   markdown file, or [] when the directory does not exist.
 */
async function readMarkdownFiles(dirPath) {
  if (!existsSync8(dirPath)) return [];
  const names = await readdir8(dirPath);
  const pending = [];
  for (const name of names) {
    if (!name.endsWith(".md")) continue;
    const filePath = path19.join(dirPath, name);
    pending.push(readFile9(filePath, "utf-8").then((content) => ({ filePath, content })));
  }
  return Promise.all(pending);
}
2502
/**
 * Gather all markdown pages of the wiki: concept pages first, then query pages.
 */
async function collectAllPages(root) {
  const pages = [];
  for (const dir of [CONCEPTS_DIR, QUERIES_DIR]) {
    const found = await readMarkdownFiles(path19.join(root, dir));
    pages.push(...found);
  }
  return pages;
}
2507
/**
 * Build the set of known page slugs: each page's file name without the `.md`
 * extension, lower-cased.
 *
 * @param {{ filePath: string }[]} pages
 * @returns {Set<string>}
 */
function buildPageSlugSet(pages) {
  return new Set(pages.map((page) => path19.basename(page.filePath, ".md").toLowerCase()));
}
2515
/**
 * Lint rule: report every `[[wikilink]]` whose slugified target does not
 * match the file name of any concept or query page.
 */
async function checkBrokenWikilinks(root) {
  const pages = await collectAllPages(root);
  const knownSlugs = buildPageSlugSet(pages);
  return pages.flatMap((page) =>
    findMatchesInContent(page.content, WIKILINK_PATTERN)
      .filter(({ captured }) => !knownSlugs.has(slugify(captured)))
      .map(({ captured, line }) => ({
        rule: "broken-wikilink",
        severity: "error",
        file: page.filePath,
        message: `Broken wikilink [[${captured}]] \u2014 no matching page found`,
        line
      }))
  );
}
2535
/**
 * Lint rule: warn about pages whose frontmatter has `orphaned: true`.
 */
async function checkOrphanedPages(root) {
  const pages = await collectAllPages(root);
  return pages
    .filter((page) => parseFrontmatter(page.content).meta.orphaned === true)
    .map((page) => ({
      rule: "orphaned-page",
      severity: "warning",
      file: page.filePath,
      message: `Page is marked as orphaned`
    }));
}
2551
/**
 * Lint rule: warn about pages whose frontmatter summary is absent, falsy, or
 * a whitespace-only string.
 */
async function checkMissingSummaries(root) {
  const pages = await collectAllPages(root);
  const lacksSummary = (page) => {
    const { summary } = parseFrontmatter(page.content).meta;
    if (!summary) return true;
    return typeof summary === "string" && summary.trim() === "";
  };
  return pages.filter(lacksSummary).map((page) => ({
    rule: "missing-summary",
    severity: "warning",
    file: page.filePath,
    message: `Page has no summary in frontmatter`
  }));
}
2569
/**
 * Lint rule: report pages that share a title (compared lower-cased and
 * trimmed). Every file in a duplicate group gets its own error, which names
 * the other files in the group.
 */
async function checkDuplicateConcepts(root) {
  const pages = await collectAllPages(root);

  // Group file paths by normalized title; pages without a string title are skipped.
  const filesByTitle = /* @__PURE__ */ new Map();
  for (const { content, filePath } of pages) {
    const { title } = parseFrontmatter(content).meta;
    if (typeof title !== "string" || title === "") continue;
    const key = title.toLowerCase().trim();
    if (!filesByTitle.has(key)) filesByTitle.set(key, []);
    filesByTitle.get(key).push(filePath);
  }

  const results = [];
  for (const [title, files] of filesByTitle) {
    if (files.length < 2) continue;
    for (const file of files) {
      const others = files.filter((f) => f !== file).join(", ");
      results.push({
        rule: "duplicate-concept",
        severity: "error",
        file,
        message: `Duplicate title "${title}" \u2014 also in ${others}`
      });
    }
  }
  return results;
}
2595
/**
 * Lint rule: warn about titled pages whose trimmed body is shorter than
 * `MIN_BODY_LENGTH` characters.
 */
async function checkEmptyPages(root) {
  const pages = await collectAllPages(root);
  const isTitledButEmpty = (page) => {
    const { meta, body } = parseFrontmatter(page.content);
    const titled = typeof meta.title === "string" && meta.title.trim() !== "";
    const tooShort = body.trim().length < MIN_BODY_LENGTH;
    return titled && tooShort;
  };
  return pages.filter(isTitledButEmpty).map((page) => ({
    rule: "empty-page",
    severity: "warning",
    file: page.filePath,
    message: `Page body is empty or too short (< ${MIN_BODY_LENGTH} chars)`
  }));
}
2613
/**
 * Lint rule: warn about pages whose provenance confidence is set and is not
 * at or above `LOW_CONFIDENCE_THRESHOLD`.
 */
async function checkLowConfidencePages(root) {
  const pages = await collectAllPages(root);
  return pages.flatMap((page) => {
    const { confidence } = parseProvenanceMetadata(parseFrontmatter(page.content).meta);
    const acceptable = confidence === void 0 || confidence >= LOW_CONFIDENCE_THRESHOLD;
    if (acceptable) return [];
    return [
      {
        rule: "low-confidence",
        severity: "warning",
        file: page.filePath,
        message: `Page confidence ${confidence.toFixed(2)} is below ${LOW_CONFIDENCE_THRESHOLD}`
      }
    ];
  });
}
2629
/**
 * Lint rule: warn about pages whose provenance lists `contradictedBy`
 * entries, naming the slugs involved.
 */
async function checkContradictedPages(root) {
  const pages = await collectAllPages(root);
  return pages.flatMap((page) => {
    const { contradictedBy } = parseProvenanceMetadata(parseFrontmatter(page.content).meta);
    if (!contradictedBy || contradictedBy.length === 0) return [];
    const slugList = contradictedBy.map((r) => r.slug).join(", ");
    return [
      {
        rule: "contradicted-page",
        severity: "warning",
        file: page.filePath,
        message: `Page contradicts: ${slugList}`
      }
    ];
  });
}
2646
/**
 * Lint rule: warn about pages with more inferred (uncited) paragraphs than
 * `MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS`. The count comes from the
 * provenance metadata when present, otherwise from scanning the body.
 */
async function checkInferredWithoutCitations(root) {
  const pages = await collectAllPages(root);
  return pages.flatMap((page) => {
    const { meta, body } = parseFrontmatter(page.content);
    const { inferredParagraphs } = parseProvenanceMetadata(meta);
    const inferred = inferredParagraphs ?? countUncitedProseParagraphs(body);
    if (inferred <= MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS) return [];
    return [
      {
        rule: "excess-inferred-paragraphs",
        severity: "warning",
        file: page.filePath,
        message: `Page has ${inferred} inferred paragraphs without citations (max ${MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS})`
      }
    ];
  });
}
2663
- var PROSE_PARAGRAPH_LEAD = /^[A-Za-z]/;
2664
/**
 * Count blank-line-separated paragraphs that start with a letter and contain
 * no `^[...]` citation.
 *
 * @param {string} body
 * @returns {number}
 */
function countUncitedProseParagraphs(body) {
  let uncited = 0;
  for (const block of body.split(/\n\s*\n/)) {
    const paragraph = block.trim();
    if (paragraph.length === 0 || !PROSE_PARAGRAPH_LEAD.test(paragraph)) continue;
    const hasCitation = CITATION_PATTERN.test(paragraph);
    // CITATION_PATTERN is global, so `.test` advances lastIndex; reset it
    // so the next paragraph is scanned from its start.
    CITATION_PATTERN.lastIndex = 0;
    if (!hasCitation) uncited += 1;
  }
  return uncited;
}
2680
/**
 * Split the inside of a `^[a.md, b.md]` citation into trimmed, non-empty
 * file names.
 *
 * @param {string} captured
 * @returns {string[]}
 */
function splitCitationFilenames(captured) {
  const names = [];
  for (const piece of captured.split(",")) {
    const name = piece.trim();
    if (name.length > 0) names.push(name);
  }
  return names;
}
2683
/**
 * Lint rule: report every `^[file]` citation whose file does not exist in the
 * sources directory. A citation listing several files is checked per file.
 */
async function checkBrokenCitations(root) {
  const pages = await collectAllPages(root);
  const sourcesDir = path19.join(root, SOURCES_DIR);
  const isMissing = (filename) => !existsSync8(path19.join(sourcesDir, filename));
  return pages.flatMap((page) =>
    findMatchesInContent(page.content, CITATION_PATTERN).flatMap(({ captured, line }) =>
      splitCitationFilenames(captured)
        .filter(isMissing)
        .map((filename) => ({
          rule: "broken-citation",
          severity: "error",
          file: page.filePath,
          message: `Broken citation ^[${filename}] \u2014 source file not found`,
          line
        }))
    )
  );
}
2705
-
2706
3704
  // src/linter/index.ts
2707
- var ALL_RULES = [
3705
+ var RULES_WITHOUT_SCHEMA = [
2708
3706
  checkBrokenWikilinks,
2709
3707
  checkOrphanedPages,
2710
3708
  checkMissingSummaries,
2711
3709
  checkDuplicateConcepts,
2712
3710
  checkEmptyPages,
2713
3711
  checkBrokenCitations,
3712
+ checkMalformedClaimCitations,
2714
3713
  checkLowConfidencePages,
2715
3714
  checkContradictedPages,
2716
3715
  checkInferredWithoutCitations
2717
3716
  ];
3717
+ var RULES_WITH_SCHEMA = [checkSchemaCrossLinks];
2718
3718
/**
 * Count the lint results that have the given severity.
 *
 * @param {{ severity: string }[]} results
 * @param {string} severity
 * @returns {number}
 */
function countBySeverity(results, severity) {
  let total = 0;
  for (const result of results) {
    if (result.severity === severity) total += 1;
  }
  return total;
}
2721
3721
  async function lint(root) {
2722
- const ruleResults = await Promise.all(
2723
- ALL_RULES.map((rule) => rule(root))
2724
- );
2725
- const results = ruleResults.flat();
3722
+ const schema = await loadSchema(root);
3723
+ const [plainResults, schemaResults] = await Promise.all([
3724
+ Promise.all(RULES_WITHOUT_SCHEMA.map((rule) => rule(root))),
3725
+ Promise.all(RULES_WITH_SCHEMA.map((rule) => rule(root, schema)))
3726
+ ]);
3727
+ const results = [...plainResults.flat(), ...schemaResults.flat()];
2726
3728
  return {
2727
3729
  errors: countBySeverity(results, "error"),
2728
3730
  warnings: countBySeverity(results, "warning"),
@@ -2750,6 +3752,9 @@ function printResult(result) {
2750
3752
  }
2751
3753
  async function lintCommand() {
2752
3754
  header("Linting wiki");
3755
+ const schema = await loadSchema(process.cwd());
3756
+ const schemaSource = schema.loadedFrom ?? "defaults (no schema file)";
3757
+ status("i", dim(`Schema: ${schemaSource}`));
2753
3758
  const summary = await lint(process.cwd());
2754
3759
  for (const result of summary.results) {
2755
3760
  printResult(result);
@@ -2766,6 +3771,36 @@ async function lintCommand() {
2766
3771
  }
2767
3772
  }
2768
3773
 
3774
+ // src/commands/schema.ts
3775
+ import { existsSync as existsSync10 } from "fs";
3776
+ import { mkdir as mkdir6, writeFile as writeFile5 } from "fs/promises";
3777
+ import path24 from "path";
3778
/**
 * `schema init`: write the default schema as pretty-printed JSON to the
 * default schema path under the current working directory. Leaves an
 * existing schema file untouched and only warns about it.
 */
async function schemaInitCommand() {
  const root = process.cwd();
  const { version, defaultKind, kinds, seedPages } = buildDefaultSchema();
  const targetPath = defaultSchemaInitPath(root);
  if (existsSync10(targetPath)) {
    status("!", warn(`Schema file already exists at ${targetPath}`));
    return;
  }
  await mkdir6(path24.dirname(targetPath), { recursive: true });
  // Only these four fields of the default schema are written out.
  const json = JSON.stringify({ version, defaultKind, kinds, seedPages }, null, 2);
  await writeFile5(targetPath, `${json}\n`, "utf-8");
  status("+", success(`Wrote schema to ${targetPath}`));
}
3797
/**
 * `schema show`: print the schema loaded for the current working directory as
 * YAML, under a header naming where it came from (or noting that defaults
 * are in use).
 */
async function schemaShowCommand() {
  const schema = await loadSchema(process.cwd());
  const origin = schema.loadedFrom ?? "(defaults \u2014 no schema file found)";
  header(`Schema (${origin})`);
  const yamlText = serializeSchemaToYaml(schema);
  console.log(yamlText);
}
3803
+
2769
3804
  // src/commands/review-list.ts
2770
3805
  async function reviewListCommand() {
2771
3806
  header("Pending review candidates");
@@ -2797,10 +3832,17 @@ async function reviewShowCommand(id) {
2797
3832
  status("i", dim(`generated: ${candidate.generatedAt}`));
2798
3833
  console.log();
2799
3834
  console.log(candidate.body);
3835
+ if (candidate.schemaViolations && candidate.schemaViolations.length > 0) {
3836
+ console.log();
3837
+ header("Schema violations");
3838
+ for (const v of candidate.schemaViolations) {
3839
+ status("!", warn(`[${v.severity}] ${v.message}`));
3840
+ }
3841
+ }
2800
3842
  }
2801
3843
 
2802
3844
  // src/commands/review-approve.ts
2803
- import path20 from "path";
3845
+ import path25 from "path";
2804
3846
 
2805
3847
  // src/commands/review-helpers.ts
2806
3848
  async function runReviewUnderLock(id, underLock) {
@@ -2832,7 +3874,7 @@ async function approveUnderLock(root, id) {
2832
3874
  process.exitCode = 1;
2833
3875
  return;
2834
3876
  }
2835
- const pagePath = path20.join(root, CONCEPTS_DIR, `${candidate.slug}.md`);
3877
+ const pagePath = path25.join(root, CONCEPTS_DIR, `${candidate.slug}.md`);
2836
3878
  await atomicWrite(pagePath, candidate.body);
2837
3879
  status("+", success(`Approved \u2192 ${source(pagePath)}`));
2838
3880
  await persistCandidateSourceStates(root, candidate);
@@ -2892,7 +3934,7 @@ import { McpServer as McpServer2 } from "@modelcontextprotocol/sdk/server/mcp.js
2892
3934
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
2893
3935
 
2894
3936
  // src/mcp/tools.ts
2895
- import path21 from "path";
3937
+ import path26 from "path";
2896
3938
  import { z } from "zod";
2897
3939
 
2898
3940
  // src/mcp/provider-check.ts
@@ -2985,15 +4027,16 @@ function registerQueryTool(server, root) {
2985
4027
  "query_wiki",
2986
4028
  {
2987
4029
  title: "Query Wiki",
2988
- description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Requires an LLM provider.",
4030
+ description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Set debug=true to include the selected chunks and their scores. Requires an LLM provider.",
2989
4031
  inputSchema: {
2990
4032
  question: z.string().describe("The natural-language question to answer."),
2991
- save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true.")
4033
+ save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true."),
4034
+ debug: z.boolean().optional().describe("Include retrieval debug info (selected chunks/pages + scores).")
2992
4035
  }
2993
4036
  },
2994
- async ({ question, save }) => {
4037
+ async ({ question, save, debug }) => {
2995
4038
  ensureProviderAvailable();
2996
- const result = await generateAnswer(root, question, { save });
4039
+ const result = await generateAnswer(root, question, { save, debug });
2997
4040
  return jsonResult(result);
2998
4041
  }
2999
4042
  );
@@ -3017,15 +4060,30 @@ function registerSearchTool(server, root) {
3017
4060
  );
3018
4061
  }
3019
4062
  async function pickSearchSlugs(root, question) {
4063
+ try {
4064
+ const chunks = await findRelevantChunks(root, question, CHUNK_TOP_K);
4065
+ if (chunks.length > 0) return dedupePreservingOrder(chunks.map((c) => c.chunk.slug));
4066
+ } catch {
4067
+ }
3020
4068
  try {
3021
4069
  const candidates = await findRelevantPages(root, question);
3022
4070
  if (candidates.length > 0) return candidates.map((c) => c.slug);
3023
4071
  } catch {
3024
4072
  }
3025
- const indexContent = await safeReadFile(path21.join(root, INDEX_FILE));
4073
+ const indexContent = await safeReadFile(path26.join(root, INDEX_FILE));
3026
4074
  const { pages } = await selectPages(question, indexContent);
3027
4075
  return pages;
3028
4076
  }
4077
+ function dedupePreservingOrder(slugs) {
4078
+ const seen = /* @__PURE__ */ new Set();
4079
+ const out = [];
4080
+ for (const slug of slugs) {
4081
+ if (seen.has(slug)) continue;
4082
+ seen.add(slug);
4083
+ out.push(slug);
4084
+ }
4085
+ return out;
4086
+ }
3029
4087
  function registerReadTool(server, root) {
3030
4088
  server.registerTool(
3031
4089
  "read_page",
@@ -3071,8 +4129,8 @@ function registerStatusTool(server, root) {
3071
4129
  );
3072
4130
  }
3073
4131
  async function collectStatus(root) {
3074
- const concepts = await collectPageSummaries(path21.join(root, CONCEPTS_DIR));
3075
- const queries = await collectPageSummaries(path21.join(root, QUERIES_DIR));
4132
+ const concepts = await collectPageSummaries(path26.join(root, CONCEPTS_DIR));
4133
+ const queries = await collectPageSummaries(path26.join(root, QUERIES_DIR));
3076
4134
  const state = await readState(root);
3077
4135
  const changes = await detectChanges(root, state);
3078
4136
  const orphans = await findOrphanedSlugs(root);
@@ -3089,7 +4147,7 @@ async function collectStatus(root) {
3089
4147
  };
3090
4148
  }
3091
4149
  async function findOrphanedSlugs(root) {
3092
- const scanned = await scanWikiPages(path21.join(root, CONCEPTS_DIR));
4150
+ const scanned = await scanWikiPages(path26.join(root, CONCEPTS_DIR));
3093
4151
  return scanned.filter(({ meta }) => meta.orphaned).map(({ slug }) => slug);
3094
4152
  }
3095
4153
  async function loadPageRecords(root, slugs) {
@@ -3102,7 +4160,7 @@ async function loadPageRecords(root, slugs) {
3102
4160
  }
3103
4161
  async function readPage(root, slug) {
3104
4162
  for (const dir of PAGE_DIRS2) {
3105
- const content = await safeReadFile(path21.join(root, dir, `${slug}.md`));
4163
+ const content = await safeReadFile(path26.join(root, dir, `${slug}.md`));
3106
4164
  if (!content) continue;
3107
4165
  const { meta, body } = parseFrontmatter(content);
3108
4166
  if (meta.orphaned) continue;
@@ -3117,7 +4175,7 @@ async function readPage(root, slug) {
3117
4175
  }
3118
4176
 
3119
4177
  // src/mcp/resources.ts
3120
- import path22 from "path";
4178
+ import path27 from "path";
3121
4179
  import { readdir as readdir9 } from "fs/promises";
3122
4180
  import { ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
3123
4181
  function jsonContent(uri, payload) {
@@ -3151,7 +4209,7 @@ function registerIndexResource(server, root) {
3151
4209
  mimeType: "text/markdown"
3152
4210
  },
3153
4211
  async (uri) => {
3154
- const content = await safeReadFile(path22.join(root, INDEX_FILE));
4212
+ const content = await safeReadFile(path27.join(root, INDEX_FILE));
3155
4213
  return { contents: [markdownContent(uri, content)] };
3156
4214
  }
3157
4215
  );
@@ -3218,7 +4276,7 @@ function registerQueryResource(server, root) {
3218
4276
  );
3219
4277
  }
3220
4278
  async function listSources(root) {
3221
- const sourcesPath = path22.join(root, SOURCES_DIR);
4279
+ const sourcesPath = path27.join(root, SOURCES_DIR);
3222
4280
  let files;
3223
4281
  try {
3224
4282
  files = await readdir9(sourcesPath);
@@ -3227,14 +4285,14 @@ async function listSources(root) {
3227
4285
  }
3228
4286
  const records = [];
3229
4287
  for (const file of files.filter((f) => f.endsWith(".md"))) {
3230
- const content = await safeReadFile(path22.join(sourcesPath, file));
4288
+ const content = await safeReadFile(path27.join(sourcesPath, file));
3231
4289
  const { meta } = parseFrontmatter(content);
3232
4290
  records.push({ filename: file, ...meta });
3233
4291
  }
3234
4292
  return records;
3235
4293
  }
3236
4294
  async function loadPageWithMeta(root, dir, slug) {
3237
- const filePath = path22.join(root, dir, `${slug}.md`);
4295
+ const filePath = path27.join(root, dir, `${slug}.md`);
3238
4296
  const content = await safeReadFile(filePath);
3239
4297
  if (!content) {
3240
4298
  throw new Error(`Page not found: ${dir}/${slug}.md`);
@@ -3243,7 +4301,7 @@ async function loadPageWithMeta(root, dir, slug) {
3243
4301
  return { slug, meta, body: body.trim() };
3244
4302
  }
3245
4303
  async function listPagesUnder(root, dir, scheme) {
3246
- const pagesPath = path22.join(root, dir);
4304
+ const pagesPath = path27.join(root, dir);
3247
4305
  let files;
3248
4306
  try {
3249
4307
  files = await readdir9(pagesPath);
@@ -3327,7 +4385,7 @@ reviewCommand.command("reject <id>").description("Reject a candidate and archive
3327
4385
  process.exit(1);
3328
4386
  }
3329
4387
  });
3330
- program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").action(async (question, options) => {
4388
+ program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").option("--debug", "Print which pages and chunks were selected and their scores").action(async (question, options) => {
3331
4389
  try {
3332
4390
  requireProvider();
3333
4391
  await queryCommand(process.cwd(), question, options);
@@ -3353,6 +4411,23 @@ program.command("lint").description("Run rule-based quality checks against the w
3353
4411
  process.exit(1);
3354
4412
  }
3355
4413
  });
4414
+ var schemaCmd = program.command("schema").description("Inspect or initialize the project's wiki schema config");
4415
+ schemaCmd.command("init").description("Write a starter schema file to .llmwiki/schema.json").action(async () => {
4416
+ try {
4417
+ await schemaInitCommand();
4418
+ } catch (err) {
4419
+ console.error(`\x1B[31mError:\x1B[0m ${err instanceof Error ? err.message : err}`);
4420
+ process.exit(1);
4421
+ }
4422
+ });
4423
+ schemaCmd.command("show").description("Print the resolved schema for this project").action(async () => {
4424
+ try {
4425
+ await schemaShowCommand();
4426
+ } catch (err) {
4427
+ console.error(`\x1B[31mError:\x1B[0m ${err instanceof Error ? err.message : err}`);
4428
+ process.exit(1);
4429
+ }
4430
+ });
3356
4431
  program.command("serve").description("Start an MCP server exposing wiki tools and resources over stdio").option("--root <dir>", "Project root directory", process.cwd()).action(async (options) => {
3357
4432
  try {
3358
4433
  await startMCPServer({ root: options.root, version });