llm-wiki-compiler 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -6,8 +6,8 @@ import { createRequire } from "module";
6
6
  import { Command } from "commander";
7
7
 
8
8
  // src/commands/ingest.ts
9
- import path3 from "path";
10
- import { mkdir as mkdir2, writeFile as writeFile2 } from "fs/promises";
9
+ import path7 from "path";
10
+ import { mkdir as mkdir2, readFile as readFile6, writeFile as writeFile2 } from "fs/promises";
11
11
 
12
12
  // src/utils/markdown.ts
13
13
  import { writeFile, rename, readFile, mkdir } from "fs/promises";
@@ -150,9 +150,17 @@ var LOCK_FILE = ".llmwiki/lock";
150
150
  var INDEX_FILE = "wiki/index.md";
151
151
  var MOC_FILE = "wiki/MOC.md";
152
152
  var EMBEDDINGS_FILE = ".llmwiki/embeddings.json";
153
+ var IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".jpg", ".jpeg", ".png", ".gif", ".webp"]);
154
+ var TRANSCRIPT_EXTENSIONS = /* @__PURE__ */ new Set([".vtt", ".srt"]);
155
+ var IMAGE_DESCRIBE_MAX_TOKENS = 2048;
153
156
  var CANDIDATES_DIR = ".llmwiki/candidates";
154
157
  var CANDIDATES_ARCHIVE_DIR = ".llmwiki/candidates/archive";
155
158
  var EMBEDDING_TOP_K = 15;
159
+ var CHUNK_TOP_K = 30;
160
+ var CHUNK_RERANK_KEEP = 12;
161
+ var CHUNK_TARGET_CHARS = 800;
162
+ var CHUNK_MAX_CHARS = 1400;
163
+ var CHUNK_MIN_CHARS = 200;
156
164
  var LOW_CONFIDENCE_THRESHOLD = 0.5;
157
165
  var MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS = 2;
158
166
  var EMBEDDING_MODELS = {
@@ -237,19 +245,24 @@ async function ingestWeb(url) {
237
245
 
238
246
  // src/ingest/file.ts
239
247
  import { readFile as readFile2 } from "fs/promises";
248
+ import path3 from "path";
249
+
250
+ // src/ingest/shared.ts
240
251
  import path2 from "path";
241
- var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
242
252
  function titleFromFilename(filePath) {
243
253
  const basename = path2.basename(filePath, path2.extname(filePath));
244
254
  return basename.replace(/[-_]+/g, " ").trim();
245
255
  }
256
+
257
+ // src/ingest/file.ts
258
+ var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
246
259
  function wrapPlainText(text) {
247
260
  return `\`\`\`
248
261
  ${text}
249
262
  \`\`\``;
250
263
  }
251
264
  async function ingestFile(filePath) {
252
- const ext = path2.extname(filePath).toLowerCase();
265
+ const ext = path3.extname(filePath).toLowerCase();
253
266
  if (!SUPPORTED_EXTENSIONS.has(ext)) {
254
267
  throw new Error(
255
268
  `Unsupported file type "${ext}". Only .md and .txt files are supported.`
@@ -261,10 +274,440 @@ async function ingestFile(filePath) {
261
274
  return { title, content };
262
275
  }
263
276
 
277
+ // src/ingest/pdf.ts
278
+ import { readFile as readFile3 } from "fs/promises";
279
/**
 * Pick a document title for a PDF.
 *
 * Uses the "Title" entry of the PDF info object when it is a non-blank string;
 * otherwise falls back to a title derived from the file name.
 *
 * @param {string} filePath - Path of the PDF file.
 * @param {unknown} info2 - Info object returned by the PDF parser (may be absent).
 * @returns {string} The resolved title.
 */
function resolveTitle(filePath, info2) {
  const embedded = info2 !== null && typeof info2 === "object" ? info2["Title"] : void 0;
  const cleaned = typeof embedded === "string" ? embedded.trim() : "";
  return cleaned.length > 0 ? cleaned : titleFromFilename(filePath);
}
288
/**
 * Extract the text and title of a PDF file.
 *
 * The pdf-parse module is loaded lazily via dynamic import. The parser is
 * always destroyed, even when extraction throws.
 *
 * @param {string} filePath - Path of the PDF to read.
 * @returns {Promise<{title: string, content: string}>} Title and trimmed text.
 */
async function ingestPdf(filePath) {
  const pdfModule = await import("pdf-parse");
  const bytes = new Uint8Array(await readFile3(filePath));
  const parser = new pdfModule.PDFParse({ data: bytes });
  try {
    // Text is requested before metadata, sequentially, as in the original flow.
    const { text } = await parser.getText();
    const { info: pdfInfo } = await parser.getInfo();
    return {
      title: resolveTitle(filePath, pdfInfo),
      content: text.trim()
    };
  } finally {
    await parser.destroy();
  }
}
302
+
303
+ // src/ingest/image.ts
304
+ import { readFile as readFile4 } from "fs/promises";
305
+ import path5 from "path";
306
+ import Anthropic2 from "@anthropic-ai/sdk";
307
+
308
+ // src/providers/anthropic.ts
309
+ import Anthropic from "@anthropic-ai/sdk";
310
+ var VOYAGE_EMBEDDINGS_URL = "https://api.voyageai.com/v1/embeddings";
311
/**
 * Build the options object passed to the Anthropic SDK client.
 *
 * Each of apiKey, authToken and baseURL is trimmed and only included when
 * non-empty. A single trailing slash is removed from baseURL unless the
 * value is just "/".
 *
 * @param {{apiKey?: string, authToken?: string, baseURL?: string}} [options]
 * @returns {{apiKey?: string, authToken?: string, baseURL?: string}}
 */
function buildAnthropicClientOptions(options = {}) {
  const clientOptions = {};
  const apiKey = options.apiKey?.trim();
  const authToken = options.authToken?.trim();
  const baseURL = options.baseURL?.trim();
  if (apiKey) {
    clientOptions.apiKey = apiKey;
  }
  if (authToken) {
    clientOptions.authToken = authToken;
  }
  if (baseURL) {
    const hasTrailingSlash = baseURL.length > 1 && baseURL.endsWith("/");
    clientOptions.baseURL = hasTrailingSlash ? baseURL.slice(0, -1) : baseURL;
  }
  return clientOptions;
}
329
/**
 * Provider that sends completion, streaming and tool-call requests through
 * the Anthropic SDK client, and obtains embeddings from the Voyage HTTP API.
 */
var AnthropicProvider = class {
  client;
  model;
  /**
   * @param {string} model - Model identifier sent with every request.
   * @param {object} [options] - Client options, normalized by buildAnthropicClientOptions.
   */
  constructor(model, options = {}) {
    this.model = model;
    this.client = new Anthropic(buildAnthropicClientOptions(options));
  }
  /** Send one non-streaming request and return the first text block ("" if none). */
  async complete(system, messages, maxTokens) {
    const request = {
      model: this.model,
      max_tokens: maxTokens,
      system,
      messages
    };
    const { content } = await this.client.messages.create(request);
    const firstText = content.find((block) => block.type === "text");
    return firstText?.type === "text" ? firstText.text : "";
  }
  /** Stream a completion; onToken (if given) receives each text delta. Returns the full text. */
  async stream(system, messages, maxTokens, onToken) {
    const events = this.client.messages.stream({
      model: this.model,
      max_tokens: maxTokens,
      system,
      messages
    });
    let fullText = "";
    for await (const event of events) {
      if (event.type !== "content_block_delta") continue;
      if (event.delta.type !== "text_delta") continue;
      fullText += event.delta.text;
      onToken?.(event.delta.text);
    }
    return fullText;
  }
  /**
   * Send a request with tool definitions. Returns the first tool_use input
   * serialized as JSON, or the first text block (or "") when no tool was used.
   */
  async toolCall(system, messages, tools, maxTokens) {
    const anthropicTools = tools.map((tool) => ({
      name: tool.name,
      description: tool.description,
      input_schema: tool.input_schema
    }));
    const { content } = await this.client.messages.create({
      model: this.model,
      max_tokens: maxTokens,
      system,
      messages,
      tools: anthropicTools
    });
    const toolUse = content.find((block) => block.type === "tool_use");
    if (toolUse?.type === "tool_use") {
      return JSON.stringify(toolUse.input);
    }
    const firstText = content.find((block) => block.type === "text");
    return firstText?.type === "text" ? firstText.text : "";
  }
  /**
   * Produce a single embedding vector via the Voyage API.
   *
   * Reads the key from VOYAGE_API_KEY and throws when it is unset or blank,
   * when the HTTP response is not ok, or when the response has no vector.
   */
  async embed(text) {
    const voyageKey = process.env.VOYAGE_API_KEY?.trim();
    if (!voyageKey) {
      throw new Error(
        "VOYAGE_API_KEY is not set. Anthropic embeddings use Voyage \u2014 set VOYAGE_API_KEY to enable semantic search."
      );
    }
    const payload = JSON.stringify({ input: text, model: EMBEDDING_MODELS.anthropic });
    const response = await fetch(VOYAGE_EMBEDDINGS_URL, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${voyageKey}`
      },
      body: payload
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`Voyage embeddings request failed (${response.status}): ${detail}`);
    }
    const body = await response.json();
    const vector = body.data?.[0]?.embedding;
    if (!Array.isArray(vector)) {
      throw new Error("Voyage embeddings response did not include a vector.");
    }
    return vector;
  }
};
418
+
419
+ // src/utils/claude-settings.ts
420
+ import { readFileSync } from "fs";
421
+ import { homedir } from "os";
422
+ import path4 from "path";
423
+ var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
424
/**
 * Report whether a value is a non-null object.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord(value) {
  return value !== null && typeof value === "object";
}
427
/**
 * Trim a string setting, mapping non-strings and blank strings to undefined.
 *
 * @param {unknown} value
 * @returns {string | undefined}
 */
function normalize(value) {
  const trimmed = typeof value === "string" ? value.trim() : "";
  return trimmed === "" ? void 0 : trimmed;
}
432
/**
 * Locate the Claude settings file.
 *
 * Returns the value of the override variable named by CLAUDE_SETTINGS_PATH_ENV
 * when it is present in `env` (not null/undefined); otherwise
 * `<home>/.claude/settings.json`.
 *
 * @param {Record<string, string | undefined>} env - Environment map to consult.
 * @returns {string} Path of the settings file.
 */
function resolveClaudeSettingsPath(env) {
  const override = env[CLAUDE_SETTINGS_PATH_ENV];
  if (override !== null && override !== void 0) {
    return override;
  }
  return path4.join(homedir(), ".claude", "settings.json");
}
435
/**
 * Read the Claude settings file synchronously as UTF-8 text.
 *
 * @param {string} settingsPath - Path of the settings file.
 * @returns {string | undefined} File contents, or undefined when the file does not exist.
 * @throws {Error} For any read failure other than ENOENT, with the path in the message.
 */
function readClaudeSettingsFile(settingsPath) {
  try {
    return readFileSync(settingsPath, "utf8");
  } catch (err) {
    // A missing file is an expected situation, not an error.
    const isMissing = typeof err === "object" && err !== null && err.code === "ENOENT";
    if (isMissing) {
      return void 0;
    }
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to read Claude settings at "${settingsPath}": ${reason}`);
  }
}
446
/**
 * Load the Anthropic-related entries from the `env` section of the Claude
 * settings file.
 *
 * @param {Record<string, string | undefined>} [env] - Environment used to locate the file.
 * @returns {{ANTHROPIC_API_KEY?: string, ANTHROPIC_AUTH_TOKEN?: string, ANTHROPIC_BASE_URL?: string, ANTHROPIC_MODEL?: string} | undefined}
 *   The normalized values, or undefined when the file is missing/empty, has no
 *   `env` object, or none of the four entries is set.
 * @throws {Error} When the file cannot be read or is not valid JSON.
 */
function readClaudeSettingsEnv(env = process.env) {
  const settingsPath = resolveClaudeSettingsPath(env);
  const raw = readClaudeSettingsFile(settingsPath);
  if (!raw) return void 0;
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${reason}`);
  }
  if (!(isRecord(parsed) && isRecord(parsed.env))) {
    return void 0;
  }
  const keys = [
    "ANTHROPIC_API_KEY",
    "ANTHROPIC_AUTH_TOKEN",
    "ANTHROPIC_BASE_URL",
    "ANTHROPIC_MODEL"
  ];
  const values = Object.fromEntries(
    keys.map((key) => [key, normalize(parsed.env[key])])
  );
  const hasAnyValue = keys.some((key) => Boolean(values[key]));
  return hasAnyValue ? values : void 0;
}
471
/**
 * Best-effort variant of readClaudeSettingsEnv: any error while reading or
 * parsing the settings file is turned into an undefined result.
 *
 * @param {Record<string, string | undefined>} env
 * @returns {object | undefined} Settings values, or undefined on absence or failure.
 */
function tryReadClaudeSettingsEnv(env) {
  let settings;
  try {
    settings = readClaudeSettingsEnv(env);
  } catch {
    // Deliberately ignored: callers treat an unreadable file like a missing one.
    settings = void 0;
  }
  return settings;
}
478
/**
 * Validate that a base URL parses and uses the http or https scheme.
 *
 * @param {string} value - Candidate base URL.
 * @returns {string} The trimmed URL, unchanged otherwise.
 * @throws {Error} "Invalid ANTHROPIC_BASE_URL: ..." when parsing fails or the
 *   protocol is neither http: nor https:.
 */
function validateAnthropicBaseURL(value) {
  const candidate = value.trim();
  let problem;
  try {
    const { protocol } = new URL(candidate);
    if (!["http:", "https:"].includes(protocol)) {
      problem = "Must use http:// or https:// protocol.";
    }
  } catch (err) {
    problem = err instanceof Error ? err.message : "Must be a valid http(s) URL.";
  }
  if (problem !== void 0) {
    throw new Error(`Invalid ANTHROPIC_BASE_URL: "${candidate}". ${problem}`);
  }
  return candidate;
}
491
/**
 * Resolve Anthropic credentials, in priority order:
 * ANTHROPIC_API_KEY in env, ANTHROPIC_AUTH_TOKEN in env, then the same two
 * entries from the Claude settings file.
 *
 * The settings file is read with readClaudeSettingsEnv, so read or parse
 * errors propagate to the caller.
 *
 * @param {Record<string, string | undefined>} [env]
 * @returns {{apiKey: string} | {authToken: string} | {}} Exactly one credential, or an empty object.
 */
function resolveAnthropicAuthFromEnv(env = process.env) {
  const envApiKey = normalize(env.ANTHROPIC_API_KEY);
  if (envApiKey) {
    return { apiKey: envApiKey };
  }
  const envAuthToken = normalize(env.ANTHROPIC_AUTH_TOKEN);
  if (envAuthToken) {
    return { authToken: envAuthToken };
  }
  const settings = readClaudeSettingsEnv(env);
  if (settings?.ANTHROPIC_API_KEY) {
    return { apiKey: settings.ANTHROPIC_API_KEY };
  }
  if (settings?.ANTHROPIC_AUTH_TOKEN) {
    return { authToken: settings.ANTHROPIC_AUTH_TOKEN };
  }
  return {};
}
501
/**
 * Resolve the model name: LLMWIKI_MODEL from env when it is defined (returned
 * as-is, without trimming), otherwise ANTHROPIC_MODEL from the Claude settings
 * file (best-effort read).
 *
 * NOTE(review): unlike the sibling resolvers, an empty LLMWIKI_MODEL is
 * returned verbatim rather than treated as unset — confirm this is intended.
 *
 * @param {Record<string, string | undefined>} [env]
 * @returns {string | undefined}
 */
function resolveAnthropicModelFromEnv(env = process.env) {
  const { LLMWIKI_MODEL: explicitModel } = env;
  return explicitModel === void 0
    ? tryReadClaudeSettingsEnv(env)?.ANTHROPIC_MODEL
    : explicitModel;
}
506
/**
 * Resolve and validate the Anthropic base URL: ANTHROPIC_BASE_URL from env
 * when non-blank, otherwise the value from the Claude settings file
 * (best-effort read).
 *
 * @param {Record<string, string | undefined>} [env]
 * @returns {string | undefined} Validated URL, or undefined when none is configured.
 * @throws {Error} When the configured URL fails validateAnthropicBaseURL.
 */
function resolveAnthropicBaseURLFromEnv(env = process.env) {
  const candidate = normalize(env.ANTHROPIC_BASE_URL) ?? tryReadClaudeSettingsEnv(env)?.ANTHROPIC_BASE_URL;
  return candidate ? validateAnthropicBaseURL(candidate) : void 0;
}
513
+
514
+ // src/ingest/image.ts
515
/** Supported image extensions (lowercase, with leading dot) and their MIME types. */
var EXTENSION_TO_MIME = {
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".png": "image/png",
  ".gif": "image/gif",
  ".webp": "image/webp"
};
/**
 * Look up the MIME type for an image file extension (case-insensitive).
 *
 * @param {string} ext - Extension including the leading dot, e.g. ".png".
 * @returns {string} The matching MIME type.
 * @throws {Error} When the extension is not one of the supported image types.
 */
function mimeTypeForExtension(ext) {
  const key = ext.toLowerCase();
  // Own-property check: a plain lookup would hand back inherited members such
  // as "constructor" or "__proto__" instead of rejecting them.
  if (!Object.hasOwn(EXTENSION_TO_MIME, key)) {
    throw new Error(
      `Unsupported image extension "${ext}". Supported: ${Object.keys(EXTENSION_TO_MIME).join(", ")}`
    );
  }
  return EXTENSION_TO_MIME[key];
}
531
/**
 * Create an Anthropic SDK client from the environment: the base URL is
 * resolved first, then the credentials, and both go through
 * buildAnthropicClientOptions.
 *
 * @returns {object} A new Anthropic client instance.
 */
function buildClient() {
  const clientOptions = buildAnthropicClientOptions({
    baseURL: resolveAnthropicBaseURLFromEnv(),
    ...resolveAnthropicAuthFromEnv()
  });
  return new Anthropic2(clientOptions);
}
536
/**
 * Ask the model to transcribe and describe an image.
 *
 * Sends one user message with the base64 image followed by a fixed text
 * instruction, capped at IMAGE_DESCRIBE_MAX_TOKENS output tokens.
 *
 * @param {object} client - Anthropic SDK client (uses client.messages.create).
 * @param {string} model - Model identifier.
 * @param {string} imageData - Base64-encoded image bytes.
 * @param {string} mimeType - Media type of the image.
 * @returns {Promise<string>} Text of the first text block in the response, or "".
 */
async function describeImageWithVision(client, model, imageData, mimeType) {
  const imagePart = {
    type: "image",
    source: { type: "base64", media_type: mimeType, data: imageData }
  };
  const instructionPart = {
    type: "text",
    text: "Extract and transcribe all text visible in this image. Then provide a detailed description of any non-text visual content. Format your response as markdown."
  };
  const { content } = await client.messages.create({
    model,
    max_tokens: IMAGE_DESCRIBE_MAX_TOKENS,
    messages: [{ role: "user", content: [imagePart, instructionPart] }]
  });
  const firstText = content.find((block) => block.type === "text");
  return firstText?.type === "text" ? firstText.text : "";
}
559
/**
 * Ingest an image file by having the Anthropic vision model describe it.
 *
 * @param {string} filePath - Path of the image file.
 * @returns {Promise<{title: string, content: string}>} Title from the file
 *   name and the model's markdown description.
 * @throws {Error} When LLMWIKI_PROVIDER is set to anything other than
 *   "anthropic", or when the extension is not a supported image type.
 */
async function ingestImage(filePath) {
  const providerName = process.env.LLMWIKI_PROVIDER ?? "anthropic";
  if (providerName !== "anthropic") {
    throw new Error(
      `Image ingest requires the Anthropic provider (vision). Current provider: "${providerName}". Set LLMWIKI_PROVIDER=anthropic and ANTHROPIC_API_KEY to use image ingest.`
    );
  }
  // Reject unsupported extensions before touching the file system.
  const mimeType = mimeTypeForExtension(path5.extname(filePath).toLowerCase());
  const imageData = (await readFile4(filePath)).toString("base64");
  const client = buildClient();
  const model = resolveAnthropicModelFromEnv() ?? PROVIDER_MODELS.anthropic;
  const content = await describeImageWithVision(client, model, imageData, mimeType);
  return { title: titleFromFilename(filePath), content };
}
576
+
577
+ // src/ingest/transcript.ts
578
+ import { readFile as readFile5 } from "fs/promises";
579
+ import path6 from "path";
580
+ import { YoutubeTranscript as YoutubeTranscriptUntyped } from "youtube-transcript/dist/youtube-transcript.esm.js";
581
+ var YoutubeTranscript = YoutubeTranscriptUntyped;
582
+ var YOUTUBE_URL_PATTERN = /^https?:\/\/(www\.)?(youtube\.com\/watch|youtu\.be\/)/;
583
+ var SRT_SEQUENCE_PATTERN = /^\d+$/;
584
+ var TIMESTAMP_PATTERN = /\d{2}:\d{2}[:.]\d{2}/;
585
+ var MS_PER_MINUTE = 6e4;
586
+ var MS_PER_SECOND = 1e3;
587
/**
 * Report whether a source string matches YOUTUBE_URL_PATTERN.
 *
 * @param {string} source2 - Source string given to the ingest command.
 * @returns {boolean}
 */
function isYoutubeUrl(source2) {
  const matched = YOUTUBE_URL_PATTERN.exec(source2);
  return matched !== null;
}
590
/**
 * Extract the video ID from a YouTube watch URL or a youtu.be short link.
 *
 * The URL is parsed structurally so that only the real `v` query parameter
 * (or the first path segment of a youtu.be link) is used; other parameters
 * that merely end in "v=" and trailing `#fragments` no longer leak into the
 * result. Strings that are not absolute URLs fall back to pattern matching.
 *
 * @param {string} url - YouTube URL.
 * @returns {string} The video ID.
 * @throws {Error} When no video ID can be found.
 */
function extractVideoId(url) {
  let videoId;
  try {
    const parsed = new URL(url);
    const host = parsed.hostname.toLowerCase();
    if (host === "youtu.be" || host.endsWith(".youtu.be")) {
      videoId = parsed.pathname.split("/").find((segment) => segment.length > 0);
    } else {
      videoId = parsed.searchParams.get("v") ?? void 0;
    }
  } catch {
    // Not an absolute URL; the pattern fallback below handles it.
  }
  if (!videoId) {
    const match = String(url).match(/(?:(?:^|[?&])v=|youtu\.be\/)([^&?/#]+)/);
    videoId = match?.[1];
  }
  if (!videoId) {
    throw new Error(`Could not extract video ID from YouTube URL: ${url}`);
  }
  return videoId;
}
597
/**
 * Format a millisecond offset as "MM:SS", each part zero-padded to at least
 * two digits. Minutes are not rolled over into hours.
 *
 * @param {number} offsetMs - Offset in milliseconds.
 * @returns {string}
 */
function formatOffset(offsetMs) {
  const twoDigits = (n) => String(n).padStart(2, "0");
  const wholeMinutes = Math.floor(offsetMs / MS_PER_MINUTE);
  const leftoverMs = offsetMs % MS_PER_MINUTE;
  const wholeSeconds = Math.floor(leftoverMs / MS_PER_SECOND);
  return `${twoDigits(wholeMinutes)}:${twoDigits(wholeSeconds)}`;
}
602
/**
 * Download the transcript of a YouTube video and render it as one
 * "[MM:SS] text" line per segment.
 *
 * @param {string} url - YouTube video URL.
 * @returns {Promise<{title: string, content: string}>}
 * @throws {Error} When the video ID cannot be extracted or no transcript
 *   segments are returned.
 */
async function fetchYoutubeTranscript(url) {
  const videoId = extractVideoId(url);
  const segments = await YoutubeTranscript.fetchTranscript(videoId);
  const isEmpty = !segments || segments.length === 0;
  if (isEmpty) {
    throw new Error(`No transcript available for YouTube video: ${url}`);
  }
  const content = segments
    .map((seg) => `[${formatOffset(seg.offset)}] ${seg.text}`)
    .join("\n");
  return { title: `YouTube Transcript ${videoId}`, content };
}
614
/**
 * Report whether a trimmed line is a cue timing line: it must match
 * TIMESTAMP_PATTERN and contain the "-->" separator.
 *
 * @param {string} trimmed - A line with surrounding whitespace removed.
 * @returns {boolean}
 */
function isCueTimestamp(trimmed) {
  if (!TIMESTAMP_PATTERN.test(trimmed)) {
    return false;
  }
  return trimmed.includes("-->");
}
617
/**
 * Convert WebVTT text to markdown: each cue timing line becomes a bold
 * "**[timing]**" heading preceded by a blank line, followed by the cue's text
 * lines. Lines outside a cue (header, identifiers, notes) are dropped.
 *
 * @param {string} raw - Contents of the .vtt file.
 * @param {string} filePath - Path used to derive the title.
 * @returns {{title: string, content: string}}
 */
function parseVtt(raw, filePath) {
  const collected = [];
  let insideCue = false;
  for (const rawLine of raw.split("\n")) {
    const text = rawLine.trim();
    if (text === "WEBVTT" || text === "") {
      // A blank line (or the file header) ends the current cue.
      insideCue = false;
    } else if (isCueTimestamp(text)) {
      collected.push(`\n**[${text}]**`);
      insideCue = true;
    } else if (insideCue) {
      collected.push(text);
    }
  }
  const content = collected.join("\n").trim();
  return { title: titleFromFilename(filePath), content };
}
639
/**
 * Convert SubRip (.srt) text to markdown: each cue timing line becomes a bold
 * "**[timing]**" heading preceded by a blank line, followed by the cue's text
 * lines. Blank lines and cue sequence counters are dropped.
 *
 * A digits-only line is treated as a sequence counter only when the next line
 * is a cue timing line. Previously every digits-only line was discarded, which
 * silently removed numeric caption text such as "2024".
 *
 * @param {string} raw - Contents of the .srt file.
 * @param {string} filePath - Path used to derive the title.
 * @returns {{title: string, content: string}}
 */
function parseSrt(raw, filePath) {
  const lines = raw.split("\n").map((line) => line.trim());
  const output = [];
  for (let i = 0; i < lines.length; i += 1) {
    const trimmed = lines[i];
    if (trimmed === "") {
      continue;
    }
    const nextLine = lines[i + 1] ?? "";
    if (SRT_SEQUENCE_PATTERN.test(trimmed) && isCueTimestamp(nextLine)) {
      continue;
    }
    if (isCueTimestamp(trimmed)) {
      output.push(`\n**[${trimmed}]**`);
      continue;
    }
    output.push(trimmed);
  }
  return { title: titleFromFilename(filePath), content: output.join("\n").trim() };
}
658
/**
 * Treat a plain-text transcript as-is: the content is the trimmed file text
 * and the title comes from the file name.
 *
 * @param {string} raw - Contents of the transcript file.
 * @param {string} filePath - Path used to derive the title.
 * @returns {{title: string, content: string}}
 */
function parsePlainTranscript(raw, filePath) {
  const title = titleFromFilename(filePath);
  const content = raw.trim();
  return { title, content };
}
661
/**
 * Ingest a transcript from a YouTube URL or a local .vtt / .srt / .txt file.
 *
 * The extension is validated before the file is read, so an unsupported type
 * fails fast with a clear message instead of doing I/O first (and possibly
 * surfacing an unrelated file-system error).
 *
 * @param {string} source2 - YouTube URL or path of a transcript file.
 * @returns {Promise<{title: string, content: string}>}
 * @throws {Error} When the file extension is not .vtt, .srt or .txt.
 */
async function ingestTranscript(source2) {
  if (isYoutubeUrl(source2)) {
    return fetchYoutubeTranscript(source2);
  }
  const ext = path6.extname(source2).toLowerCase();
  let parse;
  switch (ext) {
    case ".vtt":
      parse = parseVtt;
      break;
    case ".srt":
      parse = parseSrt;
      break;
    case ".txt":
      parse = parsePlainTranscript;
      break;
    default:
      throw new Error(
        `Unsupported transcript file type "${ext}". Supported: .vtt, .srt, .txt`
      );
  }
  const raw = await readFile5(source2, "utf-8");
  return parse(raw, source2);
}
674
+
264
675
  // src/commands/ingest.ts
265
676
  function isUrl(source2) {
266
677
  return source2.startsWith("http://") || source2.startsWith("https://");
267
678
  }
679
+ var TXT_SNIFF_BYTES = 2048;
680
+ var SPEAKER_TAG_PATTERN = /^([A-Z][a-zA-Z .'-]{0,40}):\s/gm;
681
+ var TIMESTAMP_PATTERN2 = /^\s*\d{1,2}:\d{2}(:\d{2})?/;
682
+ var MIN_TIMESTAMP_MATCHES = 3;
683
+ var MIN_SPEAKER_REPEAT_COUNT = 2;
684
+ var MIN_DISTINCT_SPEAKERS = 2;
685
/**
 * Count how often each speaker tag (as matched by SPEAKER_TAG_PATTERN)
 * appears in a text sample.
 *
 * @param {string} sample - Text to scan.
 * @returns {Map<string, number>} Trimmed speaker name mapped to occurrence count.
 */
function countSpeakerOccurrences(sample) {
  const tally = /* @__PURE__ */ new Map();
  // The pattern is a shared global regex; start scanning from the beginning.
  SPEAKER_TAG_PATTERN.lastIndex = 0;
  for (const found of sample.matchAll(SPEAKER_TAG_PATTERN)) {
    const speaker = found[1].trim();
    const seenSoFar = tally.get(speaker) ?? 0;
    tally.set(speaker, seenSoFar + 1);
  }
  return tally;
}
695
/**
 * Decide whether a sample looks like speaker-tagged dialogue: it needs at
 * least MIN_DISTINCT_SPEAKERS different speakers, and at least one of them
 * must appear MIN_SPEAKER_REPEAT_COUNT times or more.
 *
 * @param {string} sample - Text to inspect.
 * @returns {boolean}
 */
function hasSpeakerDialoguePattern(sample) {
  const tally = countSpeakerOccurrences(sample);
  if (tally.size < MIN_DISTINCT_SPEAKERS) {
    return false;
  }
  for (const occurrences of tally.values()) {
    if (occurrences >= MIN_SPEAKER_REPEAT_COUNT) {
      return true;
    }
  }
  return false;
}
704
/**
 * Sniff the beginning of a .txt file to guess whether it is a transcript.
 *
 * The whole file is read as UTF-8 and the first TXT_SNIFF_BYTES characters
 * are inspected. The sample counts as a transcript when it shows a speaker
 * dialogue pattern, or when at least MIN_TIMESTAMP_MATCHES lines start with
 * a timestamp.
 *
 * @param {string} filePath - Path of the text file.
 * @returns {Promise<boolean>}
 */
async function looksLikeTxtTranscript(filePath) {
  const head = (await readFile6(filePath, "utf-8")).slice(0, TXT_SNIFF_BYTES);
  if (hasSpeakerDialoguePattern(head)) {
    return true;
  }
  // Rebuild the pattern with g+m flags so every line start is considered.
  const lineTimestamp = new RegExp(TIMESTAMP_PATTERN2.source, "gm");
  const hits = head.match(lineTimestamp);
  const hitCount = hits === null ? 0 : hits.length;
  return hitCount >= MIN_TIMESTAMP_MATCHES;
}
268
711
  function enforceCharLimit(content) {
269
712
  if (content.length <= MAX_SOURCE_CHARS) {
270
713
  return { content, truncated: false, originalChars: content.length };
@@ -297,12 +740,30 @@ function enforceMinContent(content) {
297
740
  );
298
741
  }
299
742
  }
300
- function buildDocument(title, source2, result) {
743
/**
 * Classify an ingest source as "web", "pdf", "image", "transcript" or "file".
 *
 * URLs are "transcript" when they are YouTube links and "web" otherwise.
 * Local paths are classified by extension; .txt files are additionally
 * sniffed to tell transcripts from ordinary text.
 *
 * @param {string} source2 - URL or local file path.
 * @returns {Promise<string>} The detected source type.
 */
async function detectSourceType(source2) {
  if (isUrl(source2)) {
    return isYoutubeUrl(source2) ? "transcript" : "web";
  }
  const ext = path7.extname(source2).toLowerCase();
  if (ext === ".pdf") {
    return "pdf";
  }
  if (IMAGE_EXTENSIONS.has(ext)) {
    return "image";
  }
  if (TRANSCRIPT_EXTENSIONS.has(ext)) {
    return "transcript";
  }
  if (ext !== ".txt") {
    return "file";
  }
  const isTranscript = await looksLikeTxtTranscript(source2);
  return isTranscript ? "transcript" : "file";
}
758
+ function buildDocument(title, source2, result, sourceType) {
301
759
  const meta = {
302
760
  title,
303
761
  source: source2,
304
762
  ingestedAt: (/* @__PURE__ */ new Date()).toISOString()
305
763
  };
764
+ if (sourceType !== void 0) {
765
+ meta.sourceType = sourceType;
766
+ }
306
767
  if (result.truncated) {
307
768
  meta.truncated = true;
308
769
  meta.originalChars = result.originalChars;
@@ -313,30 +774,46 @@ function buildDocument(title, source2, result) {
313
774
  ${result.content}
314
775
  `;
315
776
  }
777
/**
 * Dispatch to the ingest routine that matches the detected source type.
 *
 * @param {string} source2 - URL or local file path.
 * @param {string} sourceType - One of "web", "pdf", "image", "transcript", "file".
 * @returns {Promise<{title: string, content: string}>}
 * @throws {Error} When sourceType is not recognised. Previously this resolved
 *   to undefined, so callers crashed later while destructuring the result.
 */
async function fetchContent(source2, sourceType) {
  switch (sourceType) {
    case "web":
      return ingestWeb(source2);
    case "pdf":
      return ingestPdf(source2);
    case "image":
      return ingestImage(source2);
    case "transcript":
      return ingestTranscript(source2);
    case "file":
      return ingestFile(source2);
    default:
      throw new Error(`Unsupported source type "${sourceType}" for source: ${source2}`);
  }
}
316
791
  async function saveSource(title, document) {
317
792
  const filename = `${slugify(title)}.md`;
318
- const destPath = path3.join(SOURCES_DIR, filename);
793
+ const destPath = path7.join(SOURCES_DIR, filename);
319
794
  await mkdir2(SOURCES_DIR, { recursive: true });
320
795
  await writeFile2(destPath, document, "utf-8");
321
796
  return destPath;
322
797
  }
323
798
  async function ingestSource(source2) {
324
- status("*", info(`Ingesting: ${source2}`));
325
- const { title, content } = isUrl(source2) ? await ingestWeb(source2) : await ingestFile(source2);
799
+ const sourceType = await detectSourceType(source2);
800
+ status("*", info(`Ingesting [${sourceType}]: ${source2}`));
801
+ const { title, content } = await fetchContent(source2, sourceType);
326
802
  const result = enforceCharLimit(content);
327
803
  enforceMinContent(result.content);
328
- const document = buildDocument(title, source2, result);
804
+ const document = buildDocument(title, source2, result, sourceType);
329
805
  const savedPath = await saveSource(title, document);
330
806
  return {
331
- filename: path3.basename(savedPath),
807
+ filename: path7.basename(savedPath),
332
808
  charCount: result.content.length,
333
809
  truncated: result.truncated,
334
- source: source2
810
+ source: source2,
811
+ sourceType
335
812
  };
336
813
  }
337
814
  async function ingest(source2) {
338
815
  const result = await ingestSource(source2);
339
- const savedPath = path3.join(SOURCES_DIR, result.filename);
816
+ const savedPath = path7.join(SOURCES_DIR, result.filename);
340
817
  status(
341
818
  "+",
342
819
  success(`Saved ${bold(result.filename)} \u2192 ${source(savedPath)}`)
@@ -348,23 +825,23 @@ async function ingest(source2) {
348
825
  import { existsSync as existsSync7 } from "fs";
349
826
 
350
827
  // src/compiler/index.ts
351
- import { readFile as readFile10 } from "fs/promises";
352
- import path18 from "path";
828
+ import { readFile as readFile14 } from "fs/promises";
829
+ import path21 from "path";
353
830
 
354
831
  // src/utils/state.ts
355
- import { readFile as readFile3, writeFile as writeFile3, rename as rename2, mkdir as mkdir3, copyFile } from "fs/promises";
832
+ import { readFile as readFile7, writeFile as writeFile3, rename as rename2, mkdir as mkdir3, copyFile } from "fs/promises";
356
833
  import { existsSync } from "fs";
357
- import path4 from "path";
834
+ import path8 from "path";
358
835
  function emptyState() {
359
836
  return { version: 1, indexHash: "", sources: {} };
360
837
  }
361
838
  async function readState(root) {
362
- const filePath = path4.join(root, STATE_FILE);
839
+ const filePath = path8.join(root, STATE_FILE);
363
840
  if (!existsSync(filePath)) {
364
841
  return emptyState();
365
842
  }
366
843
  try {
367
- const raw = await readFile3(filePath, "utf-8");
844
+ const raw = await readFile7(filePath, "utf-8");
368
845
  return JSON.parse(raw);
369
846
  } catch {
370
847
  const bakPath = filePath + ".bak";
@@ -374,9 +851,9 @@ async function readState(root) {
374
851
  }
375
852
  }
376
853
  async function writeState(root, state) {
377
- const dir = path4.join(root, LLMWIKI_DIR);
854
+ const dir = path8.join(root, LLMWIKI_DIR);
378
855
  await mkdir3(dir, { recursive: true });
379
- const filePath = path4.join(root, STATE_FILE);
856
+ const filePath = path8.join(root, STATE_FILE);
380
857
  const tmpPath = filePath + ".tmp";
381
858
  await writeFile3(tmpPath, JSON.stringify(state, null, 2), "utf-8");
382
859
  await rename2(tmpPath, filePath);
@@ -393,18 +870,18 @@ async function removeSourceState(root, sourceFile) {
393
870
  }
394
871
 
395
872
  // src/compiler/source-state.ts
396
- import path6 from "path";
873
+ import path10 from "path";
397
874
 
398
875
  // src/compiler/hasher.ts
399
876
  import { createHash } from "crypto";
400
- import { readFile as readFile4, readdir } from "fs/promises";
401
- import path5 from "path";
877
+ import { readFile as readFile8, readdir } from "fs/promises";
878
+ import path9 from "path";
402
879
  async function hashFile(filePath) {
403
- const content = await readFile4(filePath, "utf-8");
880
+ const content = await readFile8(filePath, "utf-8");
404
881
  return createHash("sha256").update(content).digest("hex");
405
882
  }
406
883
  async function detectChanges(root, prevState) {
407
- const sourcesPath = path5.join(root, SOURCES_DIR);
884
+ const sourcesPath = path9.join(root, SOURCES_DIR);
408
885
  const currentFiles = await listSourceFiles(sourcesPath);
409
886
  const changes = [];
410
887
  for (const file of currentFiles) {
@@ -424,7 +901,7 @@ async function listSourceFiles(sourcesPath) {
424
901
  }
425
902
  }
426
903
  async function classifyFile(root, file, prevState) {
427
- const filePath = path5.join(root, SOURCES_DIR, file);
904
+ const filePath = path9.join(root, SOURCES_DIR, file);
428
905
  const hash = await hashFile(filePath);
429
906
  const prev = prevState.sources[file];
430
907
  if (!prev) return "new";
@@ -447,133 +924,22 @@ async function buildExtractionSourceStates(root, extractions) {
447
924
  return snapshot;
448
925
  }
449
926
  async function buildEntry(root, result, compiledAt) {
450
- const filePath = path6.join(root, SOURCES_DIR, result.sourceFile);
927
+ const filePath = path10.join(root, SOURCES_DIR, result.sourceFile);
451
928
  const hash = await hashFile(filePath);
452
- return {
453
- hash,
454
- concepts: result.concepts.map((concept) => slugify(concept.concept)),
455
- compiledAt
456
- };
457
- }
458
- function pickStatesForSources(allStates, sourceFiles) {
459
- const picked = {};
460
- for (const file of sourceFiles) {
461
- const entry = allStates[file];
462
- if (entry) picked[file] = entry;
463
- }
464
- return picked;
465
- }
466
-
467
- // src/providers/anthropic.ts
468
- import Anthropic from "@anthropic-ai/sdk";
469
- var VOYAGE_EMBEDDINGS_URL = "https://api.voyageai.com/v1/embeddings";
470
- function buildAnthropicClientOptions(options = {}) {
471
- const trimmedBaseURL = options.baseURL?.trim();
472
- const trimmedApiKey = options.apiKey?.trim();
473
- const trimmedAuthToken = options.authToken?.trim();
474
- const result = {};
475
- if (trimmedApiKey) {
476
- result.apiKey = trimmedApiKey;
477
- }
478
- if (trimmedAuthToken) {
479
- result.authToken = trimmedAuthToken;
480
- }
481
- if (!trimmedBaseURL) {
482
- return result;
483
- }
484
- const normalizedBaseURL = trimmedBaseURL.endsWith("/") && trimmedBaseURL.length > 1 ? trimmedBaseURL.slice(0, -1) : trimmedBaseURL;
485
- result.baseURL = normalizedBaseURL;
486
- return result;
487
- }
488
- var AnthropicProvider = class {
489
- client;
490
- model;
491
- constructor(model, options = {}) {
492
- this.model = model;
493
- this.client = new Anthropic(buildAnthropicClientOptions(options));
494
- }
495
- /** Send a single non-streaming completion request. */
496
- async complete(system, messages, maxTokens) {
497
- const response = await this.client.messages.create({
498
- model: this.model,
499
- max_tokens: maxTokens,
500
- system,
501
- messages
502
- });
503
- const textBlock = response.content.find((block) => block.type === "text");
504
- return textBlock?.type === "text" ? textBlock.text : "";
505
- }
506
- /** Stream a completion, invoking onToken for each text chunk. */
507
- async stream(system, messages, maxTokens, onToken) {
508
- const stream = this.client.messages.stream({
509
- model: this.model,
510
- max_tokens: maxTokens,
511
- system,
512
- messages
513
- });
514
- let fullText = "";
515
- for await (const event of stream) {
516
- if (event.type === "content_block_delta" && event.delta.type === "text_delta") {
517
- fullText += event.delta.text;
518
- onToken?.(event.delta.text);
519
- }
520
- }
521
- return fullText;
522
- }
523
- /** Call Claude with tool definitions and return the parsed tool input as JSON. */
524
- async toolCall(system, messages, tools, maxTokens) {
525
- const anthropicTools = tools.map((t) => ({
526
- name: t.name,
527
- description: t.description,
528
- input_schema: t.input_schema
529
- }));
530
- const response = await this.client.messages.create({
531
- model: this.model,
532
- max_tokens: maxTokens,
533
- system,
534
- messages,
535
- tools: anthropicTools
536
- });
537
- const toolBlock = response.content.find((block) => block.type === "tool_use");
538
- if (toolBlock?.type === "tool_use") {
539
- return JSON.stringify(toolBlock.input);
540
- }
541
- const textBlock = response.content.find((block) => block.type === "text");
542
- return textBlock?.type === "text" ? textBlock.text : "";
543
- }
544
- /**
545
- * Produce a single embedding vector via the Voyage API.
546
- *
547
- * Anthropic does not ship a first-party embeddings endpoint, so we delegate
548
- * to Voyage (their recommended partner). Requires VOYAGE_API_KEY.
549
- */
550
- async embed(text) {
551
- const apiKey = process.env.VOYAGE_API_KEY?.trim();
552
- if (!apiKey) {
553
- throw new Error(
554
- "VOYAGE_API_KEY is not set. Anthropic embeddings use Voyage \u2014 set VOYAGE_API_KEY to enable semantic search."
555
- );
556
- }
557
- const response = await fetch(VOYAGE_EMBEDDINGS_URL, {
558
- method: "POST",
559
- headers: {
560
- "Content-Type": "application/json",
561
- Authorization: `Bearer ${apiKey}`
562
- },
563
- body: JSON.stringify({ input: text, model: EMBEDDING_MODELS.anthropic })
564
- });
565
- if (!response.ok) {
566
- const detail = await response.text();
567
- throw new Error(`Voyage embeddings request failed (${response.status}): ${detail}`);
568
- }
569
- const json = await response.json();
570
- const vector = json.data?.[0]?.embedding;
571
- if (!Array.isArray(vector)) {
572
- throw new Error("Voyage embeddings response did not include a vector.");
573
- }
574
- return vector;
929
+ return {
930
+ hash,
931
+ concepts: result.concepts.map((concept) => slugify(concept.concept)),
932
+ compiledAt
933
+ };
934
+ }
935
+ function pickStatesForSources(allStates, sourceFiles) {
936
+ const picked = {};
937
+ for (const file of sourceFiles) {
938
+ const entry = allStates[file];
939
+ if (entry) picked[file] = entry;
575
940
  }
576
- };
941
+ return picked;
942
+ }
577
943
 
578
944
  // src/providers/openai.ts
579
945
  import OpenAI from "openai";
@@ -704,101 +1070,6 @@ var MiniMaxProvider = class extends OpenAIProvider {
704
1070
  }
705
1071
  };
706
1072
 
707
- // src/utils/claude-settings.ts
708
- import { readFileSync } from "fs";
709
- import { homedir } from "os";
710
- import path7 from "path";
711
- var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
712
- function isRecord(value) {
713
- return typeof value === "object" && value !== null;
714
- }
715
- function normalize(value) {
716
- if (typeof value !== "string") return void 0;
717
- const trimmed = value.trim();
718
- return trimmed.length > 0 ? trimmed : void 0;
719
- }
720
- function resolveClaudeSettingsPath(env) {
721
- return env[CLAUDE_SETTINGS_PATH_ENV] ?? path7.join(homedir(), ".claude", "settings.json");
722
- }
723
- function readClaudeSettingsFile(settingsPath) {
724
- try {
725
- return readFileSync(settingsPath, "utf8");
726
- } catch (err) {
727
- if (isRecord(err) && err.code === "ENOENT") {
728
- return void 0;
729
- }
730
- const message = err instanceof Error ? err.message : String(err);
731
- throw new Error(`Failed to read Claude settings at "${settingsPath}": ${message}`);
732
- }
733
- }
734
- function readClaudeSettingsEnv(env = process.env) {
735
- const settingsPath = resolveClaudeSettingsPath(env);
736
- const raw = readClaudeSettingsFile(settingsPath);
737
- if (!raw) return void 0;
738
- let parsed;
739
- try {
740
- parsed = JSON.parse(raw);
741
- } catch (err) {
742
- const message = err instanceof Error ? err.message : String(err);
743
- throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${message}`);
744
- }
745
- if (!isRecord(parsed) || !isRecord(parsed.env)) {
746
- return void 0;
747
- }
748
- const values = {
749
- ANTHROPIC_API_KEY: normalize(parsed.env.ANTHROPIC_API_KEY),
750
- ANTHROPIC_AUTH_TOKEN: normalize(parsed.env.ANTHROPIC_AUTH_TOKEN),
751
- ANTHROPIC_BASE_URL: normalize(parsed.env.ANTHROPIC_BASE_URL),
752
- ANTHROPIC_MODEL: normalize(parsed.env.ANTHROPIC_MODEL)
753
- };
754
- if (!values.ANTHROPIC_API_KEY && !values.ANTHROPIC_AUTH_TOKEN && !values.ANTHROPIC_BASE_URL && !values.ANTHROPIC_MODEL) {
755
- return void 0;
756
- }
757
- return values;
758
- }
759
- function tryReadClaudeSettingsEnv(env) {
760
- try {
761
- return readClaudeSettingsEnv(env);
762
- } catch {
763
- return void 0;
764
- }
765
- }
766
- function validateAnthropicBaseURL(value) {
767
- const normalized = value.trim();
768
- try {
769
- const parsed = new URL(normalized);
770
- if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
771
- throw new Error("Must use http:// or https:// protocol.");
772
- }
773
- } catch (err) {
774
- const message = err instanceof Error ? err.message : "Must be a valid http(s) URL.";
775
- throw new Error(`Invalid ANTHROPIC_BASE_URL: "${normalized}". ${message}`);
776
- }
777
- return normalized;
778
- }
779
- function resolveAnthropicAuthFromEnv(env = process.env) {
780
- const explicitApiKey = normalize(env.ANTHROPIC_API_KEY);
781
- if (explicitApiKey) return { apiKey: explicitApiKey };
782
- const explicitAuthToken = normalize(env.ANTHROPIC_AUTH_TOKEN);
783
- if (explicitAuthToken) return { authToken: explicitAuthToken };
784
- const fallback = readClaudeSettingsEnv(env);
785
- if (fallback?.ANTHROPIC_API_KEY) return { apiKey: fallback.ANTHROPIC_API_KEY };
786
- if (fallback?.ANTHROPIC_AUTH_TOKEN) return { authToken: fallback.ANTHROPIC_AUTH_TOKEN };
787
- return {};
788
- }
789
- function resolveAnthropicModelFromEnv(env = process.env) {
790
- const explicitModel = env.LLMWIKI_MODEL;
791
- if (explicitModel !== void 0) return explicitModel;
792
- return tryReadClaudeSettingsEnv(env)?.ANTHROPIC_MODEL;
793
- }
794
- function resolveAnthropicBaseURLFromEnv(env = process.env) {
795
- const explicitBaseURL = normalize(env.ANTHROPIC_BASE_URL);
796
- if (explicitBaseURL) return validateAnthropicBaseURL(explicitBaseURL);
797
- const fallbackBaseURL = tryReadClaudeSettingsEnv(env)?.ANTHROPIC_BASE_URL;
798
- if (!fallbackBaseURL) return void 0;
799
- return validateAnthropicBaseURL(fallbackBaseURL);
800
- }
801
-
802
1073
  // src/utils/provider.ts
803
1074
  var SUPPORTED_PROVIDERS = /* @__PURE__ */ new Set(["anthropic", "openai", "ollama", "minimax"]);
804
1075
  function getProvider() {
@@ -891,8 +1162,8 @@ async function callClaude(options) {
891
1162
  }
892
1163
 
893
1164
  // src/utils/lock.ts
894
- import { open, readFile as readFile5, unlink, mkdir as mkdir4 } from "fs/promises";
895
- import path8 from "path";
1165
+ import { open, readFile as readFile9, unlink, mkdir as mkdir4 } from "fs/promises";
1166
+ import path11 from "path";
896
1167
  var RECLAIM_SUFFIX = ".reclaim";
897
1168
  var MAX_ACQUIRE_ATTEMPTS = 2;
898
1169
  function isProcessAlive(pid) {
@@ -904,8 +1175,8 @@ function isProcessAlive(pid) {
904
1175
  }
905
1176
  }
906
1177
  async function acquireLock(root) {
907
- const lockPath = path8.join(root, LOCK_FILE);
908
- await mkdir4(path8.join(root, LLMWIKI_DIR), { recursive: true });
1178
+ const lockPath = path11.join(root, LOCK_FILE);
1179
+ await mkdir4(path11.join(root, LLMWIKI_DIR), { recursive: true });
909
1180
  for (let attempt = 0; attempt < MAX_ACQUIRE_ATTEMPTS; attempt++) {
910
1181
  const created = await tryCreateLock(lockPath);
911
1182
  if (created) return true;
@@ -968,7 +1239,7 @@ async function tryCreateLock(lockPath) {
968
1239
  }
969
1240
  async function isLockStale(lockPath) {
970
1241
  try {
971
- const content = await readFile5(lockPath, "utf-8");
1242
+ const content = await readFile9(lockPath, "utf-8");
972
1243
  const pid = parseInt(content.trim(), 10);
973
1244
  if (isNaN(pid)) return true;
974
1245
  return !isProcessAlive(pid);
@@ -977,7 +1248,7 @@ async function isLockStale(lockPath) {
977
1248
  }
978
1249
  }
979
1250
  async function releaseLock(root) {
980
- const lockPath = path8.join(root, LOCK_FILE);
1251
+ const lockPath = path11.join(root, LOCK_FILE);
981
1252
  try {
982
1253
  await unlink(lockPath);
983
1254
  } catch {
@@ -1220,8 +1491,8 @@ function buildDefaultSchema() {
1220
1491
 
1221
1492
  // src/schema/loader.ts
1222
1493
  import { existsSync as existsSync2 } from "fs";
1223
- import { readFile as readFile6 } from "fs/promises";
1224
- import path9 from "path";
1494
+ import { readFile as readFile10 } from "fs/promises";
1495
+ import path12 from "path";
1225
1496
  import yaml2 from "js-yaml";
1226
1497
  var SCHEMA_CANDIDATE_PATHS = [
1227
1498
  ".llmwiki/schema.json",
@@ -1232,7 +1503,7 @@ var SCHEMA_CANDIDATE_PATHS = [
1232
1503
  ];
1233
1504
  function findSchemaPath(root) {
1234
1505
  for (const candidate of SCHEMA_CANDIDATE_PATHS) {
1235
- const absolute = path9.join(root, candidate);
1506
+ const absolute = path12.join(root, candidate);
1236
1507
  if (existsSync2(absolute)) return absolute;
1237
1508
  }
1238
1509
  return null;
@@ -1285,12 +1556,12 @@ async function loadSchema(root) {
1285
1556
  const defaults = buildDefaultSchema();
1286
1557
  const schemaPath = findSchemaPath(root);
1287
1558
  if (!schemaPath) return defaults;
1288
- const raw = await readFile6(schemaPath, "utf-8");
1559
+ const raw = await readFile10(schemaPath, "utf-8");
1289
1560
  const parsed = parseSchemaFile(schemaPath, raw);
1290
1561
  return applyOverrides(defaults, parsed, schemaPath);
1291
1562
  }
1292
1563
  function defaultSchemaInitPath(root) {
1293
- return path9.join(root, SCHEMA_CANDIDATE_PATHS[0]);
1564
+ return path12.join(root, SCHEMA_CANDIDATE_PATHS[0]);
1294
1565
  }
1295
1566
 
1296
1567
  // src/schema/helpers.ts
@@ -1462,7 +1733,7 @@ async function freezeFailedExtractions(root, results, frozenSlugs) {
1462
1733
  }
1463
1734
 
1464
1735
  // src/compiler/orphan.ts
1465
- import path10 from "path";
1736
+ import path13 from "path";
1466
1737
  async function markOrphaned(root, sourceFile, state) {
1467
1738
  const sourceEntry = state.sources[sourceFile];
1468
1739
  if (!sourceEntry) return;
@@ -1488,7 +1759,7 @@ async function orphanUnownedFrozenPages(root, frozenSlugs) {
1488
1759
  }
1489
1760
  }
1490
1761
  async function orphanPage(root, slug, reason) {
1491
- const pagePath = path10.join(root, CONCEPTS_DIR, `${slug}.md`);
1762
+ const pagePath = path13.join(root, CONCEPTS_DIR, `${slug}.md`);
1492
1763
  const content = await safeReadFile(pagePath);
1493
1764
  if (!content) return;
1494
1765
  const { meta } = parseFrontmatter(content);
@@ -1499,18 +1770,18 @@ async function orphanPage(root, slug, reason) {
1499
1770
  }
1500
1771
 
1501
1772
  // src/compiler/resolver.ts
1502
- import { readdir as readdir2, readFile as readFile7 } from "fs/promises";
1503
- import path11 from "path";
1773
+ import { readdir as readdir2, readFile as readFile11 } from "fs/promises";
1774
+ import path14 from "path";
1504
1775
  import { existsSync as existsSync3 } from "fs";
1505
1776
  async function buildTitleIndex(root) {
1506
- const conceptsDir = path11.join(root, CONCEPTS_DIR);
1777
+ const conceptsDir = path14.join(root, CONCEPTS_DIR);
1507
1778
  if (!existsSync3(conceptsDir)) return [];
1508
1779
  const files = await readdir2(conceptsDir);
1509
1780
  const pages = [];
1510
1781
  for (const file of files) {
1511
1782
  if (!file.endsWith(".md")) continue;
1512
- const filePath = path11.join(conceptsDir, file);
1513
- const content = await readFile7(filePath, "utf-8");
1783
+ const filePath = path14.join(conceptsDir, file);
1784
+ const content = await readFile11(filePath, "utf-8");
1514
1785
  const { meta } = parseFrontmatter(content);
1515
1786
  if (meta.title && typeof meta.title === "string" && !meta.orphaned) {
1516
1787
  pages.push({
@@ -1596,7 +1867,7 @@ async function resolveInboundLinks(titleIndex, newSlugs) {
1596
1867
  let count = 0;
1597
1868
  for (const page of titleIndex) {
1598
1869
  if (newSlugs.includes(page.slug)) continue;
1599
- const content = await readFile7(page.filePath, "utf-8");
1870
+ const content = await readFile11(page.filePath, "utf-8");
1600
1871
  const { body } = parseFrontmatter(content);
1601
1872
  const linked = addWikilinks(body, newTitles, page.title);
1602
1873
  if (linked !== body) {
@@ -1608,7 +1879,7 @@ async function resolveInboundLinks(titleIndex, newSlugs) {
1608
1879
  return count;
1609
1880
  }
1610
1881
  async function linkPage(page, titleIndex) {
1611
- const content = await readFile7(page.filePath, "utf-8");
1882
+ const content = await readFile11(page.filePath, "utf-8");
1612
1883
  const { body } = parseFrontmatter(content);
1613
1884
  const linked = addWikilinks(body, titleIndex, page.title);
1614
1885
  if (linked === body) return false;
@@ -1619,17 +1890,17 @@ async function linkPage(page, titleIndex) {
1619
1890
 
1620
1891
  // src/compiler/indexgen.ts
1621
1892
  import { readdir as readdir3 } from "fs/promises";
1622
- import path12 from "path";
1893
+ import path15 from "path";
1623
1894
  async function generateIndex(root) {
1624
1895
  status("*", info("Generating index..."));
1625
- const conceptsPath = path12.join(root, CONCEPTS_DIR);
1626
- const queriesPath = path12.join(root, QUERIES_DIR);
1896
+ const conceptsPath = path15.join(root, CONCEPTS_DIR);
1897
+ const queriesPath = path15.join(root, QUERIES_DIR);
1627
1898
  const concepts = await collectPageSummaries(conceptsPath);
1628
1899
  const queries = await collectPageSummaries(queriesPath);
1629
1900
  concepts.sort((a, b) => a.title.localeCompare(b.title));
1630
1901
  queries.sort((a, b) => a.title.localeCompare(b.title));
1631
1902
  const indexContent = buildIndexContent(concepts, queries);
1632
- const indexPath = path12.join(root, INDEX_FILE);
1903
+ const indexPath = path15.join(root, INDEX_FILE);
1633
1904
  await atomicWrite(indexPath, indexContent);
1634
1905
  const total = concepts.length + queries.length;
1635
1906
  status("+", success(`Index updated with ${total} pages.`));
@@ -1643,7 +1914,7 @@ async function scanWikiPages(dirPath) {
1643
1914
  }
1644
1915
  const scanned = [];
1645
1916
  for (const file of files.filter((f) => f.endsWith(".md"))) {
1646
- const content = await safeReadFile(path12.join(dirPath, file));
1917
+ const content = await safeReadFile(path15.join(dirPath, file));
1647
1918
  const { meta } = parseFrontmatter(content);
1648
1919
  scanned.push({ slug: file.replace(/\.md$/, ""), meta });
1649
1920
  }
@@ -1680,7 +1951,7 @@ function buildIndexContent(concepts, queries) {
1680
1951
 
1681
1952
  // src/compiler/obsidian.ts
1682
1953
  import { readdir as readdir4 } from "fs/promises";
1683
- import path13 from "path";
1954
+ import path16 from "path";
1684
1955
  var ABBREVIATION_MIN_WORDS = 3;
1685
1956
  var SWAP_CONJUNCTIONS = [" and ", " or "];
1686
1957
  function addObsidianMeta(frontmatter, conceptTitle, tags) {
@@ -1722,11 +1993,11 @@ function generateAbbreviation(title) {
1722
1993
  return abbreviation;
1723
1994
  }
1724
1995
  async function generateMOC(root) {
1725
- const conceptsPath = path13.join(root, CONCEPTS_DIR);
1996
+ const conceptsPath = path16.join(root, CONCEPTS_DIR);
1726
1997
  const pages = await loadConceptPages(conceptsPath);
1727
1998
  const tagGroups = groupPagesByTag(pages);
1728
1999
  const content = buildMOCContent(tagGroups);
1729
- await atomicWrite(path13.join(root, MOC_FILE), content);
2000
+ await atomicWrite(path16.join(root, MOC_FILE), content);
1730
2001
  }
1731
2002
  async function loadConceptPages(conceptsPath) {
1732
2003
  let files;
@@ -1738,7 +2009,7 @@ async function loadConceptPages(conceptsPath) {
1738
2009
  const pages = [];
1739
2010
  for (const file of files) {
1740
2011
  if (!file.endsWith(".md")) continue;
1741
- const content = await safeReadFile(path13.join(conceptsPath, file));
2012
+ const content = await safeReadFile(path16.join(conceptsPath, file));
1742
2013
  if (!content) continue;
1743
2014
  const { meta } = parseFrontmatter(content);
1744
2015
  if (meta.orphaned) continue;
@@ -1789,9 +2060,143 @@ function buildMOCContent(tagGroups) {
1789
2060
  }
1790
2061
 
1791
2062
  // src/utils/embeddings.ts
1792
- import { readFile as readFile8, readdir as readdir5 } from "fs/promises";
2063
+ import { readFile as readFile12, readdir as readdir5 } from "fs/promises";
1793
2064
  import { existsSync as existsSync4 } from "fs";
1794
- import path14 from "path";
2065
+ import path17 from "path";
2066
+
2067
+ // src/utils/retrieval.ts
2068
+ import { createHash as createHash2 } from "crypto";
2069
/**
 * Compute a short fingerprint for a chunk of text.
 *
 * @param {string} text - Chunk text to fingerprint.
 * @returns {string} The first 16 hex characters of the SHA-256 digest of the
 *   UTF-8 encoded text.
 */
function hashChunkText(text) {
  const hasher = createHash2("sha256");
  hasher.update(text, "utf8");
  const hexDigest = hasher.digest("hex");
  return hexDigest.slice(0, 16);
}
2072
/**
 * Split a page body into chunk strings.
 *
 * Paragraphs are extracted, oversized paragraphs are broken into smaller
 * pieces, the pieces are packed into chunks through appendParagraph, and a
 * trailing fragment may be folded into its predecessor by
 * mergeTrailingFragment.
 *
 * @param {string} body - Page body text.
 * @returns {string[]} Chunk texts in document order; empty when the body has
 *   no non-empty paragraphs.
 */
function splitIntoChunks(body) {
  const paragraphs = extractParagraphs(body);
  if (paragraphs.length === 0) return [];
  const pieces = paragraphs.flatMap((paragraph) => splitOversizedParagraph(paragraph));
  const chunks = [];
  let pending = "";
  for (const piece of pieces) {
    // appendParagraph pushes completed chunks and hands back the new buffer.
    pending = appendParagraph(pending, piece, chunks);
  }
  if (pending.length > 0) chunks.push(pending);
  return mergeTrailingFragment(chunks);
}
2085
/**
 * Add one paragraph to the chunk buffer, flushing to `chunks` on overflow.
 *
 * The paragraph is joined to the buffer with a blank line. While the joined
 * text stays within CHUNK_TARGET_CHARS it becomes the new buffer. Otherwise a
 * non-empty buffer is flushed and the paragraph starts the next buffer; with
 * an empty buffer the paragraph itself is flushed as its own chunk.
 *
 * @param {string} buffer - Text accumulated so far ("" when empty).
 * @param {string} paragraph - Paragraph to add.
 * @param {string[]} chunks - Output array; mutated when a chunk is flushed.
 * @returns {string} The buffer to carry into the next call.
 */
function appendParagraph(buffer, paragraph, chunks) {
  const candidate = buffer ? [buffer, paragraph].join("\n\n") : paragraph;
  if (candidate.length > CHUNK_TARGET_CHARS) {
    const hadBuffer = buffer.length > 0;
    chunks.push(hadBuffer ? buffer : candidate);
    return hadBuffer ? paragraph : "";
  }
  return candidate;
}
2097
/**
 * Fold a short final chunk into the chunk before it.
 *
 * The merge happens only when there are at least two chunks, the last one is
 * shorter than CHUNK_MIN_CHARS, and the combined text (including the
 * two-character blank-line separator) does not exceed CHUNK_MAX_CHARS.
 *
 * @param {string[]} chunks - Chunk texts in order.
 * @returns {string[]} The same array when nothing is merged, otherwise a new
 *   array whose last element is the merged chunk.
 */
function mergeTrailingFragment(chunks) {
  const count = chunks.length;
  if (count < 2) return chunks;
  const tail = chunks[count - 1];
  if (tail.length >= CHUNK_MIN_CHARS) return chunks;
  const beforeTail = chunks[count - 2];
  const mergedLength = beforeTail.length + tail.length + 2;
  if (mergedLength > CHUNK_MAX_CHARS) return chunks;
  return [...chunks.slice(0, count - 2), `${beforeTail}\n\n${tail}`];
}
2109
/**
 * Split a body into trimmed, non-empty paragraphs.
 *
 * Paragraphs are separated by one or more blank lines. Both LF and CRLF line
 * endings are recognised, so a file saved with Windows line endings is not
 * collapsed into a single paragraph.
 *
 * @param {string} body - Text to split.
 * @returns {string[]} Paragraphs in document order, each trimmed.
 */
function extractParagraphs(body) {
  // `\r?\n` repeated at least twice: a blank line in either LF or CRLF form.
  return body
    .split(/(?:\r?\n){2,}/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);
}
2112
/**
 * Break a paragraph longer than CHUNK_MAX_CHARS into smaller pieces.
 *
 * The paragraph is split after sentence-ending punctuation (., ! or ?)
 * followed by whitespace, and sentences are packed greedily into pieces that
 * stay within CHUNK_MAX_CHARS. Each piece is then passed through hardCut.
 *
 * @param {string} paragraph - Paragraph text.
 * @returns {string[]} The paragraph itself when it already fits, otherwise its
 *   pieces in order.
 */
function splitOversizedParagraph(paragraph) {
  if (paragraph.length <= CHUNK_MAX_CHARS) return [paragraph];
  const pieces = [];
  let current = "";
  paragraph.split(/(?<=[.!?])\s+/).forEach((sentence) => {
    const joined = `${current} ${sentence}`;
    if (current.length > 0 && joined.length > CHUNK_MAX_CHARS) {
      // Adding this sentence would overflow: close the piece, start a new one.
      pieces.push(current.trim());
      current = sentence;
      return;
    }
    current = current ? joined : sentence;
  });
  if (current.length > 0) pieces.push(current.trim());
  return pieces.flatMap(hardCut);
}
2128
/**
 * Cut text into consecutive pieces of at most CHUNK_MAX_CHARS UTF-16 units.
 *
 * A cut never lands between the two halves of a surrogate pair: when the
 * boundary would split one, the piece ends one unit earlier so that every
 * piece remains well-formed text. Concatenating the pieces yields the input.
 *
 * @param {string} text - Text to cut.
 * @returns {string[]} The text itself when it already fits, otherwise its
 *   pieces in order.
 */
function hardCut(text) {
  if (text.length <= CHUNK_MAX_CHARS) return [text];
  const pieces = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + CHUNK_MAX_CHARS, text.length);
    if (end < text.length && end - start > 1) {
      const before = text.charCodeAt(end - 1);
      const after = text.charCodeAt(end);
      const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
      // Keep the high surrogate together with its low surrogate in the next piece.
      if (splitsPair) end -= 1;
    }
    pieces.push(text.slice(start, end));
    start = end;
  }
  return pieces;
}
2136
/**
 * Re-score retrieval candidates against a query using BM25.
 *
 * When the query yields no tokens, candidates are returned in their incoming
 * order with their base score; otherwise they are scored and sorted by
 * rankByBm25Score.
 *
 * @param {string} query - Query text.
 * @param {Array<{text: string, baseScore: number}>} candidates - Candidates to rank.
 * @returns {Array<{candidate: object, score: number}>} Scored candidates.
 */
function rerankWithBm25(query, candidates) {
  if (candidates.length === 0) return [];
  const queryTerms = tokenize(query);
  if (queryTerms.length > 0) {
    const docs = candidates.map((entry) => tokenize(entry.text));
    const stats = buildCorpusStats(docs);
    return rankByBm25Score(candidates, docs, queryTerms, stats);
  }
  // Nothing lexical to match on: fall back to the base score alone.
  return candidates.map((candidate) => ({ candidate, score: candidate.baseScore }));
}
2146
/**
 * Score each candidate and sort by descending combined score.
 *
 * The combined score is the BM25 score of the candidate's tokens plus the
 * candidate's base score multiplied by BASE_SCORE_WEIGHT.
 *
 * @param {Array<{baseScore: number}>} candidates - Candidates to score.
 * @param {string[][]} docs - Token lists, index-aligned with `candidates`.
 * @param {string[]} queryTerms - Query tokens.
 * @param {{docFreq: Map<string, number>, avgDocLen: number, totalDocs: number}} stats - Corpus statistics.
 * @returns {Array<{candidate: object, score: number}>} Candidates, best first.
 */
function rankByBm25Score(candidates, docs, queryTerms, stats) {
  const byScoreDescending = (left, right) => right.score - left.score;
  const ranked = candidates.map((candidate, position) => {
    const lexicalScore = bm25Score(queryTerms, docs[position], stats);
    const weightedBase = candidate.baseScore * BASE_SCORE_WEIGHT;
    return { candidate, score: lexicalScore + weightedBase };
  });
  ranked.sort(byScoreDescending);
  return ranked;
}
2154
/**
 * Lower-case text and split it into word tokens.
 *
 * A token is a maximal run of Unicode letters, combining marks and digits, so
 * accented and non-Latin words are kept whole instead of being dropped or cut
 * at the first non-ASCII character. ASCII input tokenizes exactly as runs of
 * [a-z0-9].
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance; empty when none are found.
 */
function tokenize(text) {
  return text.toLowerCase().match(/[\p{L}\p{M}\p{N}]+/gu) ?? [];
}
2157
/**
 * Gather the corpus statistics used for BM25 scoring.
 *
 * @param {string[][]} docs - One token list per document.
 * @returns {{docFreq: Map<string, number>, avgDocLen: number, totalDocs: number}}
 *   `docFreq` maps each term to the number of documents containing it,
 *   `avgDocLen` is the mean token count (0 for an empty corpus), and
 *   `totalDocs` is the number of documents.
 */
function buildCorpusStats(docs) {
  const docFreq = /* @__PURE__ */ new Map();
  const totalDocs = docs.length;
  const totalLen = docs.reduce((sum, tokens) => sum + tokens.length, 0);
  docs.forEach((tokens) => {
    // Each document counts a term once, however often it repeats inside it.
    new Set(tokens).forEach((term) => {
      docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
    });
  });
  const avgDocLen = totalDocs > 0 ? totalLen / totalDocs : 0;
  return { docFreq, avgDocLen, totalDocs };
}
2169
// BM25 parameters as used in bm25Score: K1 scales the term-frequency part,
// B scales the document-length part.
var BM25_K1 = 1.5;
var BM25_B = 0.75;
// Multiplier applied to a candidate's base score when it is added to the
// BM25 score in rankByBm25Score.
var BASE_SCORE_WEIGHT = 0.5;
/**
 * Compute the BM25 score of one document for a set of query terms.
 *
 * @param {string[]} queryTerms - Query tokens; a repeated token contributes once per occurrence.
 * @param {string[]} docTokens - Tokens of the document being scored.
 * @param {{docFreq: Map<string, number>, avgDocLen: number, totalDocs: number}} stats - Corpus statistics.
 * @returns {number} The score; 0 when the document or the corpus is empty.
 */
function bm25Score(queryTerms, docTokens, stats) {
  if (docTokens.length === 0 || stats.totalDocs === 0) return 0;
  const termFreq = countTerms(docTokens);
  // Guard against a zero average length by dividing by 1 instead.
  const lengthRatio = docTokens.length / (stats.avgDocLen || 1);
  const lengthPenalty = BM25_K1 * (1 - BM25_B + BM25_B * lengthRatio);
  return queryTerms.reduce((total, term) => {
    const tf = termFreq.get(term) ?? 0;
    if (tf === 0) return total;
    const idf = idfWeight(stats.docFreq.get(term) ?? 0, stats.totalDocs);
    const numerator = tf * (BM25_K1 + 1);
    const denominator = tf + lengthPenalty;
    return total + idf * (numerator / denominator);
  }, 0);
}
2187
/**
 * Inverse-document-frequency weight of a term.
 *
 * @param {number} docFrequency - Number of documents containing the term.
 * @param {number} totalDocs - Number of documents in the corpus.
 * @returns {number} ln(1 + (totalDocs - docFrequency + 0.5) / (docFrequency + 0.5)).
 */
function idfWeight(docFrequency, totalDocs) {
  const ratio = (totalDocs - docFrequency + 0.5) / (docFrequency + 0.5);
  return Math.log(1 + ratio);
}
2192
/**
 * Count how many times each token occurs.
 *
 * @param {Iterable<string>} tokens - Tokens to count.
 * @returns {Map<string, number>} Occurrence count per distinct token, keyed in
 *   order of first appearance.
 */
function countTerms(tokens) {
  const counts = /* @__PURE__ */ new Map();
  for (const token of tokens) {
    const seenSoFar = counts.get(token) ?? 0;
    counts.set(token, seenSoFar + 1);
  }
  return counts;
}
2197
+
2198
+ // src/utils/embeddings.ts
2199
+ var STORE_VERSION = 2;
1795
2200
  function cosineSimilarity(a, b) {
1796
2201
  if (a.length !== b.length || a.length === 0) return 0;
1797
2202
  let dot = 0;
@@ -1813,24 +2218,27 @@ function findTopK(queryVec, store, k) {
1813
2218
  scored.sort((left, right) => right.score - left.score);
1814
2219
  return scored.slice(0, k).map((item) => item.entry);
1815
2220
  }
2221
/**
 * Rank chunks by cosine similarity to a query vector and keep the best `k`.
 *
 * @param {number[]} queryVec - Query embedding.
 * @param {Array<{vector: number[]}>} chunks - Stored chunks with their embeddings.
 * @param {number} k - Maximum number of results to return.
 * @returns {Array<{chunk: object, score: number}>} Highest-scoring chunks, best first.
 */
function findTopKChunks(queryVec, chunks, k) {
  const bySimilarityDescending = (left, right) => right.score - left.score;
  return chunks
    .map((chunk) => ({ chunk, score: cosineSimilarity(queryVec, chunk.vector) }))
    .sort(bySimilarityDescending)
    .slice(0, k);
}
1816
2229
  async function readEmbeddingStore(root) {
1817
- const filePath = path14.join(root, EMBEDDINGS_FILE);
2230
+ const filePath = path17.join(root, EMBEDDINGS_FILE);
1818
2231
  if (!existsSync4(filePath)) return null;
1819
- const raw = await readFile8(filePath, "utf-8");
2232
+ const raw = await readFile12(filePath, "utf-8");
1820
2233
  return JSON.parse(raw);
1821
2234
  }
1822
2235
  async function writeEmbeddingStore(root, store) {
1823
- const filePath = path14.join(root, EMBEDDINGS_FILE);
2236
+ const filePath = path17.join(root, EMBEDDINGS_FILE);
1824
2237
  await atomicWrite(filePath, JSON.stringify(store, null, 2));
1825
2238
  }
1826
2239
  async function findRelevantPages(root, question) {
1827
- const store = await readEmbeddingStore(root);
1828
- if (!store || store.entries.length === 0) return [];
1829
- const activeModel = resolveEmbeddingModel();
1830
- if (store.model !== activeModel) {
1831
- warnStaleEmbeddingStore(store.model, activeModel);
1832
- return [];
1833
- }
2240
+ const store = await loadActiveStore(root, (s) => s.entries.length > 0);
2241
+ if (!store) return [];
1834
2242
  const queryVec = await getProvider().embed(question);
1835
2243
  return findTopK(queryVec, store, EMBEDDING_TOP_K).map((entry) => ({
1836
2244
  slug: entry.slug,
@@ -1838,10 +2246,26 @@ async function findRelevantPages(root, question) {
1838
2246
  summary: entry.summary
1839
2247
  }));
1840
2248
  }
2249
/**
 * Find the stored chunks most similar to a question.
 *
 * @param {string} root - Project root directory.
 * @param {string} question - Question text to embed and search with.
 * @param {number} k - Maximum number of chunks to return.
 * @returns {Promise<Array<{chunk: object, score: number}>>} Scored chunks, best
 *   first; empty when loadActiveStore yields no usable store.
 */
async function findRelevantChunks(root, question, k) {
  const hasChunks = (candidate) => Boolean(candidate.chunks && candidate.chunks.length > 0);
  const store = await loadActiveStore(root, hasChunks);
  if (!store) return [];
  const provider = getProvider();
  const queryVec = await provider.embed(question);
  return findTopKChunks(queryVec, store.chunks ?? [], k);
}
2255
/**
 * Load the embedding store if it is usable with the active embedding model.
 *
 * @param {string} root - Project root directory.
 * @param {(store: object) => boolean} hasContent - Predicate deciding whether
 *   the store holds the data the caller needs.
 * @returns {Promise<object|null>} The store, or null when it is missing, fails
 *   `hasContent`, or was built with a different model (a stale-store warning
 *   is emitted in that last case).
 */
async function loadActiveStore(root, hasContent) {
  const store = await readEmbeddingStore(root);
  if (!store) return null;
  if (!hasContent(store)) return null;
  const activeModel = resolveEmbeddingModel();
  if (store.model === activeModel) return store;
  warnStaleEmbeddingStore(store.model, activeModel);
  return null;
}
1841
2265
  async function collectPageRecords(root) {
1842
2266
  const records = [];
1843
2267
  for (const dir of [CONCEPTS_DIR, QUERIES_DIR]) {
1844
- const absDir = path14.join(root, dir);
2268
+ const absDir = path17.join(root, dir);
1845
2269
  let files;
1846
2270
  try {
1847
2271
  files = await readdir5(absDir);
@@ -1849,18 +2273,23 @@ async function collectPageRecords(root) {
1849
2273
  continue;
1850
2274
  }
1851
2275
  for (const file of files.filter((f) => f.endsWith(".md"))) {
1852
- const content = await safeReadFile(path14.join(absDir, file));
1853
- const { meta } = parseFrontmatter(content);
1854
- if (meta.orphaned || typeof meta.title !== "string") continue;
1855
- records.push({
1856
- slug: file.replace(/\.md$/, ""),
1857
- title: meta.title,
1858
- summary: typeof meta.summary === "string" ? meta.summary : ""
1859
- });
2276
+ const record = await readPageRecord(absDir, file);
2277
+ if (record) records.push(record);
1860
2278
  }
1861
2279
  }
1862
2280
  return records;
1863
2281
  }
2282
/**
 * Read one wiki page and turn it into an embedding record.
 *
 * @param {string} absDir - Directory containing the page.
 * @param {string} file - Page file name (ending in ".md").
 * @returns {Promise<{slug: string, title: string, summary: string, body: string}|null>}
 *   The record, or null when the page is marked orphaned or has no string
 *   title in its frontmatter.
 */
async function readPageRecord(absDir, file) {
  const pagePath = path17.join(absDir, file);
  const { meta, body } = parseFrontmatter(await safeReadFile(pagePath));
  const isUsable = !meta.orphaned && typeof meta.title === "string";
  if (!isUsable) return null;
  const summary = typeof meta.summary === "string" ? meta.summary : "";
  return { slug: file.replace(/\.md$/, ""), title: meta.title, summary, body };
}
1864
2293
  function buildEmbeddingText(record) {
1865
2294
  return record.summary ? `${record.title}
1866
2295
 
@@ -1913,6 +2342,56 @@ function mergeEntries(existing, fresh, liveSlugs) {
1913
2342
  }
1914
2343
  return Array.from(bySlug.values());
1915
2344
  }
2345
/**
 * Rebuild the chunk-embedding list for all live pages.
 *
 * Existing chunks whose page no longer exists are ignored; the rest are
 * offered to embedRecordChunks for reuse. Pages are processed one after
 * another.
 *
 * @param {Array<{slug: string}>} records - Live page records.
 * @param {Array<{slug: string, chunkIndex: number}>} existing - Previously stored chunks.
 * @param {boolean} forceAll - When true, nothing is reused.
 * @returns {Promise<object[]>} Chunk entries for every record, in record order.
 */
async function refreshChunkEmbeddings(records, existing, forceAll) {
  const liveSlugs = new Set(records.map((record) => record.slug));
  const survivors = existing.filter((chunk) => liveSlugs.has(chunk.slug));
  const existingByKey = indexChunksByKey(survivors);
  // One timestamp for the whole refresh so new chunks share `updatedAt`.
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();
  const refreshed = [];
  for (const record of records) {
    const pageChunks = await embedRecordChunks(record, existingByKey, forceAll, timestamp);
    for (const pageChunk of pageChunks) refreshed.push(pageChunk);
  }
  return refreshed;
}
2356
/**
 * Produce chunk entries for one page, embedding only what changed.
 *
 * The page body is split into chunks. A chunk whose stored counterpart (same
 * slug and index) has the same content hash is reused with the title
 * refreshed; any other chunk is embedded through the provider.
 *
 * @param {{slug: string, title: string, body: string}} record - Page record.
 * @param {Map<string, object>} existingByKey - Stored chunks keyed by chunkKey.
 * @param {boolean} forceAll - When true, every chunk is re-embedded.
 * @param {string} now - Timestamp written to `updatedAt` of new chunks.
 * @returns {Promise<object[]>} Chunk entries in chunk order.
 */
async function embedRecordChunks(record, existingByKey, forceAll, now) {
  const provider = getProvider();
  const embedded = [];
  for (const [chunkIndex, text] of splitIntoChunks(record.body).entries()) {
    const contentHash = hashChunkText(text);
    const cached = pickReusableChunk(existingByKey, record.slug, chunkIndex, contentHash, forceAll);
    if (cached) {
      // Text is unchanged; only the page title may have moved on.
      embedded.push({ ...cached, title: record.title });
    } else {
      const vector = await provider.embed(text);
      embedded.push({
        slug: record.slug,
        title: record.title,
        chunkIndex,
        contentHash,
        text,
        vector,
        updatedAt: now
      });
    }
  }
  return embedded;
}
2381
/**
 * Index chunks by their slug/index key.
 *
 * @param {Array<{slug: string, chunkIndex: number}>} chunks - Chunks to index.
 * @returns {Map<string, object>} Chunks keyed by chunkKey; when two chunks
 *   share a key the later one wins.
 */
function indexChunksByKey(chunks) {
  const keyedPairs = chunks.map((chunk) => [chunkKey(chunk.slug, chunk.chunkIndex), chunk]);
  return new Map(keyedPairs);
}
+ }
2386
/**
 * Build the lookup key identifying one chunk of one page.
 *
 * @param {string} slug - Page slug.
 * @param {number} chunkIndex - Position of the chunk within the page.
 * @returns {string} The slug and index joined by "#".
 */
function chunkKey(slug, chunkIndex) {
  const separator = "#";
  return `${slug}${separator}${chunkIndex}`;
}
+ }
2389
/**
 * Look up a stored chunk that can be reused instead of re-embedding.
 *
 * @param {Map<string, {contentHash: string}>} byKey - Stored chunks keyed by chunkKey.
 * @param {string} slug - Page slug.
 * @param {number} chunkIndex - Position of the chunk within the page.
 * @param {string} contentHash - Hash of the current chunk text.
 * @param {boolean} forceAll - When true, reuse is disabled.
 * @returns {object|null} The stored chunk when one exists at this key with the
 *   same content hash, otherwise null.
 */
function pickReusableChunk(byKey, slug, chunkIndex, contentHash, forceAll) {
  if (forceAll) return null;
  const cached = byKey.get(chunkKey(slug, chunkIndex));
  const isUnchanged = Boolean(cached) && cached.contentHash === contentHash;
  return isUnchanged ? cached : null;
}
+ }
1916
2395
  async function updateEmbeddings(root, changedSlugs) {
1917
2396
  const records = await collectPageRecords(root);
1918
2397
  const liveSlugs = new Set(records.map((r) => r.slug));
@@ -1921,29 +2400,51 @@ async function updateEmbeddings(root, changedSlugs) {
1921
2400
  const modelChanged = Boolean(existingStore && existingStore.model !== embeddingModel);
1922
2401
  const toEmbed = new Set(changedSlugs.filter((slug) => liveSlugs.has(slug)));
1923
2402
  const previousEntries = modelChanged ? [] : existingStore?.entries ?? [];
1924
- if (!existingStore || modelChanged) {
2403
+ const previousChunks = modelChanged ? [] : existingStore?.chunks ?? [];
2404
+ const isEmptyStore = isStoreEmpty(existingStore);
2405
+ if (!existingStore || modelChanged || isEmptyStore && liveSlugs.size > 0) {
1925
2406
  for (const record of records) toEmbed.add(record.slug);
1926
2407
  }
1927
- if (!modelChanged && toEmbed.size === 0 && previousEntries.every((e) => liveSlugs.has(e.slug))) {
2408
+ if (!shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs)) {
1928
2409
  return;
1929
2410
  }
1930
2411
  const freshEntries = await embedPages(records, toEmbed);
1931
2412
  const mergedEntries = mergeEntries(previousEntries, freshEntries, liveSlugs);
1932
- const dimensions = mergedEntries[0]?.vector.length ?? 0;
2413
+ const mergedChunks = await refreshChunkEmbeddings(records, previousChunks, modelChanged);
2414
+ await persistRefreshedStore(root, embeddingModel, mergedEntries, mergedChunks);
2415
+ }
2416
/**
 * Write the refreshed embedding store to disk and report the totals.
 *
 * The vector dimension is taken from the first page entry, falling back to
 * the first chunk, and is 0 when both lists are empty.
 *
 * @param {string} root - Project root directory.
 * @param {string} embeddingModel - Name of the model that produced the vectors.
 * @param {Array<{vector: number[]}>} entries - Page-level embedding entries.
 * @param {Array<{vector: number[]}>} chunks - Chunk-level embedding entries.
 * @returns {Promise<void>}
 */
async function persistRefreshedStore(root, embeddingModel, entries, chunks) {
  const [firstEntry] = entries;
  const [firstChunk] = chunks;
  const dimensions = firstEntry?.vector.length ?? firstChunk?.vector.length ?? 0;
  const store = {
    version: STORE_VERSION,
    model: embeddingModel,
    dimensions,
    entries,
    chunks
  };
  await writeEmbeddingStore(root, store);
  const summary = `Embeddings updated (${entries.length} pages, ${chunks.length} chunks).`;
  status("*", dim(summary));
}
2431
/**
 * Tell whether an existing store holds no embeddings at all.
 *
 * @param {{entries: object[], chunks?: object[]}|null|undefined} store - Loaded store, if any.
 * @returns {boolean} True when the store exists but has neither page entries
 *   nor chunks; false when there is no store.
 */
function isStoreEmpty(store) {
  if (!store) return false;
  const hasEntries = store.entries.length !== 0;
  const hasChunks = Boolean(store.chunks) && store.chunks.length !== 0;
  return !hasEntries && !hasChunks;
}
+ }
2435
/**
 * Decide whether the embedding store needs to be refreshed.
 *
 * A refresh is needed when the model changed, when any page is queued for
 * embedding, when a stored entry or chunk refers to a page that no longer
 * exists, or when page entries exist but no chunks have been stored yet while
 * live pages are present.
 *
 * @param {boolean} modelChanged - Whether the embedding model differs from the stored one.
 * @param {Set<string>} toEmbed - Slugs queued for embedding.
 * @param {Array<{slug: string}>} previousEntries - Stored page entries.
 * @param {Array<{slug: string}>} previousChunks - Stored chunk entries.
 * @param {Set<string>} liveSlugs - Slugs of pages that currently exist.
 * @returns {boolean} True when embeddings must be recomputed or pruned.
 */
function shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs) {
  if (modelChanged || toEmbed.size > 0) return true;
  const isGone = (item) => !liveSlugs.has(item.slug);
  if (previousEntries.some(isGone) || previousChunks.some(isGone)) return true;
  // A store that has page entries but no chunks still needs its chunks built.
  return previousEntries.length > 0 && previousChunks.length === 0 && liveSlugs.size > 0;
}
1942
2443
 
1943
2444
  // src/compiler/candidates.ts
1944
2445
  import { readdir as readdir6, rename as rename3, unlink as unlink2, writeFile as writeFile4, mkdir as mkdir5 } from "fs/promises";
1945
2446
  import { existsSync as existsSync5 } from "fs";
1946
- import path15 from "path";
2447
+ import path18 from "path";
1947
2448
  import { randomBytes } from "crypto";
1948
2449
  var ID_SUFFIX_BYTES = 4;
1949
2450
  var CANDIDATE_EXT = ".json";
@@ -1952,10 +2453,10 @@ function buildCandidateId(slug) {
1952
2453
  return `${slug}-${suffix}`;
1953
2454
  }
1954
2455
  function candidatePath(root, id) {
1955
- return path15.join(root, CANDIDATES_DIR, `${id}${CANDIDATE_EXT}`);
2456
+ return path18.join(root, CANDIDATES_DIR, `${id}${CANDIDATE_EXT}`);
1956
2457
  }
1957
2458
  function archivePath(root, id) {
1958
- return path15.join(root, CANDIDATES_ARCHIVE_DIR, `${id}${CANDIDATE_EXT}`);
2459
+ return path18.join(root, CANDIDATES_ARCHIVE_DIR, `${id}${CANDIDATE_EXT}`);
1959
2460
  }
1960
2461
  async function writeCandidate(root, draft) {
1961
2462
  const candidate = {
@@ -2006,7 +2507,7 @@ function isValidCandidate(value) {
2006
2507
  return typeof candidate.id === "string" && typeof candidate.title === "string" && typeof candidate.slug === "string" && typeof candidate.body === "string" && Array.isArray(candidate.sources);
2007
2508
  }
2008
2509
  async function listCandidates(root) {
2009
- const dir = path15.join(root, CANDIDATES_DIR);
2510
+ const dir = path18.join(root, CANDIDATES_DIR);
2010
2511
  if (!existsSync5(dir)) return [];
2011
2512
  const entries = await readdir6(dir, { withFileTypes: true });
2012
2513
  const candidates = [];
@@ -2033,7 +2534,7 @@ async function archiveCandidate(root, id) {
2033
2534
  const sourcePath = candidatePath(root, id);
2034
2535
  if (!existsSync5(sourcePath)) return false;
2035
2536
  const target = archivePath(root, id);
2036
- await mkdir5(path15.dirname(target), { recursive: true });
2537
+ await mkdir5(path18.dirname(target), { recursive: true });
2037
2538
  try {
2038
2539
  await rename3(sourcePath, target);
2039
2540
  } catch {
@@ -2045,9 +2546,9 @@ async function archiveCandidate(root, id) {
2045
2546
  }
2046
2547
 
2047
2548
  // src/linter/rules.ts
2048
- import { readdir as readdir7, readFile as readFile9 } from "fs/promises";
2549
+ import { readdir as readdir7, readFile as readFile13 } from "fs/promises";
2049
2550
  import { existsSync as existsSync6 } from "fs";
2050
- import path16 from "path";
2551
+ import path19 from "path";
2051
2552
  var MIN_BODY_LENGTH = 50;
2052
2553
  var WIKILINK_PATTERN2 = /\[\[([^\]]+)\]\]/g;
2053
2554
  var CITATION_PATTERN = /\^\[([^\]]+)\]/g;
@@ -2068,22 +2569,22 @@ async function readMarkdownFiles(dirPath) {
2068
2569
  const mdFiles = entries.filter((f) => f.endsWith(".md"));
2069
2570
  const results = await Promise.all(
2070
2571
  mdFiles.map(async (fileName) => {
2071
- const filePath = path16.join(dirPath, fileName);
2072
- const content = await readFile9(filePath, "utf-8");
2572
+ const filePath = path19.join(dirPath, fileName);
2573
+ const content = await readFile13(filePath, "utf-8");
2073
2574
  return { filePath, content };
2074
2575
  })
2075
2576
  );
2076
2577
  return results;
2077
2578
  }
2078
2579
  async function collectAllPages(root) {
2079
- const conceptPages = await readMarkdownFiles(path16.join(root, CONCEPTS_DIR));
2080
- const queryPages = await readMarkdownFiles(path16.join(root, QUERIES_DIR));
2580
+ const conceptPages = await readMarkdownFiles(path19.join(root, CONCEPTS_DIR));
2581
+ const queryPages = await readMarkdownFiles(path19.join(root, QUERIES_DIR));
2081
2582
  return [...conceptPages, ...queryPages];
2082
2583
  }
2083
2584
  function buildPageSlugSet(pages) {
2084
2585
  const slugs = /* @__PURE__ */ new Set();
2085
2586
  for (const page of pages) {
2086
- const baseName = path16.basename(page.filePath, ".md");
2587
+ const baseName = path19.basename(page.filePath, ".md");
2087
2588
  slugs.add(baseName.toLowerCase());
2088
2589
  }
2089
2590
  return slugs;
@@ -2318,7 +2819,7 @@ function countLines(content) {
2318
2819
  }
2319
2820
  async function checkBrokenCitations(root) {
2320
2821
  const pages = await collectAllPages(root);
2321
- const sourcesDir = path16.join(root, SOURCES_DIR);
2822
+ const sourcesDir = path19.join(root, SOURCES_DIR);
2322
2823
  const results = [];
2323
2824
  const lineCountCache = /* @__PURE__ */ new Map();
2324
2825
  for (const page of pages) {
@@ -2333,7 +2834,7 @@ async function collectBrokenForMarker(captured, line, pageFile, sourcesDir, line
2333
2834
  const trimmed = part.trim();
2334
2835
  if (trimmed.length === 0) continue;
2335
2836
  const filename = stripSpanSuffix(trimmed);
2336
- const citedPath = path16.join(sourcesDir, filename);
2837
+ const citedPath = path19.join(sourcesDir, filename);
2337
2838
  if (!existsSync6(citedPath)) {
2338
2839
  out.push({
2339
2840
  rule: "broken-citation",
@@ -2387,7 +2888,7 @@ async function checkMalformedClaimCitations(root) {
2387
2888
 
2388
2889
  // src/compiler/page-renderer.ts
2389
2890
  import { readdir as readdir8 } from "fs/promises";
2390
- import path17 from "path";
2891
+ import path20 from "path";
2391
2892
 
2392
2893
  // src/compiler/provenance.ts
2393
2894
  function addProvenanceMeta(fields, concept) {
@@ -2417,7 +2918,7 @@ function reportContradictionWarnings(conceptTitle, concept) {
2417
2918
  // src/compiler/page-renderer.ts
2418
2919
  var RELATED_PAGE_CONTEXT_LIMIT = 5;
2419
2920
  async function renderMergedPageContent(root, entry, schema) {
2420
- const pagePath = path17.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
2921
+ const pagePath = path20.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
2421
2922
  const existingPage = await safeReadFile(pagePath);
2422
2923
  const relatedPages = await loadRelatedPages(root, entry.slug);
2423
2924
  const system = buildPagePrompt(
@@ -2456,7 +2957,7 @@ function buildMergedFrontmatter(entry, existingPage, schema) {
2456
2957
  return buildFrontmatter(frontmatterFields);
2457
2958
  }
2458
2959
  async function loadRelatedPages(root, excludeSlug) {
2459
- const conceptsPath = path17.join(root, CONCEPTS_DIR);
2960
+ const conceptsPath = path20.join(root, CONCEPTS_DIR);
2460
2961
  let files;
2461
2962
  try {
2462
2963
  files = await readdir8(conceptsPath);
@@ -2466,7 +2967,7 @@ async function loadRelatedPages(root, excludeSlug) {
2466
2967
  const related = files.filter((f) => f.endsWith(".md") && f !== `${excludeSlug}.md`).slice(0, RELATED_PAGE_CONTEXT_LIMIT);
2467
2968
  const contents = [];
2468
2969
  for (const f of related) {
2469
- const content = await safeReadFile(path17.join(conceptsPath, f));
2970
+ const content = await safeReadFile(path20.join(conceptsPath, f));
2470
2971
  if (!content) continue;
2471
2972
  const { meta } = parseFrontmatter(content);
2472
2973
  if (meta.orphaned) continue;
@@ -2667,9 +3168,9 @@ function printChangesSummary(changes) {
2667
3168
  }
2668
3169
  async function extractForSource(root, sourceFile) {
2669
3170
  status("*", info(`Extracting: ${sourceFile}`));
2670
- const sourcePath = path18.join(root, SOURCES_DIR, sourceFile);
2671
- const sourceContent = await readFile10(sourcePath, "utf-8");
2672
- const existingIndex = await safeReadFile(path18.join(root, INDEX_FILE));
3171
+ const sourcePath = path21.join(root, SOURCES_DIR, sourceFile);
3172
+ const sourceContent = await readFile14(sourcePath, "utf-8");
3173
+ const existingIndex = await safeReadFile(path21.join(root, INDEX_FILE));
2673
3174
  const concepts = await extractConcepts(sourceContent, existingIndex);
2674
3175
  if (concepts.length > 0) {
2675
3176
  const names = concepts.map((c) => c.concept).join(", ");
@@ -2732,7 +3233,7 @@ async function generateMergedPage(root, entry, schema, options, sourceStates) {
2732
3233
  if (options.review) {
2733
3234
  return await persistReviewCandidate(root, entry, fullPage, sourceStates, schema);
2734
3235
  }
2735
- const pagePath = path18.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
3236
+ const pagePath = path21.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
2736
3237
  const error2 = await writePageIfValid(pagePath, fullPage, entry.concept.concept);
2737
3238
  return { error: error2 ?? void 0 };
2738
3239
  }
@@ -2760,7 +3261,7 @@ async function generateSeedPages(root, schema, generation) {
2760
3261
  }
2761
3262
  async function generateSingleSeedPage(root, schema, seed) {
2762
3263
  const slug = slugify(seed.title);
2763
- const pagePath = path18.join(root, CONCEPTS_DIR, `${slug}.md`);
3264
+ const pagePath = path21.join(root, CONCEPTS_DIR, `${slug}.md`);
2764
3265
  const relatedContent = await loadSeedRelatedPages(root, seed.relatedSlugs ?? []);
2765
3266
  const rule = schema.kinds[seed.kind];
2766
3267
  const system = buildSeedPagePrompt(seed, rule, relatedContent);
@@ -2792,7 +3293,7 @@ async function loadSeedRelatedPages(root, slugs) {
2792
3293
  if (slugs.length === 0) return "";
2793
3294
  const contents = [];
2794
3295
  for (const slug of slugs) {
2795
- const pagePath = path18.join(root, CONCEPTS_DIR, `${slug}.md`);
3296
+ const pagePath = path21.join(root, CONCEPTS_DIR, `${slug}.md`);
2796
3297
  const content = await safeReadFile(pagePath);
2797
3298
  if (content) contents.push(content);
2798
3299
  }
@@ -2847,7 +3348,7 @@ async function compileCommand(options = {}) {
2847
3348
 
2848
3349
  // src/commands/query.ts
2849
3350
  import { existsSync as existsSync8 } from "fs";
2850
- import path19 from "path";
3351
+ import path22 from "path";
2851
3352
  var PAGE_DIRS = [CONCEPTS_DIR, QUERIES_DIR];
2852
3353
  var PAGE_SELECTION_TOOL = {
2853
3354
  name: "select_pages",
@@ -2895,16 +3396,92 @@ ${indexContent}`;
2895
3396
  function buildFilteredIndex(candidates) {
2896
3397
  return candidates.map((entry) => `- **${entry.slug}**: ${entry.title} \u2014 ${entry.summary}`).join("\n");
2897
3398
  }
2898
- async function selectRelevantPages(root, question) {
3399
+ async function selectRelevantPages(root, question, debug) {
3400
+ const chunkSelection = await trySelectViaChunks(root, question, debug);
3401
+ if (chunkSelection) return chunkSelection;
2899
3402
  const candidates = await tryFindRelevantPages(root, question);
2900
3403
  if (candidates.length > 0) {
2901
3404
  const filteredIndex = buildFilteredIndex(candidates);
2902
3405
  const { pages: rawPages2, reasoning: reasoning2 } = await selectPages(question, filteredIndex);
2903
- return { pages: rawPages2, rawPages: rawPages2, reasoning: reasoning2 };
3406
+ return { pages: rawPages2, rawPages: rawPages2, reasoning: reasoning2, chunks: [] };
2904
3407
  }
2905
- const indexContent = await safeReadFile(path19.join(root, INDEX_FILE));
3408
+ const indexContent = await safeReadFile(path22.join(root, INDEX_FILE));
2906
3409
  const { pages: rawPages, reasoning } = await selectPages(question, indexContent);
2907
- return { pages: rawPages.map((p) => slugify(p)), rawPages, reasoning };
3410
+ return { pages: rawPages.map((p) => slugify(p)), rawPages, reasoning, chunks: [] };
3411
+ }
3412
+ async function trySelectViaChunks(root, question, debug) {
3413
+ const ranked = await tryFindRelevantChunks(root, question);
3414
+ if (ranked.length === 0) return null;
3415
+ const reranked = rerankWithBm25(
3416
+ question,
3417
+ ranked.map(({ chunk, score }) => ({ text: chunk.text, baseScore: score, chunk }))
3418
+ );
3419
+ const kept = reranked.slice(0, CHUNK_RERANK_KEEP);
3420
+ const reorderingHappened = wasReordered(ranked, kept.map((k) => k.candidate.chunk));
3421
+ const chunkCitations = toChunkCitations(kept);
3422
+ const pageSlugs = collapseToPages(chunkCitations, QUERY_PAGE_LIMIT);
3423
+ const reasoning = buildChunkReasoning(chunkCitations, pageSlugs);
3424
+ return {
3425
+ pages: pageSlugs,
3426
+ rawPages: pageSlugs,
3427
+ reasoning,
3428
+ chunks: chunkCitations,
3429
+ debug: debug ? buildDebug(chunkCitations, pageSlugs, reorderingHappened) : void 0
3430
+ };
3431
+ }
3432
+ function wasReordered(before, after) {
3433
+ const limit = Math.min(before.length, after.length);
3434
+ for (let i = 0; i < limit; i++) {
3435
+ if (before[i].chunk !== after[i]) return true;
3436
+ }
3437
+ return false;
3438
+ }
3439
+ function toChunkCitations(ranked) {
3440
+ return ranked.map(({ candidate, score }) => ({
3441
+ slug: candidate.chunk.slug,
3442
+ title: candidate.chunk.title,
3443
+ chunkIndex: candidate.chunk.chunkIndex,
3444
+ score,
3445
+ text: candidate.chunk.text
3446
+ }));
3447
+ }
3448
+ function collapseToPages(chunks, limit) {
3449
+ const slugs = [];
3450
+ const seen = /* @__PURE__ */ new Set();
3451
+ for (const chunk of chunks) {
3452
+ if (seen.has(chunk.slug)) continue;
3453
+ seen.add(chunk.slug);
3454
+ slugs.push(chunk.slug);
3455
+ if (slugs.length >= limit) break;
3456
+ }
3457
+ return slugs;
3458
+ }
3459
+ function buildChunkReasoning(chunks, pages) {
3460
+ const top = chunks.slice(0, pages.length);
3461
+ const summary = top.map((c) => `${c.slug}#${c.chunkIndex} (${c.score.toFixed(3)})`).join(", ");
3462
+ return `Selected ${pages.length} page(s) from ${chunks.length} reranked chunks: ${summary}`;
3463
+ }
3464
+ function buildDebug(chunks, pageSlugs, reranked) {
3465
+ const bestPerPage = /* @__PURE__ */ new Map();
3466
+ for (const c of chunks) {
3467
+ const prev = bestPerPage.get(c.slug);
3468
+ if (prev === void 0 || c.score > prev) bestPerPage.set(c.slug, c.score);
3469
+ }
3470
+ return {
3471
+ pages: pageSlugs.map((slug) => ({ slug, score: bestPerPage.get(slug) ?? 0 })),
3472
+ chunks,
3473
+ usedChunks: true,
3474
+ reranked
3475
+ };
3476
+ }
3477
+ async function tryFindRelevantChunks(root, question) {
3478
+ try {
3479
+ return await findRelevantChunks(root, question, CHUNK_TOP_K);
3480
+ } catch (err) {
3481
+ const message = err instanceof Error ? err.message : String(err);
3482
+ status("!", dim(`Chunk pre-filter unavailable (${message}); falling back.`));
3483
+ return [];
3484
+ }
2908
3485
  }
2909
3486
  async function tryFindRelevantPages(root, question) {
2910
3487
  try {
@@ -2920,7 +3497,7 @@ async function loadSelectedPages(root, slugs) {
2920
3497
  for (const slug of slugs) {
2921
3498
  let content = "";
2922
3499
  for (const dir of PAGE_DIRS) {
2923
- const candidate = await safeReadFile(path19.join(root, dir, `${slug}.md`));
3500
+ const candidate = await safeReadFile(path22.join(root, dir, `${slug}.md`));
2924
3501
  if (!candidate) continue;
2925
3502
  const { meta } = parseFrontmatter(candidate);
2926
3503
  if (meta.orphaned) continue;
@@ -2937,11 +3514,12 @@ ${content}`);
2937
3514
  return sections.join("\n\n");
2938
3515
  }
2939
3516
  var ANSWER_SYSTEM_PROMPT = "You are a knowledge assistant. Answer the question using ONLY the wiki content provided. Cite specific pages using [[Page Title]] wikilinks. If the wiki doesn't contain enough information, say so.";
2940
- async function callAnswerLLM(question, pagesContent, onToken) {
3517
+ async function callAnswerLLM(question, pagesContent, chunks, onToken) {
3518
+ const provenance = chunks.length > 0 ? buildChunkProvenance(chunks) : "";
2941
3519
  const userMessage = `Question: ${question}
2942
3520
 
2943
3521
  Relevant wiki pages:
2944
- ${pagesContent}`;
3522
+ ${pagesContent}${provenance}`;
2945
3523
  return callClaude({
2946
3524
  system: ANSWER_SYSTEM_PROMPT,
2947
3525
  messages: [{ role: "user", content: userMessage }],
@@ -2949,6 +3527,16 @@ ${pagesContent}`;
2949
3527
  onToken
2950
3528
  });
2951
3529
  }
3530
+ function buildChunkProvenance(chunks) {
3531
+ const sections = chunks.map(
3532
+ (chunk) => `--- ${chunk.slug} (chunk ${chunk.chunkIndex}) ---
3533
+ ${chunk.text}`
3534
+ );
3535
+ return `
3536
+
3537
+ Most relevant excerpts (from chunk-level retrieval):
3538
+ ${sections.join("\n\n")}`;
3539
+ }
2952
3540
  function summarizeAnswer(answer) {
2953
3541
  const firstLine = answer.trim().split(/\n/)[0] ?? "";
2954
3542
  const firstSentence = firstLine.split(/(?<=[.!?])\s/)[0] ?? firstLine;
@@ -2956,7 +3544,7 @@ function summarizeAnswer(answer) {
2956
3544
  }
2957
3545
  async function saveQueryPage(root, question, answer) {
2958
3546
  const slug = slugify(question);
2959
- const filePath = path19.join(root, QUERIES_DIR, `${slug}.md`);
3547
+ const filePath = path22.join(root, QUERIES_DIR, `${slug}.md`);
2960
3548
  const frontmatter = buildFrontmatter({
2961
3549
  title: question,
2962
3550
  summary: summarizeAnswer(answer),
@@ -2982,30 +3570,42 @@ ${answer}
2982
3570
  return slug;
2983
3571
  }
2984
3572
  async function generateAnswer(root, question, options = {}) {
2985
- if (!existsSync8(path19.join(root, INDEX_FILE))) {
3573
+ if (!existsSync8(path22.join(root, INDEX_FILE))) {
2986
3574
  throw new Error("Wiki index not found. Run `llmwiki compile` first.");
2987
3575
  }
2988
- const { pages, reasoning } = await selectRelevantPages(root, question);
2989
- options.onPageSelection?.(pages, reasoning);
2990
- const pagesContent = await loadSelectedPages(root, pages);
3576
+ const selection = await selectRelevantPages(root, question, Boolean(options.debug));
3577
+ options.onPageSelection?.(selection.pages, selection.reasoning);
3578
+ const pagesContent = await loadSelectedPages(root, selection.pages);
2991
3579
  if (!pagesContent) {
2992
- return { answer: "", selectedPages: pages, reasoning };
2993
- }
2994
- const answer = await callAnswerLLM(question, pagesContent, options.onToken);
2995
- let saved;
2996
- if (options.save) {
2997
- saved = await saveQueryPage(root, question, answer);
3580
+ return buildEmptyResult(selection);
2998
3581
  }
2999
- return { answer, selectedPages: pages, reasoning, saved };
3582
+ const answer = await callAnswerLLM(question, pagesContent, selection.chunks, options.onToken);
3583
+ const saved = options.save ? await saveQueryPage(root, question, answer) : void 0;
3584
+ return {
3585
+ answer,
3586
+ selectedPages: selection.pages,
3587
+ reasoning: selection.reasoning,
3588
+ saved,
3589
+ debug: selection.debug
3590
+ };
3591
+ }
3592
+ function buildEmptyResult(selection) {
3593
+ return {
3594
+ answer: "",
3595
+ selectedPages: selection.pages,
3596
+ reasoning: selection.reasoning,
3597
+ debug: selection.debug
3598
+ };
3000
3599
  }
3001
3600
  async function queryCommand(root, question, options) {
3002
- if (!existsSync8(path19.join(root, INDEX_FILE))) {
3601
+ if (!existsSync8(path22.join(root, INDEX_FILE))) {
3003
3602
  status("!", error("Wiki index not found. Run `llmwiki compile` first."));
3004
3603
  return;
3005
3604
  }
3006
3605
  header("Selecting relevant pages");
3007
3606
  const result = await generateAnswer(root, question, {
3008
3607
  save: options.save,
3608
+ debug: options.debug,
3009
3609
  onToken: (text) => process.stdout.write(text),
3010
3610
  onPageSelection: (pages, reasoning) => {
3011
3611
  status("i", dim(`Reasoning: ${reasoning}`));
@@ -3014,6 +3614,7 @@ async function queryCommand(root, question, options) {
3014
3614
  }
3015
3615
  });
3016
3616
  process.stdout.write("\n");
3617
+ if (result.debug) printDebugSnapshot(result.debug);
3017
3618
  if (!result.answer) {
3018
3619
  status("!", error("No matching pages found. Try refining your question."));
3019
3620
  return;
@@ -3024,14 +3625,34 @@ async function queryCommand(root, question, options) {
3024
3625
  status("\u2192", dim("Tip: use --save to add this answer to your wiki"));
3025
3626
  }
3026
3627
  }
3628
+ function printDebugSnapshot(debug) {
3629
+ header("Retrieval debug");
3630
+ status(
3631
+ "i",
3632
+ dim(
3633
+ `Source: ${debug.usedChunks ? "chunk-level" : "page-level"}; reranked: ${debug.reranked ? "yes" : "no"}`
3634
+ )
3635
+ );
3636
+ for (const page of debug.pages) {
3637
+ status("\u2022", `${page.slug} (best chunk score ${page.score.toFixed(3)})`);
3638
+ }
3639
+ for (const chunk of debug.chunks) {
3640
+ const preview = chunk.text.slice(0, DEBUG_CHUNK_PREVIEW_CHARS).replace(/\s+/g, " ").trim();
3641
+ status(
3642
+ "\xB7",
3643
+ dim(`${chunk.slug}#${chunk.chunkIndex} score=${chunk.score.toFixed(3)} :: ${preview}\u2026`)
3644
+ );
3645
+ }
3646
+ }
3647
+ var DEBUG_CHUNK_PREVIEW_CHARS = 120;
3027
3648
 
3028
3649
  // src/commands/watch.ts
3029
3650
  import { watch as chokidarWatch } from "chokidar";
3030
3651
  import { existsSync as existsSync9 } from "fs";
3031
- import path20 from "path";
3652
+ import path23 from "path";
3032
3653
  var DEBOUNCE_MS = 500;
3033
3654
  async function watchCommand() {
3034
- const sourcesPath = path20.resolve(SOURCES_DIR);
3655
+ const sourcesPath = path23.resolve(SOURCES_DIR);
3035
3656
  if (!existsSync9(sourcesPath)) {
3036
3657
  status(
3037
3658
  "!",
@@ -3066,7 +3687,7 @@ async function watchCommand() {
3066
3687
  const scheduleCompile = (eventPath, event) => {
3067
3688
  status(
3068
3689
  "~",
3069
- dim(`${event}: ${path20.basename(eventPath)}`)
3690
+ dim(`${event}: ${path23.basename(eventPath)}`)
3070
3691
  );
3071
3692
  if (debounceTimer) clearTimeout(debounceTimer);
3072
3693
  debounceTimer = setTimeout(triggerCompile, DEBOUNCE_MS);
@@ -3153,7 +3774,7 @@ async function lintCommand() {
3153
3774
  // src/commands/schema.ts
3154
3775
  import { existsSync as existsSync10 } from "fs";
3155
3776
  import { mkdir as mkdir6, writeFile as writeFile5 } from "fs/promises";
3156
- import path21 from "path";
3777
+ import path24 from "path";
3157
3778
  async function schemaInitCommand() {
3158
3779
  const root = process.cwd();
3159
3780
  const defaults = buildDefaultSchema();
@@ -3162,7 +3783,7 @@ async function schemaInitCommand() {
3162
3783
  status("!", warn(`Schema file already exists at ${targetPath}`));
3163
3784
  return;
3164
3785
  }
3165
- await mkdir6(path21.dirname(targetPath), { recursive: true });
3786
+ await mkdir6(path24.dirname(targetPath), { recursive: true });
3166
3787
  const serializable = {
3167
3788
  version: defaults.version,
3168
3789
  defaultKind: defaults.defaultKind,
@@ -3221,7 +3842,7 @@ async function reviewShowCommand(id) {
3221
3842
  }
3222
3843
 
3223
3844
  // src/commands/review-approve.ts
3224
- import path22 from "path";
3845
+ import path25 from "path";
3225
3846
 
3226
3847
  // src/commands/review-helpers.ts
3227
3848
  async function runReviewUnderLock(id, underLock) {
@@ -3253,7 +3874,7 @@ async function approveUnderLock(root, id) {
3253
3874
  process.exitCode = 1;
3254
3875
  return;
3255
3876
  }
3256
- const pagePath = path22.join(root, CONCEPTS_DIR, `${candidate.slug}.md`);
3877
+ const pagePath = path25.join(root, CONCEPTS_DIR, `${candidate.slug}.md`);
3257
3878
  await atomicWrite(pagePath, candidate.body);
3258
3879
  status("+", success(`Approved \u2192 ${source(pagePath)}`));
3259
3880
  await persistCandidateSourceStates(root, candidate);
@@ -3313,7 +3934,7 @@ import { McpServer as McpServer2 } from "@modelcontextprotocol/sdk/server/mcp.js
3313
3934
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
3314
3935
 
3315
3936
  // src/mcp/tools.ts
3316
- import path23 from "path";
3937
+ import path26 from "path";
3317
3938
  import { z } from "zod";
3318
3939
 
3319
3940
  // src/mcp/provider-check.ts
@@ -3406,15 +4027,16 @@ function registerQueryTool(server, root) {
3406
4027
  "query_wiki",
3407
4028
  {
3408
4029
  title: "Query Wiki",
3409
- description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Requires an LLM provider.",
4030
+ description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Set debug=true to include the selected chunks and their scores. Requires an LLM provider.",
3410
4031
  inputSchema: {
3411
4032
  question: z.string().describe("The natural-language question to answer."),
3412
- save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true.")
4033
+ save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true."),
4034
+ debug: z.boolean().optional().describe("Include retrieval debug info (selected chunks/pages + scores).")
3413
4035
  }
3414
4036
  },
3415
- async ({ question, save }) => {
4037
+ async ({ question, save, debug }) => {
3416
4038
  ensureProviderAvailable();
3417
- const result = await generateAnswer(root, question, { save });
4039
+ const result = await generateAnswer(root, question, { save, debug });
3418
4040
  return jsonResult(result);
3419
4041
  }
3420
4042
  );
@@ -3438,15 +4060,30 @@ function registerSearchTool(server, root) {
3438
4060
  );
3439
4061
  }
3440
4062
  async function pickSearchSlugs(root, question) {
4063
+ try {
4064
+ const chunks = await findRelevantChunks(root, question, CHUNK_TOP_K);
4065
+ if (chunks.length > 0) return dedupePreservingOrder(chunks.map((c) => c.chunk.slug));
4066
+ } catch {
4067
+ }
3441
4068
  try {
3442
4069
  const candidates = await findRelevantPages(root, question);
3443
4070
  if (candidates.length > 0) return candidates.map((c) => c.slug);
3444
4071
  } catch {
3445
4072
  }
3446
- const indexContent = await safeReadFile(path23.join(root, INDEX_FILE));
4073
+ const indexContent = await safeReadFile(path26.join(root, INDEX_FILE));
3447
4074
  const { pages } = await selectPages(question, indexContent);
3448
4075
  return pages;
3449
4076
  }
4077
+ function dedupePreservingOrder(slugs) {
4078
+ const seen = /* @__PURE__ */ new Set();
4079
+ const out = [];
4080
+ for (const slug of slugs) {
4081
+ if (seen.has(slug)) continue;
4082
+ seen.add(slug);
4083
+ out.push(slug);
4084
+ }
4085
+ return out;
4086
+ }
3450
4087
  function registerReadTool(server, root) {
3451
4088
  server.registerTool(
3452
4089
  "read_page",
@@ -3492,8 +4129,8 @@ function registerStatusTool(server, root) {
3492
4129
  );
3493
4130
  }
3494
4131
  async function collectStatus(root) {
3495
- const concepts = await collectPageSummaries(path23.join(root, CONCEPTS_DIR));
3496
- const queries = await collectPageSummaries(path23.join(root, QUERIES_DIR));
4132
+ const concepts = await collectPageSummaries(path26.join(root, CONCEPTS_DIR));
4133
+ const queries = await collectPageSummaries(path26.join(root, QUERIES_DIR));
3497
4134
  const state = await readState(root);
3498
4135
  const changes = await detectChanges(root, state);
3499
4136
  const orphans = await findOrphanedSlugs(root);
@@ -3510,7 +4147,7 @@ async function collectStatus(root) {
3510
4147
  };
3511
4148
  }
3512
4149
  async function findOrphanedSlugs(root) {
3513
- const scanned = await scanWikiPages(path23.join(root, CONCEPTS_DIR));
4150
+ const scanned = await scanWikiPages(path26.join(root, CONCEPTS_DIR));
3514
4151
  return scanned.filter(({ meta }) => meta.orphaned).map(({ slug }) => slug);
3515
4152
  }
3516
4153
  async function loadPageRecords(root, slugs) {
@@ -3523,7 +4160,7 @@ async function loadPageRecords(root, slugs) {
3523
4160
  }
3524
4161
  async function readPage(root, slug) {
3525
4162
  for (const dir of PAGE_DIRS2) {
3526
- const content = await safeReadFile(path23.join(root, dir, `${slug}.md`));
4163
+ const content = await safeReadFile(path26.join(root, dir, `${slug}.md`));
3527
4164
  if (!content) continue;
3528
4165
  const { meta, body } = parseFrontmatter(content);
3529
4166
  if (meta.orphaned) continue;
@@ -3538,7 +4175,7 @@ async function readPage(root, slug) {
3538
4175
  }
3539
4176
 
3540
4177
  // src/mcp/resources.ts
3541
- import path24 from "path";
4178
+ import path27 from "path";
3542
4179
  import { readdir as readdir9 } from "fs/promises";
3543
4180
  import { ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
3544
4181
  function jsonContent(uri, payload) {
@@ -3572,7 +4209,7 @@ function registerIndexResource(server, root) {
3572
4209
  mimeType: "text/markdown"
3573
4210
  },
3574
4211
  async (uri) => {
3575
- const content = await safeReadFile(path24.join(root, INDEX_FILE));
4212
+ const content = await safeReadFile(path27.join(root, INDEX_FILE));
3576
4213
  return { contents: [markdownContent(uri, content)] };
3577
4214
  }
3578
4215
  );
@@ -3639,7 +4276,7 @@ function registerQueryResource(server, root) {
3639
4276
  );
3640
4277
  }
3641
4278
  async function listSources(root) {
3642
- const sourcesPath = path24.join(root, SOURCES_DIR);
4279
+ const sourcesPath = path27.join(root, SOURCES_DIR);
3643
4280
  let files;
3644
4281
  try {
3645
4282
  files = await readdir9(sourcesPath);
@@ -3648,14 +4285,14 @@ async function listSources(root) {
3648
4285
  }
3649
4286
  const records = [];
3650
4287
  for (const file of files.filter((f) => f.endsWith(".md"))) {
3651
- const content = await safeReadFile(path24.join(sourcesPath, file));
4288
+ const content = await safeReadFile(path27.join(sourcesPath, file));
3652
4289
  const { meta } = parseFrontmatter(content);
3653
4290
  records.push({ filename: file, ...meta });
3654
4291
  }
3655
4292
  return records;
3656
4293
  }
3657
4294
  async function loadPageWithMeta(root, dir, slug) {
3658
- const filePath = path24.join(root, dir, `${slug}.md`);
4295
+ const filePath = path27.join(root, dir, `${slug}.md`);
3659
4296
  const content = await safeReadFile(filePath);
3660
4297
  if (!content) {
3661
4298
  throw new Error(`Page not found: ${dir}/${slug}.md`);
@@ -3664,7 +4301,7 @@ async function loadPageWithMeta(root, dir, slug) {
3664
4301
  return { slug, meta, body: body.trim() };
3665
4302
  }
3666
4303
  async function listPagesUnder(root, dir, scheme) {
3667
- const pagesPath = path24.join(root, dir);
4304
+ const pagesPath = path27.join(root, dir);
3668
4305
  let files;
3669
4306
  try {
3670
4307
  files = await readdir9(pagesPath);
@@ -3748,7 +4385,7 @@ reviewCommand.command("reject <id>").description("Reject a candidate and archive
3748
4385
  process.exit(1);
3749
4386
  }
3750
4387
  });
3751
- program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").action(async (question, options) => {
4388
+ program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").option("--debug", "Print which pages and chunks were selected and their scores").action(async (question, options) => {
3752
4389
  try {
3753
4390
  requireProvider();
3754
4391
  await queryCommand(process.cwd(), question, options);