llm-wiki-compiler 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -6,8 +6,8 @@ import { createRequire } from "module";
6
6
  import { Command } from "commander";
7
7
 
8
8
  // src/commands/ingest.ts
9
- import path3 from "path";
10
- import { mkdir as mkdir2, writeFile as writeFile2 } from "fs/promises";
9
+ import path7 from "path";
10
+ import { mkdir as mkdir2, readFile as readFile6, writeFile as writeFile2 } from "fs/promises";
11
11
 
12
12
  // src/utils/markdown.ts
13
13
  import { writeFile, rename, readFile, mkdir } from "fs/promises";
@@ -150,9 +150,17 @@ var LOCK_FILE = ".llmwiki/lock";
150
150
  var INDEX_FILE = "wiki/index.md";
151
151
  var MOC_FILE = "wiki/MOC.md";
152
152
  var EMBEDDINGS_FILE = ".llmwiki/embeddings.json";
153
+ var IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".jpg", ".jpeg", ".png", ".gif", ".webp"]);
154
+ var TRANSCRIPT_EXTENSIONS = /* @__PURE__ */ new Set([".vtt", ".srt"]);
155
+ var IMAGE_DESCRIBE_MAX_TOKENS = 2048;
153
156
  var CANDIDATES_DIR = ".llmwiki/candidates";
154
157
  var CANDIDATES_ARCHIVE_DIR = ".llmwiki/candidates/archive";
155
158
  var EMBEDDING_TOP_K = 15;
159
+ var CHUNK_TOP_K = 30;
160
+ var CHUNK_RERANK_KEEP = 12;
161
+ var CHUNK_TARGET_CHARS = 800;
162
+ var CHUNK_MAX_CHARS = 1400;
163
+ var CHUNK_MIN_CHARS = 200;
156
164
  var LOW_CONFIDENCE_THRESHOLD = 0.5;
157
165
  var MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS = 2;
158
166
  var EMBEDDING_MODELS = {
@@ -237,19 +245,24 @@ async function ingestWeb(url) {
237
245
 
238
246
  // src/ingest/file.ts
239
247
  import { readFile as readFile2 } from "fs/promises";
248
+ import path3 from "path";
249
+
250
+ // src/ingest/shared.ts
240
251
  import path2 from "path";
241
- var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
242
252
  function titleFromFilename(filePath) {
243
253
  const basename = path2.basename(filePath, path2.extname(filePath));
244
254
  return basename.replace(/[-_]+/g, " ").trim();
245
255
  }
256
+
257
+ // src/ingest/file.ts
258
+ var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
246
259
  function wrapPlainText(text) {
247
260
  return `\`\`\`
248
261
  ${text}
249
262
  \`\`\``;
250
263
  }
251
264
  async function ingestFile(filePath) {
252
- const ext = path2.extname(filePath).toLowerCase();
265
+ const ext = path3.extname(filePath).toLowerCase();
253
266
  if (!SUPPORTED_EXTENSIONS.has(ext)) {
254
267
  throw new Error(
255
268
  `Unsupported file type "${ext}". Only .md and .txt files are supported.`
@@ -261,10 +274,439 @@ async function ingestFile(filePath) {
261
274
  return { title, content };
262
275
  }
263
276
 
277
+ // src/ingest/pdf.ts
278
+ import { readFile as readFile3 } from "fs/promises";
279
+ function resolveTitle(filePath, info2) {
280
+ if (info2 && typeof info2 === "object") {
281
+ const titleField = info2["Title"];
282
+ if (typeof titleField === "string" && titleField.trim().length > 0) {
283
+ return titleField.trim();
284
+ }
285
+ }
286
+ return titleFromFilename(filePath);
287
+ }
288
+ async function ingestPdf(filePath) {
289
+ const { PDFParse } = await import("pdf-parse");
290
+ const buffer = await readFile3(filePath);
291
+ const parser = new PDFParse({ data: new Uint8Array(buffer) });
292
+ try {
293
+ const textResult = await parser.getText();
294
+ const infoResult = await parser.getInfo();
295
+ const title = resolveTitle(filePath, infoResult.info);
296
+ const content = textResult.text.trim();
297
+ return { title, content };
298
+ } finally {
299
+ await parser.destroy();
300
+ }
301
+ }
302
+
303
+ // src/ingest/image.ts
304
+ import { readFile as readFile4 } from "fs/promises";
305
+ import path5 from "path";
306
+ import Anthropic2 from "@anthropic-ai/sdk";
307
+
308
+ // src/providers/anthropic.ts
309
+ import Anthropic from "@anthropic-ai/sdk";
310
+ var VOYAGE_EMBEDDINGS_URL = "https://api.voyageai.com/v1/embeddings";
311
+ function buildAnthropicClientOptions(options = {}) {
312
+ const trimmedBaseURL = options.baseURL?.trim();
313
+ const trimmedApiKey = options.apiKey?.trim();
314
+ const trimmedAuthToken = options.authToken?.trim();
315
+ const result = {};
316
+ if (trimmedApiKey) {
317
+ result.apiKey = trimmedApiKey;
318
+ }
319
+ if (trimmedAuthToken) {
320
+ result.authToken = trimmedAuthToken;
321
+ }
322
+ if (!trimmedBaseURL) {
323
+ return result;
324
+ }
325
+ const normalizedBaseURL = trimmedBaseURL.endsWith("/") && trimmedBaseURL.length > 1 ? trimmedBaseURL.slice(0, -1) : trimmedBaseURL;
326
+ result.baseURL = normalizedBaseURL;
327
+ return result;
328
+ }
329
+ var AnthropicProvider = class {
330
+ client;
331
+ model;
332
+ constructor(model, options = {}) {
333
+ this.model = model;
334
+ this.client = new Anthropic(buildAnthropicClientOptions(options));
335
+ }
336
+ /** Send a single non-streaming completion request. */
337
+ async complete(system, messages, maxTokens) {
338
+ const response = await this.client.messages.create({
339
+ model: this.model,
340
+ max_tokens: maxTokens,
341
+ system,
342
+ messages
343
+ });
344
+ const textBlock = response.content.find((block) => block.type === "text");
345
+ return textBlock?.type === "text" ? textBlock.text : "";
346
+ }
347
+ /** Stream a completion, invoking onToken for each text chunk. */
348
+ async stream(system, messages, maxTokens, onToken) {
349
+ const stream = this.client.messages.stream({
350
+ model: this.model,
351
+ max_tokens: maxTokens,
352
+ system,
353
+ messages
354
+ });
355
+ let fullText = "";
356
+ for await (const event of stream) {
357
+ if (event.type === "content_block_delta" && event.delta.type === "text_delta") {
358
+ fullText += event.delta.text;
359
+ onToken?.(event.delta.text);
360
+ }
361
+ }
362
+ return fullText;
363
+ }
364
+ /** Call Claude with tool definitions and return the parsed tool input as JSON. */
365
+ async toolCall(system, messages, tools, maxTokens) {
366
+ const anthropicTools = tools.map((t) => ({
367
+ name: t.name,
368
+ description: t.description,
369
+ input_schema: t.input_schema
370
+ }));
371
+ const response = await this.client.messages.create({
372
+ model: this.model,
373
+ max_tokens: maxTokens,
374
+ system,
375
+ messages,
376
+ tools: anthropicTools
377
+ });
378
+ const toolBlock = response.content.find((block) => block.type === "tool_use");
379
+ if (toolBlock?.type === "tool_use") {
380
+ return JSON.stringify(toolBlock.input);
381
+ }
382
+ const textBlock = response.content.find((block) => block.type === "text");
383
+ return textBlock?.type === "text" ? textBlock.text : "";
384
+ }
385
+ /**
386
+ * Produce a single embedding vector via the Voyage API.
387
+ *
388
+ * Anthropic does not ship a first-party embeddings endpoint, so we delegate
389
+ * to Voyage (their recommended partner). Requires VOYAGE_API_KEY.
390
+ */
391
+ async embed(text) {
392
+ const apiKey = process.env.VOYAGE_API_KEY?.trim();
393
+ if (!apiKey) {
394
+ throw new Error(
395
+ "VOYAGE_API_KEY is not set. Anthropic embeddings use Voyage \u2014 set VOYAGE_API_KEY to enable semantic search."
396
+ );
397
+ }
398
+ const response = await fetch(VOYAGE_EMBEDDINGS_URL, {
399
+ method: "POST",
400
+ headers: {
401
+ "Content-Type": "application/json",
402
+ Authorization: `Bearer ${apiKey}`
403
+ },
404
+ body: JSON.stringify({ input: text, model: EMBEDDING_MODELS.anthropic })
405
+ });
406
+ if (!response.ok) {
407
+ const detail = await response.text();
408
+ throw new Error(`Voyage embeddings request failed (${response.status}): ${detail}`);
409
+ }
410
+ const json = await response.json();
411
+ const vector = json.data?.[0]?.embedding;
412
+ if (!Array.isArray(vector)) {
413
+ throw new Error("Voyage embeddings response did not include a vector.");
414
+ }
415
+ return vector;
416
+ }
417
+ };
418
+
419
+ // src/utils/claude-settings.ts
420
+ import { readFileSync } from "fs";
421
+ import { homedir } from "os";
422
+ import path4 from "path";
423
+ var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
424
+ function isRecord(value) {
425
+ return typeof value === "object" && value !== null;
426
+ }
427
+ function normalize(value) {
428
+ if (typeof value !== "string") return void 0;
429
+ const trimmed = value.trim();
430
+ return trimmed.length > 0 ? trimmed : void 0;
431
+ }
432
+ function resolveClaudeSettingsPath(env) {
433
+ return env[CLAUDE_SETTINGS_PATH_ENV] ?? path4.join(homedir(), ".claude", "settings.json");
434
+ }
435
+ function readClaudeSettingsFile(settingsPath) {
436
+ try {
437
+ return readFileSync(settingsPath, "utf8");
438
+ } catch (err) {
439
+ if (isRecord(err) && err.code === "ENOENT") {
440
+ return void 0;
441
+ }
442
+ const message = err instanceof Error ? err.message : String(err);
443
+ throw new Error(`Failed to read Claude settings at "${settingsPath}": ${message}`);
444
+ }
445
+ }
446
+ function readClaudeSettingsEnv(env = process.env) {
447
+ const settingsPath = resolveClaudeSettingsPath(env);
448
+ const raw = readClaudeSettingsFile(settingsPath);
449
+ if (!raw) return void 0;
450
+ let parsed;
451
+ try {
452
+ parsed = JSON.parse(raw);
453
+ } catch (err) {
454
+ const message = err instanceof Error ? err.message : String(err);
455
+ throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${message}`);
456
+ }
457
+ if (!isRecord(parsed) || !isRecord(parsed.env)) {
458
+ return void 0;
459
+ }
460
+ const values = {
461
+ ANTHROPIC_API_KEY: normalize(parsed.env.ANTHROPIC_API_KEY),
462
+ ANTHROPIC_AUTH_TOKEN: normalize(parsed.env.ANTHROPIC_AUTH_TOKEN),
463
+ ANTHROPIC_BASE_URL: normalize(parsed.env.ANTHROPIC_BASE_URL),
464
+ ANTHROPIC_MODEL: normalize(parsed.env.ANTHROPIC_MODEL)
465
+ };
466
+ if (!values.ANTHROPIC_API_KEY && !values.ANTHROPIC_AUTH_TOKEN && !values.ANTHROPIC_BASE_URL && !values.ANTHROPIC_MODEL) {
467
+ return void 0;
468
+ }
469
+ return values;
470
+ }
471
+ function tryReadClaudeSettingsEnv(env) {
472
+ try {
473
+ return readClaudeSettingsEnv(env);
474
+ } catch {
475
+ return void 0;
476
+ }
477
+ }
478
+ function validateAnthropicBaseURL(value) {
479
+ const normalized = value.trim();
480
+ try {
481
+ const parsed = new URL(normalized);
482
+ if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
483
+ throw new Error("Must use http:// or https:// protocol.");
484
+ }
485
+ } catch (err) {
486
+ const message = err instanceof Error ? err.message : "Must be a valid http(s) URL.";
487
+ throw new Error(`Invalid ANTHROPIC_BASE_URL: "${normalized}". ${message}`);
488
+ }
489
+ return normalized;
490
+ }
491
+ function resolveAnthropicAuthFromEnv(env = process.env) {
492
+ const explicitApiKey = normalize(env.ANTHROPIC_API_KEY);
493
+ if (explicitApiKey) return { apiKey: explicitApiKey };
494
+ const explicitAuthToken = normalize(env.ANTHROPIC_AUTH_TOKEN);
495
+ if (explicitAuthToken) return { authToken: explicitAuthToken };
496
+ const fallback = readClaudeSettingsEnv(env);
497
+ if (fallback?.ANTHROPIC_API_KEY) return { apiKey: fallback.ANTHROPIC_API_KEY };
498
+ if (fallback?.ANTHROPIC_AUTH_TOKEN) return { authToken: fallback.ANTHROPIC_AUTH_TOKEN };
499
+ return {};
500
+ }
501
+ function resolveAnthropicModelFromEnv(env = process.env) {
502
+ const explicitModel = env.LLMWIKI_MODEL;
503
+ if (explicitModel !== void 0) return explicitModel;
504
+ return tryReadClaudeSettingsEnv(env)?.ANTHROPIC_MODEL;
505
+ }
506
+ function resolveAnthropicBaseURLFromEnv(env = process.env) {
507
+ const explicitBaseURL = normalize(env.ANTHROPIC_BASE_URL);
508
+ if (explicitBaseURL) return validateAnthropicBaseURL(explicitBaseURL);
509
+ const fallbackBaseURL = tryReadClaudeSettingsEnv(env)?.ANTHROPIC_BASE_URL;
510
+ if (!fallbackBaseURL) return void 0;
511
+ return validateAnthropicBaseURL(fallbackBaseURL);
512
+ }
513
+
514
+ // src/ingest/image.ts
515
+ var EXTENSION_TO_MIME = {
516
+ ".jpg": "image/jpeg",
517
+ ".jpeg": "image/jpeg",
518
+ ".png": "image/png",
519
+ ".gif": "image/gif",
520
+ ".webp": "image/webp"
521
+ };
522
+ function mimeTypeForExtension(ext) {
523
+ const mimeType = EXTENSION_TO_MIME[ext.toLowerCase()];
524
+ if (!mimeType) {
525
+ throw new Error(
526
+ `Unsupported image extension "${ext}". Supported: ${Object.keys(EXTENSION_TO_MIME).join(", ")}`
527
+ );
528
+ }
529
+ return mimeType;
530
+ }
531
+ function buildClient() {
532
+ const baseURL = resolveAnthropicBaseURLFromEnv();
533
+ const auth = resolveAnthropicAuthFromEnv();
534
+ return new Anthropic2(buildAnthropicClientOptions({ baseURL, ...auth }));
535
+ }
536
+ async function describeImageWithVision(client, model, imageData, mimeType) {
537
+ const response = await client.messages.create({
538
+ model,
539
+ max_tokens: IMAGE_DESCRIBE_MAX_TOKENS,
540
+ messages: [
541
+ {
542
+ role: "user",
543
+ content: [
544
+ {
545
+ type: "image",
546
+ source: { type: "base64", media_type: mimeType, data: imageData }
547
+ },
548
+ {
549
+ type: "text",
550
+ text: "Extract and transcribe all text visible in this image. Then provide a detailed description of any non-text visual content. Format your response as markdown."
551
+ }
552
+ ]
553
+ }
554
+ ]
555
+ });
556
+ const textBlock = response.content.find((block) => block.type === "text");
557
+ return textBlock?.type === "text" ? textBlock.text : "";
558
+ }
559
+ async function ingestImage(filePath) {
560
+ const providerName = process.env.LLMWIKI_PROVIDER ?? "anthropic";
561
+ if (providerName !== "anthropic") {
562
+ throw new Error(
563
+ `Image ingest requires the Anthropic provider (vision). Current provider: "${providerName}". Set LLMWIKI_PROVIDER=anthropic and ANTHROPIC_API_KEY to use image ingest.`
564
+ );
565
+ }
566
+ const ext = path5.extname(filePath).toLowerCase();
567
+ const mimeType = mimeTypeForExtension(ext);
568
+ const imageBuffer = await readFile4(filePath);
569
+ const imageData = imageBuffer.toString("base64");
570
+ const client = buildClient();
571
+ const model = resolveAnthropicModelFromEnv() ?? PROVIDER_MODELS.anthropic;
572
+ const content = await describeImageWithVision(client, model, imageData, mimeType);
573
+ const title = titleFromFilename(filePath);
574
+ return { title, content };
575
+ }
576
+
577
+ // src/ingest/transcript.ts
578
+ import { readFile as readFile5 } from "fs/promises";
579
+ import path6 from "path";
580
+ import { YoutubeTranscript } from "youtube-transcript";
581
+ var YOUTUBE_URL_PATTERN = /^https?:\/\/(www\.)?(youtube\.com\/watch|youtu\.be\/)/;
582
+ var SRT_SEQUENCE_PATTERN = /^\d+$/;
583
+ var TIMESTAMP_PATTERN = /\d{2}:\d{2}[:.]\d{2}/;
584
+ var MS_PER_MINUTE = 6e4;
585
+ var MS_PER_SECOND = 1e3;
586
+ function isYoutubeUrl(source2) {
587
+ return YOUTUBE_URL_PATTERN.test(source2);
588
+ }
589
+ function extractVideoId(url) {
590
+ const match = url.match(/(?:v=|youtu\.be\/)([^&?/]+)/);
591
+ if (!match) {
592
+ throw new Error(`Could not extract video ID from YouTube URL: ${url}`);
593
+ }
594
+ return match[1];
595
+ }
596
+ function formatOffset(offsetMs) {
597
+ const minutes = Math.floor(offsetMs / MS_PER_MINUTE);
598
+ const seconds = Math.floor(offsetMs % MS_PER_MINUTE / MS_PER_SECOND);
599
+ return `${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")}`;
600
+ }
601
+ async function fetchYoutubeTranscript(url) {
602
+ const videoId = extractVideoId(url);
603
+ const segments = await YoutubeTranscript.fetchTranscript(videoId);
604
+ if (!segments || segments.length === 0) {
605
+ throw new Error(`No transcript available for YouTube video: ${url}`);
606
+ }
607
+ const lines = segments.map((seg) => `[${formatOffset(seg.offset)}] ${seg.text}`);
608
+ return {
609
+ title: `YouTube Transcript ${videoId}`,
610
+ content: lines.join("\n")
611
+ };
612
+ }
613
+ function isCueTimestamp(trimmed) {
614
+ return TIMESTAMP_PATTERN.test(trimmed) && trimmed.includes("-->");
615
+ }
616
+ function parseVtt(raw, filePath) {
617
+ const lines = raw.split("\n");
618
+ const output = [];
619
+ let inCue = false;
620
+ for (const line of lines) {
621
+ const trimmed = line.trim();
622
+ if (trimmed === "WEBVTT" || trimmed === "") {
623
+ inCue = false;
624
+ continue;
625
+ }
626
+ if (isCueTimestamp(trimmed)) {
627
+ output.push(`
628
+ **[${trimmed}]**`);
629
+ inCue = true;
630
+ continue;
631
+ }
632
+ if (inCue && trimmed.length > 0) {
633
+ output.push(trimmed);
634
+ }
635
+ }
636
+ return { title: titleFromFilename(filePath), content: output.join("\n").trim() };
637
+ }
638
+ function parseSrt(raw, filePath) {
639
+ const lines = raw.split("\n");
640
+ const output = [];
641
+ for (const line of lines) {
642
+ const trimmed = line.trim();
643
+ if (trimmed === "" || SRT_SEQUENCE_PATTERN.test(trimmed)) {
644
+ continue;
645
+ }
646
+ if (isCueTimestamp(trimmed)) {
647
+ output.push(`
648
+ **[${trimmed}]**`);
649
+ continue;
650
+ }
651
+ if (trimmed.length > 0) {
652
+ output.push(trimmed);
653
+ }
654
+ }
655
+ return { title: titleFromFilename(filePath), content: output.join("\n").trim() };
656
+ }
657
+ function parsePlainTranscript(raw, filePath) {
658
+ return { title: titleFromFilename(filePath), content: raw.trim() };
659
+ }
660
+ async function ingestTranscript(source2) {
661
+ if (isYoutubeUrl(source2)) {
662
+ return fetchYoutubeTranscript(source2);
663
+ }
664
+ const ext = path6.extname(source2).toLowerCase();
665
+ const raw = await readFile5(source2, "utf-8");
666
+ if (ext === ".vtt") return parseVtt(raw, source2);
667
+ if (ext === ".srt") return parseSrt(raw, source2);
668
+ if (ext === ".txt") return parsePlainTranscript(raw, source2);
669
+ throw new Error(
670
+ `Unsupported transcript file type "${ext}". Supported: .vtt, .srt, .txt`
671
+ );
672
+ }
673
+
264
674
  // src/commands/ingest.ts
265
675
  function isUrl(source2) {
266
676
  return source2.startsWith("http://") || source2.startsWith("https://");
267
677
  }
678
+ var TXT_SNIFF_BYTES = 2048;
679
+ var SPEAKER_TAG_PATTERN = /^([A-Z][a-zA-Z .'-]{0,40}):\s/gm;
680
+ var TIMESTAMP_PATTERN2 = /^\s*\d{1,2}:\d{2}(:\d{2})?/;
681
+ var MIN_TIMESTAMP_MATCHES = 3;
682
+ var MIN_SPEAKER_REPEAT_COUNT = 2;
683
+ var MIN_DISTINCT_SPEAKERS = 2;
684
+ function countSpeakerOccurrences(sample) {
685
+ const counts = /* @__PURE__ */ new Map();
686
+ SPEAKER_TAG_PATTERN.lastIndex = 0;
687
+ let match;
688
+ while ((match = SPEAKER_TAG_PATTERN.exec(sample)) !== null) {
689
+ const name = match[1].trim();
690
+ counts.set(name, (counts.get(name) ?? 0) + 1);
691
+ }
692
+ return counts;
693
+ }
694
+ function hasSpeakerDialoguePattern(sample) {
695
+ const counts = countSpeakerOccurrences(sample);
696
+ const distinctSpeakers = counts.size;
697
+ const hasEnoughSpeakers = distinctSpeakers >= MIN_DISTINCT_SPEAKERS;
698
+ const hasRepeatedSpeaker = [...counts.values()].some(
699
+ (n) => n >= MIN_SPEAKER_REPEAT_COUNT
700
+ );
701
+ return hasEnoughSpeakers && hasRepeatedSpeaker;
702
+ }
703
+ async function looksLikeTxtTranscript(filePath) {
704
+ const raw = await readFile6(filePath, "utf-8");
705
+ const sample = raw.slice(0, TXT_SNIFF_BYTES);
706
+ if (hasSpeakerDialoguePattern(sample)) return true;
707
+ const timestampMatches = sample.match(new RegExp(TIMESTAMP_PATTERN2.source, "gm"));
708
+ return (timestampMatches?.length ?? 0) >= MIN_TIMESTAMP_MATCHES;
709
+ }
268
710
  function enforceCharLimit(content) {
269
711
  if (content.length <= MAX_SOURCE_CHARS) {
270
712
  return { content, truncated: false, originalChars: content.length };
@@ -297,12 +739,30 @@ function enforceMinContent(content) {
297
739
  );
298
740
  }
299
741
  }
300
- function buildDocument(title, source2, result) {
742
+ async function detectSourceType(source2) {
743
+ if (!isUrl(source2)) {
744
+ const ext = path7.extname(source2).toLowerCase();
745
+ if (ext === ".pdf") return "pdf";
746
+ if (IMAGE_EXTENSIONS.has(ext)) return "image";
747
+ if (TRANSCRIPT_EXTENSIONS.has(ext)) return "transcript";
748
+ if (ext === ".txt") {
749
+ const isTranscript = await looksLikeTxtTranscript(source2);
750
+ return isTranscript ? "transcript" : "file";
751
+ }
752
+ return "file";
753
+ }
754
+ if (isYoutubeUrl(source2)) return "transcript";
755
+ return "web";
756
+ }
757
+ function buildDocument(title, source2, result, sourceType) {
301
758
  const meta = {
302
759
  title,
303
760
  source: source2,
304
761
  ingestedAt: (/* @__PURE__ */ new Date()).toISOString()
305
762
  };
763
+ if (sourceType !== void 0) {
764
+ meta.sourceType = sourceType;
765
+ }
306
766
  if (result.truncated) {
307
767
  meta.truncated = true;
308
768
  meta.originalChars = result.originalChars;
@@ -313,30 +773,46 @@ function buildDocument(title, source2, result) {
313
773
  ${result.content}
314
774
  `;
315
775
  }
776
+ async function fetchContent(source2, sourceType) {
777
+ switch (sourceType) {
778
+ case "web":
779
+ return ingestWeb(source2);
780
+ case "pdf":
781
+ return ingestPdf(source2);
782
+ case "image":
783
+ return ingestImage(source2);
784
+ case "transcript":
785
+ return ingestTranscript(source2);
786
+ case "file":
787
+ return ingestFile(source2);
788
+ }
789
+ }
316
790
  async function saveSource(title, document) {
317
791
  const filename = `${slugify(title)}.md`;
318
- const destPath = path3.join(SOURCES_DIR, filename);
792
+ const destPath = path7.join(SOURCES_DIR, filename);
319
793
  await mkdir2(SOURCES_DIR, { recursive: true });
320
794
  await writeFile2(destPath, document, "utf-8");
321
795
  return destPath;
322
796
  }
323
797
  async function ingestSource(source2) {
324
- status("*", info(`Ingesting: ${source2}`));
325
- const { title, content } = isUrl(source2) ? await ingestWeb(source2) : await ingestFile(source2);
798
+ const sourceType = await detectSourceType(source2);
799
+ status("*", info(`Ingesting [${sourceType}]: ${source2}`));
800
+ const { title, content } = await fetchContent(source2, sourceType);
326
801
  const result = enforceCharLimit(content);
327
802
  enforceMinContent(result.content);
328
- const document = buildDocument(title, source2, result);
803
+ const document = buildDocument(title, source2, result, sourceType);
329
804
  const savedPath = await saveSource(title, document);
330
805
  return {
331
- filename: path3.basename(savedPath),
806
+ filename: path7.basename(savedPath),
332
807
  charCount: result.content.length,
333
808
  truncated: result.truncated,
334
- source: source2
809
+ source: source2,
810
+ sourceType
335
811
  };
336
812
  }
337
813
  async function ingest(source2) {
338
814
  const result = await ingestSource(source2);
339
- const savedPath = path3.join(SOURCES_DIR, result.filename);
815
+ const savedPath = path7.join(SOURCES_DIR, result.filename);
340
816
  status(
341
817
  "+",
342
818
  success(`Saved ${bold(result.filename)} \u2192 ${source(savedPath)}`)
@@ -348,23 +824,23 @@ async function ingest(source2) {
348
824
  import { existsSync as existsSync7 } from "fs";
349
825
 
350
826
  // src/compiler/index.ts
351
- import { readFile as readFile10 } from "fs/promises";
352
- import path18 from "path";
827
+ import { readFile as readFile14 } from "fs/promises";
828
+ import path21 from "path";
353
829
 
354
830
  // src/utils/state.ts
355
- import { readFile as readFile3, writeFile as writeFile3, rename as rename2, mkdir as mkdir3, copyFile } from "fs/promises";
831
+ import { readFile as readFile7, writeFile as writeFile3, rename as rename2, mkdir as mkdir3, copyFile } from "fs/promises";
356
832
  import { existsSync } from "fs";
357
- import path4 from "path";
833
+ import path8 from "path";
358
834
  function emptyState() {
359
835
  return { version: 1, indexHash: "", sources: {} };
360
836
  }
361
837
  async function readState(root) {
362
- const filePath = path4.join(root, STATE_FILE);
838
+ const filePath = path8.join(root, STATE_FILE);
363
839
  if (!existsSync(filePath)) {
364
840
  return emptyState();
365
841
  }
366
842
  try {
367
- const raw = await readFile3(filePath, "utf-8");
843
+ const raw = await readFile7(filePath, "utf-8");
368
844
  return JSON.parse(raw);
369
845
  } catch {
370
846
  const bakPath = filePath + ".bak";
@@ -374,9 +850,9 @@ async function readState(root) {
374
850
  }
375
851
  }
376
852
  async function writeState(root, state) {
377
- const dir = path4.join(root, LLMWIKI_DIR);
853
+ const dir = path8.join(root, LLMWIKI_DIR);
378
854
  await mkdir3(dir, { recursive: true });
379
- const filePath = path4.join(root, STATE_FILE);
855
+ const filePath = path8.join(root, STATE_FILE);
380
856
  const tmpPath = filePath + ".tmp";
381
857
  await writeFile3(tmpPath, JSON.stringify(state, null, 2), "utf-8");
382
858
  await rename2(tmpPath, filePath);
@@ -393,18 +869,18 @@ async function removeSourceState(root, sourceFile) {
393
869
  }
394
870
 
395
871
  // src/compiler/source-state.ts
396
- import path6 from "path";
872
+ import path10 from "path";
397
873
 
398
874
  // src/compiler/hasher.ts
399
875
  import { createHash } from "crypto";
400
- import { readFile as readFile4, readdir } from "fs/promises";
401
- import path5 from "path";
876
+ import { readFile as readFile8, readdir } from "fs/promises";
877
+ import path9 from "path";
402
878
  async function hashFile(filePath) {
403
- const content = await readFile4(filePath, "utf-8");
879
+ const content = await readFile8(filePath, "utf-8");
404
880
  return createHash("sha256").update(content).digest("hex");
405
881
  }
406
882
  async function detectChanges(root, prevState) {
407
- const sourcesPath = path5.join(root, SOURCES_DIR);
883
+ const sourcesPath = path9.join(root, SOURCES_DIR);
408
884
  const currentFiles = await listSourceFiles(sourcesPath);
409
885
  const changes = [];
410
886
  for (const file of currentFiles) {
@@ -424,7 +900,7 @@ async function listSourceFiles(sourcesPath) {
424
900
  }
425
901
  }
426
902
  async function classifyFile(root, file, prevState) {
427
- const filePath = path5.join(root, SOURCES_DIR, file);
903
+ const filePath = path9.join(root, SOURCES_DIR, file);
428
904
  const hash = await hashFile(filePath);
429
905
  const prev = prevState.sources[file];
430
906
  if (!prev) return "new";
@@ -447,133 +923,22 @@ async function buildExtractionSourceStates(root, extractions) {
447
923
  return snapshot;
448
924
  }
449
925
  async function buildEntry(root, result, compiledAt) {
450
- const filePath = path6.join(root, SOURCES_DIR, result.sourceFile);
926
+ const filePath = path10.join(root, SOURCES_DIR, result.sourceFile);
451
927
  const hash = await hashFile(filePath);
452
- return {
453
- hash,
454
- concepts: result.concepts.map((concept) => slugify(concept.concept)),
455
- compiledAt
456
- };
457
- }
458
- function pickStatesForSources(allStates, sourceFiles) {
459
- const picked = {};
460
- for (const file of sourceFiles) {
461
- const entry = allStates[file];
462
- if (entry) picked[file] = entry;
463
- }
464
- return picked;
465
- }
466
-
467
- // src/providers/anthropic.ts
468
- import Anthropic from "@anthropic-ai/sdk";
469
- var VOYAGE_EMBEDDINGS_URL = "https://api.voyageai.com/v1/embeddings";
470
- function buildAnthropicClientOptions(options = {}) {
471
- const trimmedBaseURL = options.baseURL?.trim();
472
- const trimmedApiKey = options.apiKey?.trim();
473
- const trimmedAuthToken = options.authToken?.trim();
474
- const result = {};
475
- if (trimmedApiKey) {
476
- result.apiKey = trimmedApiKey;
477
- }
478
- if (trimmedAuthToken) {
479
- result.authToken = trimmedAuthToken;
480
- }
481
- if (!trimmedBaseURL) {
482
- return result;
483
- }
484
- const normalizedBaseURL = trimmedBaseURL.endsWith("/") && trimmedBaseURL.length > 1 ? trimmedBaseURL.slice(0, -1) : trimmedBaseURL;
485
- result.baseURL = normalizedBaseURL;
486
- return result;
487
- }
488
- var AnthropicProvider = class {
489
- client;
490
- model;
491
- constructor(model, options = {}) {
492
- this.model = model;
493
- this.client = new Anthropic(buildAnthropicClientOptions(options));
494
- }
495
- /** Send a single non-streaming completion request. */
496
- async complete(system, messages, maxTokens) {
497
- const response = await this.client.messages.create({
498
- model: this.model,
499
- max_tokens: maxTokens,
500
- system,
501
- messages
502
- });
503
- const textBlock = response.content.find((block) => block.type === "text");
504
- return textBlock?.type === "text" ? textBlock.text : "";
505
- }
506
- /** Stream a completion, invoking onToken for each text chunk. */
507
- async stream(system, messages, maxTokens, onToken) {
508
- const stream = this.client.messages.stream({
509
- model: this.model,
510
- max_tokens: maxTokens,
511
- system,
512
- messages
513
- });
514
- let fullText = "";
515
- for await (const event of stream) {
516
- if (event.type === "content_block_delta" && event.delta.type === "text_delta") {
517
- fullText += event.delta.text;
518
- onToken?.(event.delta.text);
519
- }
520
- }
521
- return fullText;
522
- }
523
- /** Call Claude with tool definitions and return the parsed tool input as JSON. */
524
- async toolCall(system, messages, tools, maxTokens) {
525
- const anthropicTools = tools.map((t) => ({
526
- name: t.name,
527
- description: t.description,
528
- input_schema: t.input_schema
529
- }));
530
- const response = await this.client.messages.create({
531
- model: this.model,
532
- max_tokens: maxTokens,
533
- system,
534
- messages,
535
- tools: anthropicTools
536
- });
537
- const toolBlock = response.content.find((block) => block.type === "tool_use");
538
- if (toolBlock?.type === "tool_use") {
539
- return JSON.stringify(toolBlock.input);
540
- }
541
- const textBlock = response.content.find((block) => block.type === "text");
542
- return textBlock?.type === "text" ? textBlock.text : "";
543
- }
544
- /**
545
- * Produce a single embedding vector via the Voyage API.
546
- *
547
- * Anthropic does not ship a first-party embeddings endpoint, so we delegate
548
- * to Voyage (their recommended partner). Requires VOYAGE_API_KEY.
549
- */
550
- async embed(text) {
551
- const apiKey = process.env.VOYAGE_API_KEY?.trim();
552
- if (!apiKey) {
553
- throw new Error(
554
- "VOYAGE_API_KEY is not set. Anthropic embeddings use Voyage \u2014 set VOYAGE_API_KEY to enable semantic search."
555
- );
556
- }
557
- const response = await fetch(VOYAGE_EMBEDDINGS_URL, {
558
- method: "POST",
559
- headers: {
560
- "Content-Type": "application/json",
561
- Authorization: `Bearer ${apiKey}`
562
- },
563
- body: JSON.stringify({ input: text, model: EMBEDDING_MODELS.anthropic })
564
- });
565
- if (!response.ok) {
566
- const detail = await response.text();
567
- throw new Error(`Voyage embeddings request failed (${response.status}): ${detail}`);
568
- }
569
- const json = await response.json();
570
- const vector = json.data?.[0]?.embedding;
571
- if (!Array.isArray(vector)) {
572
- throw new Error("Voyage embeddings response did not include a vector.");
573
- }
574
- return vector;
928
+ return {
929
+ hash,
930
+ concepts: result.concepts.map((concept) => slugify(concept.concept)),
931
+ compiledAt
932
+ };
933
+ }
934
+ function pickStatesForSources(allStates, sourceFiles) {
935
+ const picked = {};
936
+ for (const file of sourceFiles) {
937
+ const entry = allStates[file];
938
+ if (entry) picked[file] = entry;
575
939
  }
576
- };
940
+ return picked;
941
+ }
577
942
 
578
943
  // src/providers/openai.ts
579
944
  import OpenAI from "openai";
@@ -704,101 +1069,6 @@ var MiniMaxProvider = class extends OpenAIProvider {
704
1069
  }
705
1070
  };
706
1071
 
707
- // src/utils/claude-settings.ts
708
- import { readFileSync } from "fs";
709
- import { homedir } from "os";
710
- import path7 from "path";
711
- var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
712
- function isRecord(value) {
713
- return typeof value === "object" && value !== null;
714
- }
715
- function normalize(value) {
716
- if (typeof value !== "string") return void 0;
717
- const trimmed = value.trim();
718
- return trimmed.length > 0 ? trimmed : void 0;
719
- }
720
- function resolveClaudeSettingsPath(env) {
721
- return env[CLAUDE_SETTINGS_PATH_ENV] ?? path7.join(homedir(), ".claude", "settings.json");
722
- }
723
- function readClaudeSettingsFile(settingsPath) {
724
- try {
725
- return readFileSync(settingsPath, "utf8");
726
- } catch (err) {
727
- if (isRecord(err) && err.code === "ENOENT") {
728
- return void 0;
729
- }
730
- const message = err instanceof Error ? err.message : String(err);
731
- throw new Error(`Failed to read Claude settings at "${settingsPath}": ${message}`);
732
- }
733
- }
734
- function readClaudeSettingsEnv(env = process.env) {
735
- const settingsPath = resolveClaudeSettingsPath(env);
736
- const raw = readClaudeSettingsFile(settingsPath);
737
- if (!raw) return void 0;
738
- let parsed;
739
- try {
740
- parsed = JSON.parse(raw);
741
- } catch (err) {
742
- const message = err instanceof Error ? err.message : String(err);
743
- throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${message}`);
744
- }
745
- if (!isRecord(parsed) || !isRecord(parsed.env)) {
746
- return void 0;
747
- }
748
- const values = {
749
- ANTHROPIC_API_KEY: normalize(parsed.env.ANTHROPIC_API_KEY),
750
- ANTHROPIC_AUTH_TOKEN: normalize(parsed.env.ANTHROPIC_AUTH_TOKEN),
751
- ANTHROPIC_BASE_URL: normalize(parsed.env.ANTHROPIC_BASE_URL),
752
- ANTHROPIC_MODEL: normalize(parsed.env.ANTHROPIC_MODEL)
753
- };
754
- if (!values.ANTHROPIC_API_KEY && !values.ANTHROPIC_AUTH_TOKEN && !values.ANTHROPIC_BASE_URL && !values.ANTHROPIC_MODEL) {
755
- return void 0;
756
- }
757
- return values;
758
- }
759
- function tryReadClaudeSettingsEnv(env) {
760
- try {
761
- return readClaudeSettingsEnv(env);
762
- } catch {
763
- return void 0;
764
- }
765
- }
766
- function validateAnthropicBaseURL(value) {
767
- const normalized = value.trim();
768
- try {
769
- const parsed = new URL(normalized);
770
- if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
771
- throw new Error("Must use http:// or https:// protocol.");
772
- }
773
- } catch (err) {
774
- const message = err instanceof Error ? err.message : "Must be a valid http(s) URL.";
775
- throw new Error(`Invalid ANTHROPIC_BASE_URL: "${normalized}". ${message}`);
776
- }
777
- return normalized;
778
- }
779
- function resolveAnthropicAuthFromEnv(env = process.env) {
780
- const explicitApiKey = normalize(env.ANTHROPIC_API_KEY);
781
- if (explicitApiKey) return { apiKey: explicitApiKey };
782
- const explicitAuthToken = normalize(env.ANTHROPIC_AUTH_TOKEN);
783
- if (explicitAuthToken) return { authToken: explicitAuthToken };
784
- const fallback = readClaudeSettingsEnv(env);
785
- if (fallback?.ANTHROPIC_API_KEY) return { apiKey: fallback.ANTHROPIC_API_KEY };
786
- if (fallback?.ANTHROPIC_AUTH_TOKEN) return { authToken: fallback.ANTHROPIC_AUTH_TOKEN };
787
- return {};
788
- }
789
- function resolveAnthropicModelFromEnv(env = process.env) {
790
- const explicitModel = env.LLMWIKI_MODEL;
791
- if (explicitModel !== void 0) return explicitModel;
792
- return tryReadClaudeSettingsEnv(env)?.ANTHROPIC_MODEL;
793
- }
794
- function resolveAnthropicBaseURLFromEnv(env = process.env) {
795
- const explicitBaseURL = normalize(env.ANTHROPIC_BASE_URL);
796
- if (explicitBaseURL) return validateAnthropicBaseURL(explicitBaseURL);
797
- const fallbackBaseURL = tryReadClaudeSettingsEnv(env)?.ANTHROPIC_BASE_URL;
798
- if (!fallbackBaseURL) return void 0;
799
- return validateAnthropicBaseURL(fallbackBaseURL);
800
- }
801
-
802
1072
  // src/utils/provider.ts
803
1073
  var SUPPORTED_PROVIDERS = /* @__PURE__ */ new Set(["anthropic", "openai", "ollama", "minimax"]);
804
1074
  function getProvider() {
@@ -891,8 +1161,8 @@ async function callClaude(options) {
891
1161
  }
892
1162
 
893
1163
  // src/utils/lock.ts
894
- import { open, readFile as readFile5, unlink, mkdir as mkdir4 } from "fs/promises";
895
- import path8 from "path";
1164
+ import { open, readFile as readFile9, unlink, mkdir as mkdir4 } from "fs/promises";
1165
+ import path11 from "path";
896
1166
  var RECLAIM_SUFFIX = ".reclaim";
897
1167
  var MAX_ACQUIRE_ATTEMPTS = 2;
898
1168
  function isProcessAlive(pid) {
@@ -904,8 +1174,8 @@ function isProcessAlive(pid) {
904
1174
  }
905
1175
  }
906
1176
  async function acquireLock(root) {
907
- const lockPath = path8.join(root, LOCK_FILE);
908
- await mkdir4(path8.join(root, LLMWIKI_DIR), { recursive: true });
1177
+ const lockPath = path11.join(root, LOCK_FILE);
1178
+ await mkdir4(path11.join(root, LLMWIKI_DIR), { recursive: true });
909
1179
  for (let attempt = 0; attempt < MAX_ACQUIRE_ATTEMPTS; attempt++) {
910
1180
  const created = await tryCreateLock(lockPath);
911
1181
  if (created) return true;
@@ -968,7 +1238,7 @@ async function tryCreateLock(lockPath) {
968
1238
  }
969
1239
  async function isLockStale(lockPath) {
970
1240
  try {
971
- const content = await readFile5(lockPath, "utf-8");
1241
+ const content = await readFile9(lockPath, "utf-8");
972
1242
  const pid = parseInt(content.trim(), 10);
973
1243
  if (isNaN(pid)) return true;
974
1244
  return !isProcessAlive(pid);
@@ -977,7 +1247,7 @@ async function isLockStale(lockPath) {
977
1247
  }
978
1248
  }
979
1249
  async function releaseLock(root) {
980
- const lockPath = path8.join(root, LOCK_FILE);
1250
+ const lockPath = path11.join(root, LOCK_FILE);
981
1251
  try {
982
1252
  await unlink(lockPath);
983
1253
  } catch {
@@ -1220,8 +1490,8 @@ function buildDefaultSchema() {
1220
1490
 
1221
1491
  // src/schema/loader.ts
1222
1492
  import { existsSync as existsSync2 } from "fs";
1223
- import { readFile as readFile6 } from "fs/promises";
1224
- import path9 from "path";
1493
+ import { readFile as readFile10 } from "fs/promises";
1494
+ import path12 from "path";
1225
1495
  import yaml2 from "js-yaml";
1226
1496
  var SCHEMA_CANDIDATE_PATHS = [
1227
1497
  ".llmwiki/schema.json",
@@ -1232,7 +1502,7 @@ var SCHEMA_CANDIDATE_PATHS = [
1232
1502
  ];
1233
1503
  function findSchemaPath(root) {
1234
1504
  for (const candidate of SCHEMA_CANDIDATE_PATHS) {
1235
- const absolute = path9.join(root, candidate);
1505
+ const absolute = path12.join(root, candidate);
1236
1506
  if (existsSync2(absolute)) return absolute;
1237
1507
  }
1238
1508
  return null;
@@ -1285,12 +1555,12 @@ async function loadSchema(root) {
1285
1555
  const defaults = buildDefaultSchema();
1286
1556
  const schemaPath = findSchemaPath(root);
1287
1557
  if (!schemaPath) return defaults;
1288
- const raw = await readFile6(schemaPath, "utf-8");
1558
+ const raw = await readFile10(schemaPath, "utf-8");
1289
1559
  const parsed = parseSchemaFile(schemaPath, raw);
1290
1560
  return applyOverrides(defaults, parsed, schemaPath);
1291
1561
  }
1292
1562
  function defaultSchemaInitPath(root) {
1293
- return path9.join(root, SCHEMA_CANDIDATE_PATHS[0]);
1563
+ return path12.join(root, SCHEMA_CANDIDATE_PATHS[0]);
1294
1564
  }
1295
1565
 
1296
1566
  // src/schema/helpers.ts
@@ -1462,7 +1732,7 @@ async function freezeFailedExtractions(root, results, frozenSlugs) {
1462
1732
  }
1463
1733
 
1464
1734
  // src/compiler/orphan.ts
1465
- import path10 from "path";
1735
+ import path13 from "path";
1466
1736
  async function markOrphaned(root, sourceFile, state) {
1467
1737
  const sourceEntry = state.sources[sourceFile];
1468
1738
  if (!sourceEntry) return;
@@ -1488,7 +1758,7 @@ async function orphanUnownedFrozenPages(root, frozenSlugs) {
1488
1758
  }
1489
1759
  }
1490
1760
  async function orphanPage(root, slug, reason) {
1491
- const pagePath = path10.join(root, CONCEPTS_DIR, `${slug}.md`);
1761
+ const pagePath = path13.join(root, CONCEPTS_DIR, `${slug}.md`);
1492
1762
  const content = await safeReadFile(pagePath);
1493
1763
  if (!content) return;
1494
1764
  const { meta } = parseFrontmatter(content);
@@ -1499,18 +1769,18 @@ async function orphanPage(root, slug, reason) {
1499
1769
  }
1500
1770
 
1501
1771
  // src/compiler/resolver.ts
1502
- import { readdir as readdir2, readFile as readFile7 } from "fs/promises";
1503
- import path11 from "path";
1772
+ import { readdir as readdir2, readFile as readFile11 } from "fs/promises";
1773
+ import path14 from "path";
1504
1774
  import { existsSync as existsSync3 } from "fs";
1505
1775
  async function buildTitleIndex(root) {
1506
- const conceptsDir = path11.join(root, CONCEPTS_DIR);
1776
+ const conceptsDir = path14.join(root, CONCEPTS_DIR);
1507
1777
  if (!existsSync3(conceptsDir)) return [];
1508
1778
  const files = await readdir2(conceptsDir);
1509
1779
  const pages = [];
1510
1780
  for (const file of files) {
1511
1781
  if (!file.endsWith(".md")) continue;
1512
- const filePath = path11.join(conceptsDir, file);
1513
- const content = await readFile7(filePath, "utf-8");
1782
+ const filePath = path14.join(conceptsDir, file);
1783
+ const content = await readFile11(filePath, "utf-8");
1514
1784
  const { meta } = parseFrontmatter(content);
1515
1785
  if (meta.title && typeof meta.title === "string" && !meta.orphaned) {
1516
1786
  pages.push({
@@ -1596,7 +1866,7 @@ async function resolveInboundLinks(titleIndex, newSlugs) {
1596
1866
  let count = 0;
1597
1867
  for (const page of titleIndex) {
1598
1868
  if (newSlugs.includes(page.slug)) continue;
1599
- const content = await readFile7(page.filePath, "utf-8");
1869
+ const content = await readFile11(page.filePath, "utf-8");
1600
1870
  const { body } = parseFrontmatter(content);
1601
1871
  const linked = addWikilinks(body, newTitles, page.title);
1602
1872
  if (linked !== body) {
@@ -1608,7 +1878,7 @@ async function resolveInboundLinks(titleIndex, newSlugs) {
1608
1878
  return count;
1609
1879
  }
1610
1880
  async function linkPage(page, titleIndex) {
1611
- const content = await readFile7(page.filePath, "utf-8");
1881
+ const content = await readFile11(page.filePath, "utf-8");
1612
1882
  const { body } = parseFrontmatter(content);
1613
1883
  const linked = addWikilinks(body, titleIndex, page.title);
1614
1884
  if (linked === body) return false;
@@ -1619,17 +1889,17 @@ async function linkPage(page, titleIndex) {
1619
1889
 
1620
1890
  // src/compiler/indexgen.ts
1621
1891
  import { readdir as readdir3 } from "fs/promises";
1622
- import path12 from "path";
1892
+ import path15 from "path";
1623
1893
  async function generateIndex(root) {
1624
1894
  status("*", info("Generating index..."));
1625
- const conceptsPath = path12.join(root, CONCEPTS_DIR);
1626
- const queriesPath = path12.join(root, QUERIES_DIR);
1895
+ const conceptsPath = path15.join(root, CONCEPTS_DIR);
1896
+ const queriesPath = path15.join(root, QUERIES_DIR);
1627
1897
  const concepts = await collectPageSummaries(conceptsPath);
1628
1898
  const queries = await collectPageSummaries(queriesPath);
1629
1899
  concepts.sort((a, b) => a.title.localeCompare(b.title));
1630
1900
  queries.sort((a, b) => a.title.localeCompare(b.title));
1631
1901
  const indexContent = buildIndexContent(concepts, queries);
1632
- const indexPath = path12.join(root, INDEX_FILE);
1902
+ const indexPath = path15.join(root, INDEX_FILE);
1633
1903
  await atomicWrite(indexPath, indexContent);
1634
1904
  const total = concepts.length + queries.length;
1635
1905
  status("+", success(`Index updated with ${total} pages.`));
@@ -1643,7 +1913,7 @@ async function scanWikiPages(dirPath) {
1643
1913
  }
1644
1914
  const scanned = [];
1645
1915
  for (const file of files.filter((f) => f.endsWith(".md"))) {
1646
- const content = await safeReadFile(path12.join(dirPath, file));
1916
+ const content = await safeReadFile(path15.join(dirPath, file));
1647
1917
  const { meta } = parseFrontmatter(content);
1648
1918
  scanned.push({ slug: file.replace(/\.md$/, ""), meta });
1649
1919
  }
@@ -1680,7 +1950,7 @@ function buildIndexContent(concepts, queries) {
1680
1950
 
1681
1951
  // src/compiler/obsidian.ts
1682
1952
  import { readdir as readdir4 } from "fs/promises";
1683
- import path13 from "path";
1953
+ import path16 from "path";
1684
1954
  var ABBREVIATION_MIN_WORDS = 3;
1685
1955
  var SWAP_CONJUNCTIONS = [" and ", " or "];
1686
1956
  function addObsidianMeta(frontmatter, conceptTitle, tags) {
@@ -1722,11 +1992,11 @@ function generateAbbreviation(title) {
1722
1992
  return abbreviation;
1723
1993
  }
1724
1994
  async function generateMOC(root) {
1725
- const conceptsPath = path13.join(root, CONCEPTS_DIR);
1995
+ const conceptsPath = path16.join(root, CONCEPTS_DIR);
1726
1996
  const pages = await loadConceptPages(conceptsPath);
1727
1997
  const tagGroups = groupPagesByTag(pages);
1728
1998
  const content = buildMOCContent(tagGroups);
1729
- await atomicWrite(path13.join(root, MOC_FILE), content);
1999
+ await atomicWrite(path16.join(root, MOC_FILE), content);
1730
2000
  }
1731
2001
  async function loadConceptPages(conceptsPath) {
1732
2002
  let files;
@@ -1738,7 +2008,7 @@ async function loadConceptPages(conceptsPath) {
1738
2008
  const pages = [];
1739
2009
  for (const file of files) {
1740
2010
  if (!file.endsWith(".md")) continue;
1741
- const content = await safeReadFile(path13.join(conceptsPath, file));
2011
+ const content = await safeReadFile(path16.join(conceptsPath, file));
1742
2012
  if (!content) continue;
1743
2013
  const { meta } = parseFrontmatter(content);
1744
2014
  if (meta.orphaned) continue;
@@ -1789,9 +2059,143 @@ function buildMOCContent(tagGroups) {
1789
2059
  }
1790
2060
 
1791
2061
  // src/utils/embeddings.ts
1792
- import { readFile as readFile8, readdir as readdir5 } from "fs/promises";
2062
+ import { readFile as readFile12, readdir as readdir5 } from "fs/promises";
1793
2063
  import { existsSync as existsSync4 } from "fs";
1794
- import path14 from "path";
2064
+ import path17 from "path";
2065
+
2066
+ // src/utils/retrieval.ts
2067
+ import { createHash as createHash2 } from "crypto";
2068
/**
 * Derive a short fingerprint for a chunk of text.
 *
 * @param {string} text - Chunk content to fingerprint.
 * @returns {string} The first 16 hex characters of the SHA-256 digest of the
 *   UTF-8 encoded text.
 */
function hashChunkText(text) {
  const hasher = createHash2("sha256");
  hasher.update(text, "utf8");
  const hexDigest = hasher.digest("hex");
  return hexDigest.slice(0, 16);
}
2071
/**
 * Split a page body into retrieval chunks.
 *
 * Paragraphs are extracted, oversized ones are broken into smaller pieces,
 * and the pieces are packed into chunks via appendParagraph. A short final
 * chunk may be merged into its predecessor by mergeTrailingFragment.
 *
 * @param {string} body - Page body text.
 * @returns {string[]} Chunk texts in document order; empty when the body has
 *   no paragraphs.
 */
function splitIntoChunks(body) {
  const paragraphs = extractParagraphs(body);
  if (paragraphs.length === 0) return [];
  const pieces = paragraphs.flatMap((paragraph) => splitOversizedParagraph(paragraph));
  const chunks = [];
  let pending = "";
  for (const piece of pieces) {
    // appendParagraph pushes completed chunks and hands back the new buffer.
    pending = appendParagraph(pending, piece, chunks);
  }
  if (pending.length > 0) {
    chunks.push(pending);
  }
  return mergeTrailingFragment(chunks);
}
2084
/**
 * Add one paragraph to the running chunk buffer.
 *
 * @param {string} buffer - Text accumulated for the current chunk.
 * @param {string} paragraph - Next paragraph to place.
 * @param {string[]} chunks - Output list; completed chunks are pushed onto it.
 * @returns {string} The buffer to carry into the next call.
 */
function appendParagraph(buffer, paragraph, chunks) {
  const joined = buffer ? buffer + "\n\n" + paragraph : paragraph;
  if (joined.length <= CHUNK_TARGET_CHARS) {
    return joined;
  }
  if (buffer.length === 0) {
    // A lone paragraph above the target size becomes a chunk of its own.
    chunks.push(joined);
    return "";
  }
  // Close the current chunk and start the next one with this paragraph.
  chunks.push(buffer);
  return paragraph;
}
2096
/**
 * Fold a too-short final chunk into the chunk before it.
 *
 * The merge happens only when the last chunk is shorter than CHUNK_MIN_CHARS
 * and the combined text (including the two-character separator) does not
 * exceed CHUNK_MAX_CHARS.
 *
 * @param {string[]} chunks - Chunk texts in order.
 * @returns {string[]} The same array when nothing is merged, otherwise a new
 *   array whose last element is the merged text.
 */
function mergeTrailingFragment(chunks) {
  const count = chunks.length;
  if (count < 2) return chunks;
  const tail = chunks[count - 1];
  if (tail.length >= CHUNK_MIN_CHARS) return chunks;
  const beforeTail = chunks[count - 2];
  const mergedLength = beforeTail.length + tail.length + 2;
  if (mergedLength > CHUNK_MAX_CHARS) return chunks;
  return [...chunks.slice(0, count - 2), beforeTail + "\n\n" + tail];
}
2108
/**
 * Split a body into trimmed, non-empty paragraphs.
 *
 * A paragraph break is two or more consecutive line breaks. Both LF and CRLF
 * line endings are recognised, and lines containing only spaces or tabs count
 * as blank, so Windows-authored or hand-edited pages are still split.
 *
 * @param {string} body - Raw page body.
 * @returns {string[]} Paragraph texts in document order.
 */
function extractParagraphs(body) {
  return body
    .split(/(?:\r?\n[ \t]*){2,}/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);
}
2111
/**
 * Break a paragraph longer than CHUNK_MAX_CHARS into smaller pieces.
 *
 * Sentences (split after ".", "!" or "?" followed by whitespace) are grouped
 * greedily up to CHUNK_MAX_CHARS. Each group is then passed through hardCut.
 *
 * @param {string} paragraph - Paragraph text.
 * @returns {string[]} The paragraph itself when it already fits, otherwise its
 *   pieces in order.
 */
function splitOversizedParagraph(paragraph) {
  if (paragraph.length <= CHUNK_MAX_CHARS) return [paragraph];
  const groups = [];
  let current = "";
  paragraph.split(/(?<=[.!?])\s+/).forEach((sentence) => {
    const wouldOverflow = (current + " " + sentence).length > CHUNK_MAX_CHARS;
    if (wouldOverflow && current.length > 0) {
      groups.push(current.trim());
      current = sentence;
      return;
    }
    current = current ? current + " " + sentence : sentence;
  });
  if (current.length > 0) {
    groups.push(current.trim());
  }
  return groups.flatMap(hardCut);
}
2127
/**
 * Cut text into consecutive slices of at most CHUNK_MAX_CHARS UTF-16 units.
 *
 * A cut is never placed between the two halves of a surrogate pair: when the
 * boundary would separate a high surrogate from its low surrogate, the slice
 * ends one unit earlier so the whole code point moves to the next piece.
 *
 * @param {string} text - Text to cut.
 * @returns {string[]} Slices in order; concatenating them yields `text`.
 */
function hardCut(text) {
  if (text.length <= CHUNK_MAX_CHARS) return [text];
  const pieces = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + CHUNK_MAX_CHARS, text.length);
    // Only back off when the slice stays non-empty, so the loop always advances.
    if (end < text.length && end - start > 1) {
      const before = text.charCodeAt(end - 1);
      const after = text.charCodeAt(end);
      const endsOnHighSurrogate = before >= 0xd800 && before <= 0xdbff;
      const nextIsLowSurrogate = after >= 0xdc00 && after <= 0xdfff;
      if (endsOnHighSurrogate && nextIsLowSurrogate) end -= 1;
    }
    pieces.push(text.slice(start, end));
    start = end;
  }
  return pieces;
}
2135
/**
 * Re-rank retrieval candidates by combining a lexical BM25 score with each
 * candidate's existing base score.
 *
 * The result is always ordered by descending score. When the query yields no
 * tokens, candidates are ranked by their base score alone, so this path and
 * the lexical path return results in the same kind of order.
 *
 * @param {string} query - User query text.
 * @param {Array<{text: string, baseScore: number}>} candidates - Candidates
 *   to rank.
 * @returns {Array<{candidate: object, score: number}>} Scored candidates,
 *   highest score first.
 */
function rerankWithBm25(query, candidates) {
  if (candidates.length === 0) return [];
  const queryTerms = tokenize(query);
  if (queryTerms.length === 0) {
    // No lexical signal: fall back to base scores, still returned ranked.
    return candidates
      .map((candidate) => ({ candidate, score: candidate.baseScore }))
      .sort((a, b) => b.score - a.score);
  }
  const docs = candidates.map((c) => tokenize(c.text));
  const stats = buildCorpusStats(docs);
  return rankByBm25Score(candidates, docs, queryTerms, stats);
}
2145
/**
 * Score every candidate and return them ordered by descending score.
 *
 * Each score is the candidate's BM25 score against the query terms plus its
 * base score scaled by BASE_SCORE_WEIGHT.
 *
 * @param {Array<{baseScore: number}>} candidates - Candidates to score.
 * @param {string[][]} docs - Token lists, index-aligned with `candidates`.
 * @param {string[]} queryTerms - Tokenized query.
 * @param {object} stats - Corpus statistics passed through to bm25Score.
 * @returns {Array<{candidate: object, score: number}>} Scored candidates,
 *   highest score first.
 */
function rankByBm25Score(candidates, docs, queryTerms, stats) {
  const ranked = candidates.map((candidate, position) => {
    const lexicalScore = bm25Score(queryTerms, docs[position], stats);
    const score = lexicalScore + candidate.baseScore * BASE_SCORE_WEIGHT;
    return { candidate, score };
  });
  return ranked.sort((left, right) => right.score - left.score);
}
2153
/**
 * Lower-case text and split it into word tokens.
 *
 * A token is a maximal run of Unicode letters, combining marks and numbers,
 * so accented and non-Latin words are kept whole instead of being truncated
 * or dropped. Punctuation, whitespace and underscores separate tokens.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance; empty when there are none.
 */
function tokenize(text) {
  return text.toLowerCase().match(/[\p{L}\p{M}\p{N}]+/gu) ?? [];
}
2156
/**
 * Collect the corpus-level statistics used for BM25 scoring.
 *
 * @param {string[][]} docs - One token list per document.
 * @returns {{docFreq: Map<string, number>, avgDocLen: number, totalDocs: number}}
 *   `docFreq` maps each term to the number of documents containing it,
 *   `avgDocLen` is the mean token count (0 for an empty corpus), and
 *   `totalDocs` is the number of documents.
 */
function buildCorpusStats(docs) {
  const docFreq = new Map();
  let tokenTotal = 0;
  docs.forEach((tokens) => {
    tokenTotal += tokens.length;
    // Count each term once per document, however often it repeats there.
    new Set(tokens).forEach((term) => {
      docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
    });
  });
  const totalDocs = docs.length;
  const avgDocLen = totalDocs > 0 ? tokenTotal / totalDocs : 0;
  return { docFreq, avgDocLen, totalDocs };
}
2168
// Term-frequency saturation parameter used in bm25Score.
var BM25_K1 = 1.5;
// Document-length normalisation parameter used in bm25Score.
var BM25_B = 0.75;
// Multiplier applied to a candidate's base score in rankByBm25Score.
var BASE_SCORE_WEIGHT = 0.5;
2171
/**
 * Compute the BM25 score of one document against the query terms.
 *
 * @param {string[]} queryTerms - Tokenized query; a repeated term contributes
 *   once per occurrence.
 * @param {string[]} docTokens - Tokens of the document being scored.
 * @param {{docFreq: Map<string, number>, avgDocLen: number, totalDocs: number}} stats
 *   Corpus statistics.
 * @returns {number} The summed per-term score; 0 for an empty document or an
 *   empty corpus.
 */
function bm25Score(queryTerms, docTokens, stats) {
  if (docTokens.length === 0 || stats.totalDocs === 0) return 0;
  const termFreq = countTerms(docTokens);
  // A zero average length is replaced with 1 to avoid dividing by zero.
  const lengthRatio = docTokens.length / (stats.avgDocLen || 1);
  const lengthNorm = BM25_K1 * (1 - BM25_B + BM25_B * lengthRatio);
  return queryTerms.reduce((sum, term) => {
    const tf = termFreq.get(term) ?? 0;
    if (tf === 0) return sum;
    const idf = idfWeight(stats.docFreq.get(term) ?? 0, stats.totalDocs);
    const saturation = (tf * (BM25_K1 + 1)) / (tf + lengthNorm);
    return sum + idf * saturation;
  }, 0);
}
2186
/**
 * Inverse document frequency weight for a term.
 *
 * Computed as ln(1 + (N - df + 0.5) / (df + 0.5)).
 *
 * @param {number} docFrequency - Number of documents containing the term (df).
 * @param {number} totalDocs - Number of documents in the corpus (N).
 * @returns {number} The IDF weight.
 */
function idfWeight(docFrequency, totalDocs) {
  const docsWithoutTerm = totalDocs - docFrequency + 0.5;
  const docsWithTerm = docFrequency + 0.5;
  const ratio = docsWithoutTerm / docsWithTerm;
  return Math.log(1 + ratio);
}
2191
/**
 * Count how many times each token occurs.
 *
 * @param {string[]} tokens - Tokens to count.
 * @returns {Map<string, number>} Occurrence count per distinct token.
 */
function countTerms(tokens) {
  return tokens.reduce((counts, token) => {
    const seen = counts.get(token) ?? 0;
    return counts.set(token, seen + 1);
  }, new Map());
}
2195
+ }
2196
+
2197
+ // src/utils/embeddings.ts
2198
+ var STORE_VERSION = 2;
1795
2199
  function cosineSimilarity(a, b) {
1796
2200
  if (a.length !== b.length || a.length === 0) return 0;
1797
2201
  let dot = 0;
@@ -1813,24 +2217,27 @@ function findTopK(queryVec, store, k) {
1813
2217
  scored.sort((left, right) => right.score - left.score);
1814
2218
  return scored.slice(0, k).map((item) => item.entry);
1815
2219
  }
2220
/**
 * Rank chunks by cosine similarity to the query vector and keep the best k.
 *
 * @param {number[]} queryVec - Query embedding.
 * @param {Array<{vector: number[]}>} chunks - Stored chunks with embeddings.
 * @param {number} k - Maximum number of results.
 * @returns {Array<{chunk: object, score: number}>} Up to k matches, highest
 *   similarity first.
 */
function findTopKChunks(queryVec, chunks, k) {
  const ranked = chunks.map((chunk) => {
    const score = cosineSimilarity(queryVec, chunk.vector);
    return { chunk, score };
  });
  ranked.sort((a, b) => b.score - a.score);
  return ranked.slice(0, k);
}
1816
2228
  async function readEmbeddingStore(root) {
1817
- const filePath = path14.join(root, EMBEDDINGS_FILE);
2229
+ const filePath = path17.join(root, EMBEDDINGS_FILE);
1818
2230
  if (!existsSync4(filePath)) return null;
1819
- const raw = await readFile8(filePath, "utf-8");
2231
+ const raw = await readFile12(filePath, "utf-8");
1820
2232
  return JSON.parse(raw);
1821
2233
  }
1822
2234
  async function writeEmbeddingStore(root, store) {
1823
- const filePath = path14.join(root, EMBEDDINGS_FILE);
2235
+ const filePath = path17.join(root, EMBEDDINGS_FILE);
1824
2236
  await atomicWrite(filePath, JSON.stringify(store, null, 2));
1825
2237
  }
1826
2238
  async function findRelevantPages(root, question) {
1827
- const store = await readEmbeddingStore(root);
1828
- if (!store || store.entries.length === 0) return [];
1829
- const activeModel = resolveEmbeddingModel();
1830
- if (store.model !== activeModel) {
1831
- warnStaleEmbeddingStore(store.model, activeModel);
1832
- return [];
1833
- }
2239
+ const store = await loadActiveStore(root, (s) => s.entries.length > 0);
2240
+ if (!store) return [];
1834
2241
  const queryVec = await getProvider().embed(question);
1835
2242
  return findTopK(queryVec, store, EMBEDDING_TOP_K).map((entry) => ({
1836
2243
  slug: entry.slug,
@@ -1838,10 +2245,26 @@ async function findRelevantPages(root, question) {
1838
2245
  summary: entry.summary
1839
2246
  }));
1840
2247
  }
2248
/**
 * Find the stored chunks most similar to a question.
 *
 * @param {string} root - Project root passed to the embedding store loader.
 * @param {string} question - Question text to embed with the active provider.
 * @param {number} k - Maximum number of chunks to return.
 * @returns {Promise<Array<{chunk: object, score: number}>>} Top matches, or an
 *   empty array when no usable store with chunks is available.
 */
async function findRelevantChunks(root, question, k) {
  const hasChunks = (candidate) => Boolean(candidate.chunks && candidate.chunks.length > 0);
  const store = await loadActiveStore(root, hasChunks);
  if (!store) {
    return [];
  }
  const queryVec = await getProvider().embed(question);
  const chunks = store.chunks ?? [];
  return findTopKChunks(queryVec, chunks, k);
}
2254
/**
 * Load the embedding store if it is usable for the current embedding model.
 *
 * @param {string} root - Project root passed to readEmbeddingStore.
 * @param {(store: object) => boolean} hasContent - Predicate deciding whether
 *   the store holds the data the caller needs.
 * @returns {Promise<object|null>} The store, or null when it is missing, fails
 *   the predicate, or was built with a different model (a stale-store warning
 *   is emitted in that last case).
 */
async function loadActiveStore(root, hasContent) {
  const store = await readEmbeddingStore(root);
  if (store && hasContent(store)) {
    const activeModel = resolveEmbeddingModel();
    if (store.model === activeModel) {
      return store;
    }
    warnStaleEmbeddingStore(store.model, activeModel);
  }
  return null;
}
1841
2264
  async function collectPageRecords(root) {
1842
2265
  const records = [];
1843
2266
  for (const dir of [CONCEPTS_DIR, QUERIES_DIR]) {
1844
- const absDir = path14.join(root, dir);
2267
+ const absDir = path17.join(root, dir);
1845
2268
  let files;
1846
2269
  try {
1847
2270
  files = await readdir5(absDir);
@@ -1849,18 +2272,23 @@ async function collectPageRecords(root) {
1849
2272
  continue;
1850
2273
  }
1851
2274
  for (const file of files.filter((f) => f.endsWith(".md"))) {
1852
- const content = await safeReadFile(path14.join(absDir, file));
1853
- const { meta } = parseFrontmatter(content);
1854
- if (meta.orphaned || typeof meta.title !== "string") continue;
1855
- records.push({
1856
- slug: file.replace(/\.md$/, ""),
1857
- title: meta.title,
1858
- summary: typeof meta.summary === "string" ? meta.summary : ""
1859
- });
2275
+ const record = await readPageRecord(absDir, file);
2276
+ if (record) records.push(record);
1860
2277
  }
1861
2278
  }
1862
2279
  return records;
1863
2280
  }
2281
/**
 * Read one wiki page and turn it into an embedding record.
 *
 * @param {string} absDir - Directory containing the page.
 * @param {string} file - Markdown file name; the slug is the name without
 *   its ".md" suffix.
 * @returns {Promise<{slug: string, title: string, summary: string, body: string}|null>}
 *   The record, or null when the page is marked orphaned or has no string
 *   title. A non-string summary is replaced with "".
 */
async function readPageRecord(absDir, file) {
  const pagePath = path17.join(absDir, file);
  const raw = await safeReadFile(pagePath);
  const parsed = parseFrontmatter(raw);
  const meta = parsed.meta;
  if (meta.orphaned || typeof meta.title !== "string") {
    return null;
  }
  const summary = typeof meta.summary === "string" ? meta.summary : "";
  return {
    slug: file.replace(/\.md$/, ""),
    title: meta.title,
    summary,
    body: parsed.body
  };
}
1864
2292
  function buildEmbeddingText(record) {
1865
2293
  return record.summary ? `${record.title}
1866
2294
 
@@ -1913,6 +2341,56 @@ function mergeEntries(existing, fresh, liveSlugs) {
1913
2341
  }
1914
2342
  return Array.from(bySlug.values());
1915
2343
  }
2344
/**
 * Rebuild the chunk embedding list for all live pages.
 *
 * Existing chunks belonging to pages that are no longer present are dropped
 * before lookup; remaining ones may be reused by embedRecordChunks.
 *
 * @param {Array<{slug: string}>} records - Live page records.
 * @param {Array<{slug: string, chunkIndex: number}>} existing - Previously
 *   stored chunks.
 * @param {boolean} forceAll - Passed through to embedRecordChunks.
 * @returns {Promise<object[]>} Chunk entries for every record, in record order.
 */
async function refreshChunkEmbeddings(records, existing, forceAll) {
  const liveSlugs = new Set(records.map((r) => r.slug));
  const survivors = existing.filter((c) => liveSlugs.has(c.slug));
  const existingByKey = indexChunksByKey(survivors);
  // One timestamp is shared by every chunk embedded during this refresh.
  const now = new Date().toISOString();
  const fresh = [];
  for (const record of records) {
    const pageChunks = await embedRecordChunks(record, existingByKey, forceAll, now);
    for (const pageChunk of pageChunks) {
      fresh.push(pageChunk);
    }
  }
  return fresh;
}
2355
/**
 * Produce the chunk entries for one page, embedding only what cannot be reused.
 *
 * @param {{slug: string, title: string, body: string}} record - Page record.
 * @param {Map<string, object>} existingByKey - Stored chunks keyed by chunkKey.
 * @param {boolean} forceAll - When true, no stored chunk is reused.
 * @param {string} now - Timestamp written to newly embedded chunks.
 * @returns {Promise<object[]>} Chunk entries in chunk order. Reused entries
 *   keep their stored fields but take the record's current title.
 */
async function embedRecordChunks(record, existingByKey, forceAll, now) {
  const provider = getProvider();
  const { slug, title } = record;
  const out = [];
  for (const [chunkIndex, text] of splitIntoChunks(record.body).entries()) {
    const contentHash = hashChunkText(text);
    const reused = pickReusableChunk(existingByKey, slug, chunkIndex, contentHash, forceAll);
    if (reused) {
      out.push({ ...reused, title });
    } else {
      const vector = await provider.embed(text);
      out.push({ slug, title, chunkIndex, contentHash, text, vector, updatedAt: now });
    }
  }
  return out;
}
2380
/**
 * Index chunks by their slug/chunk-index key.
 *
 * @param {Array<{slug: string, chunkIndex: number}>} chunks - Chunks to index.
 * @returns {Map<string, object>} Map from chunkKey to chunk; when two chunks
 *   share a key the later one wins.
 */
function indexChunksByKey(chunks) {
  const pairs = chunks.map((chunk) => [chunkKey(chunk.slug, chunk.chunkIndex), chunk]);
  return new Map(pairs);
}
2385
/**
 * Build the lookup key for a chunk.
 *
 * @param {string} slug - Page slug.
 * @param {number} chunkIndex - Position of the chunk within the page.
 * @returns {string} The slug and index joined by "#".
 */
function chunkKey(slug, chunkIndex) {
  return "".concat(slug, "#", chunkIndex);
}
2388
/**
 * Look up a stored chunk that can stand in for a freshly split one.
 *
 * @param {Map<string, {contentHash: string}>} byKey - Stored chunks by key.
 * @param {string} slug - Page slug.
 * @param {number} chunkIndex - Chunk position within the page.
 * @param {string} contentHash - Hash of the current chunk text.
 * @param {boolean} forceAll - When true, reuse is disabled.
 * @returns {object|null} The stored chunk when one exists at this key with
 *   the same content hash; otherwise null.
 */
function pickReusableChunk(byKey, slug, chunkIndex, contentHash, forceAll) {
  if (forceAll) {
    return null;
  }
  const cached = byKey.get(chunkKey(slug, chunkIndex));
  const isUnchanged = Boolean(cached) && cached.contentHash === contentHash;
  return isUnchanged ? cached : null;
}
1916
2394
  async function updateEmbeddings(root, changedSlugs) {
1917
2395
  const records = await collectPageRecords(root);
1918
2396
  const liveSlugs = new Set(records.map((r) => r.slug));
@@ -1921,29 +2399,51 @@ async function updateEmbeddings(root, changedSlugs) {
1921
2399
  const modelChanged = Boolean(existingStore && existingStore.model !== embeddingModel);
1922
2400
  const toEmbed = new Set(changedSlugs.filter((slug) => liveSlugs.has(slug)));
1923
2401
  const previousEntries = modelChanged ? [] : existingStore?.entries ?? [];
1924
- if (!existingStore || modelChanged) {
2402
+ const previousChunks = modelChanged ? [] : existingStore?.chunks ?? [];
2403
+ const isEmptyStore = isStoreEmpty(existingStore);
2404
+ if (!existingStore || modelChanged || isEmptyStore && liveSlugs.size > 0) {
1925
2405
  for (const record of records) toEmbed.add(record.slug);
1926
2406
  }
1927
- if (!modelChanged && toEmbed.size === 0 && previousEntries.every((e) => liveSlugs.has(e.slug))) {
2407
+ if (!shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs)) {
1928
2408
  return;
1929
2409
  }
1930
2410
  const freshEntries = await embedPages(records, toEmbed);
1931
2411
  const mergedEntries = mergeEntries(previousEntries, freshEntries, liveSlugs);
1932
- const dimensions = mergedEntries[0]?.vector.length ?? 0;
2412
+ const mergedChunks = await refreshChunkEmbeddings(records, previousChunks, modelChanged);
2413
+ await persistRefreshedStore(root, embeddingModel, mergedEntries, mergedChunks);
2414
+ }
2415
+ async function persistRefreshedStore(root, embeddingModel, entries, chunks) {
2416
+ const dimensions = entries[0]?.vector.length ?? chunks[0]?.vector.length ?? 0;
1933
2417
  const store = {
1934
- version: 1,
2418
+ version: STORE_VERSION,
1935
2419
  model: embeddingModel,
1936
2420
  dimensions,
1937
- entries: mergedEntries
2421
+ entries,
2422
+ chunks
1938
2423
  };
1939
2424
  await writeEmbeddingStore(root, store);
1940
- status("*", dim(`Embeddings updated (${mergedEntries.length} pages).`));
2425
+ status(
2426
+ "*",
2427
+ dim(`Embeddings updated (${entries.length} pages, ${chunks.length} chunks).`)
2428
+ );
2429
+ }
2430
/**
 * Tell whether an existing store holds no embeddings at all.
 *
 * @param {{entries: object[], chunks?: object[]}|null|undefined} store - Store
 *   to inspect.
 * @returns {boolean} False when there is no store; otherwise true only when
 *   it has no entries and no chunks.
 */
function isStoreEmpty(store) {
  if (!store) return false;
  if (store.entries.length !== 0) return false;
  const chunks = store.chunks;
  return !chunks || chunks.length === 0;
}
2434
/**
 * Decide whether the embedding store needs to be rebuilt and rewritten.
 *
 * @param {boolean} modelChanged - Whether the embedding model differs from
 *   the one the store was built with.
 * @param {Set<string>} toEmbed - Slugs queued for (re-)embedding.
 * @param {Array<{slug: string}>} previousEntries - Stored page entries.
 * @param {Array<{slug: string}>} previousChunks - Stored chunk entries.
 * @param {Set<string>} liveSlugs - Slugs of pages that currently exist.
 * @returns {boolean} True when the model changed, something is queued, a
 *   stored entry or chunk refers to a page that no longer exists, or page
 *   entries exist without any chunks while pages are present.
 */
function shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs) {
  const isGone = (item) => !liveSlugs.has(item.slug);
  return Boolean(
    modelChanged ||
      toEmbed.size > 0 ||
      previousEntries.some(isGone) ||
      previousChunks.some(isGone) ||
      // Page vectors without any chunk vectors: the chunk index still has to be built.
      (previousEntries.length > 0 && previousChunks.length === 0 && liveSlugs.size > 0)
  );
}
1942
2442
 
1943
2443
  // src/compiler/candidates.ts
1944
2444
  import { readdir as readdir6, rename as rename3, unlink as unlink2, writeFile as writeFile4, mkdir as mkdir5 } from "fs/promises";
1945
2445
  import { existsSync as existsSync5 } from "fs";
1946
- import path15 from "path";
2446
+ import path18 from "path";
1947
2447
  import { randomBytes } from "crypto";
1948
2448
  var ID_SUFFIX_BYTES = 4;
1949
2449
  var CANDIDATE_EXT = ".json";
@@ -1952,10 +2452,10 @@ function buildCandidateId(slug) {
1952
2452
  return `${slug}-${suffix}`;
1953
2453
  }
1954
2454
  function candidatePath(root, id) {
1955
- return path15.join(root, CANDIDATES_DIR, `${id}${CANDIDATE_EXT}`);
2455
+ return path18.join(root, CANDIDATES_DIR, `${id}${CANDIDATE_EXT}`);
1956
2456
  }
1957
2457
  function archivePath(root, id) {
1958
- return path15.join(root, CANDIDATES_ARCHIVE_DIR, `${id}${CANDIDATE_EXT}`);
2458
+ return path18.join(root, CANDIDATES_ARCHIVE_DIR, `${id}${CANDIDATE_EXT}`);
1959
2459
  }
1960
2460
  async function writeCandidate(root, draft) {
1961
2461
  const candidate = {
@@ -2006,7 +2506,7 @@ function isValidCandidate(value) {
2006
2506
  return typeof candidate.id === "string" && typeof candidate.title === "string" && typeof candidate.slug === "string" && typeof candidate.body === "string" && Array.isArray(candidate.sources);
2007
2507
  }
2008
2508
  async function listCandidates(root) {
2009
- const dir = path15.join(root, CANDIDATES_DIR);
2509
+ const dir = path18.join(root, CANDIDATES_DIR);
2010
2510
  if (!existsSync5(dir)) return [];
2011
2511
  const entries = await readdir6(dir, { withFileTypes: true });
2012
2512
  const candidates = [];
@@ -2033,7 +2533,7 @@ async function archiveCandidate(root, id) {
2033
2533
  const sourcePath = candidatePath(root, id);
2034
2534
  if (!existsSync5(sourcePath)) return false;
2035
2535
  const target = archivePath(root, id);
2036
- await mkdir5(path15.dirname(target), { recursive: true });
2536
+ await mkdir5(path18.dirname(target), { recursive: true });
2037
2537
  try {
2038
2538
  await rename3(sourcePath, target);
2039
2539
  } catch {
@@ -2045,9 +2545,9 @@ async function archiveCandidate(root, id) {
2045
2545
  }
2046
2546
 
2047
2547
  // src/linter/rules.ts
2048
- import { readdir as readdir7, readFile as readFile9 } from "fs/promises";
2548
+ import { readdir as readdir7, readFile as readFile13 } from "fs/promises";
2049
2549
  import { existsSync as existsSync6 } from "fs";
2050
- import path16 from "path";
2550
+ import path19 from "path";
2051
2551
  var MIN_BODY_LENGTH = 50;
2052
2552
  var WIKILINK_PATTERN2 = /\[\[([^\]]+)\]\]/g;
2053
2553
  var CITATION_PATTERN = /\^\[([^\]]+)\]/g;
@@ -2068,22 +2568,22 @@ async function readMarkdownFiles(dirPath) {
2068
2568
  const mdFiles = entries.filter((f) => f.endsWith(".md"));
2069
2569
  const results = await Promise.all(
2070
2570
  mdFiles.map(async (fileName) => {
2071
- const filePath = path16.join(dirPath, fileName);
2072
- const content = await readFile9(filePath, "utf-8");
2571
+ const filePath = path19.join(dirPath, fileName);
2572
+ const content = await readFile13(filePath, "utf-8");
2073
2573
  return { filePath, content };
2074
2574
  })
2075
2575
  );
2076
2576
  return results;
2077
2577
  }
2078
2578
  async function collectAllPages(root) {
2079
- const conceptPages = await readMarkdownFiles(path16.join(root, CONCEPTS_DIR));
2080
- const queryPages = await readMarkdownFiles(path16.join(root, QUERIES_DIR));
2579
+ const conceptPages = await readMarkdownFiles(path19.join(root, CONCEPTS_DIR));
2580
+ const queryPages = await readMarkdownFiles(path19.join(root, QUERIES_DIR));
2081
2581
  return [...conceptPages, ...queryPages];
2082
2582
  }
2083
2583
  function buildPageSlugSet(pages) {
2084
2584
  const slugs = /* @__PURE__ */ new Set();
2085
2585
  for (const page of pages) {
2086
- const baseName = path16.basename(page.filePath, ".md");
2586
+ const baseName = path19.basename(page.filePath, ".md");
2087
2587
  slugs.add(baseName.toLowerCase());
2088
2588
  }
2089
2589
  return slugs;
@@ -2318,7 +2818,7 @@ function countLines(content) {
2318
2818
  }
2319
2819
  async function checkBrokenCitations(root) {
2320
2820
  const pages = await collectAllPages(root);
2321
- const sourcesDir = path16.join(root, SOURCES_DIR);
2821
+ const sourcesDir = path19.join(root, SOURCES_DIR);
2322
2822
  const results = [];
2323
2823
  const lineCountCache = /* @__PURE__ */ new Map();
2324
2824
  for (const page of pages) {
@@ -2333,7 +2833,7 @@ async function collectBrokenForMarker(captured, line, pageFile, sourcesDir, line
2333
2833
  const trimmed = part.trim();
2334
2834
  if (trimmed.length === 0) continue;
2335
2835
  const filename = stripSpanSuffix(trimmed);
2336
- const citedPath = path16.join(sourcesDir, filename);
2836
+ const citedPath = path19.join(sourcesDir, filename);
2337
2837
  if (!existsSync6(citedPath)) {
2338
2838
  out.push({
2339
2839
  rule: "broken-citation",
@@ -2387,7 +2887,7 @@ async function checkMalformedClaimCitations(root) {
2387
2887
 
2388
2888
  // src/compiler/page-renderer.ts
2389
2889
  import { readdir as readdir8 } from "fs/promises";
2390
- import path17 from "path";
2890
+ import path20 from "path";
2391
2891
 
2392
2892
  // src/compiler/provenance.ts
2393
2893
  function addProvenanceMeta(fields, concept) {
@@ -2417,7 +2917,7 @@ function reportContradictionWarnings(conceptTitle, concept) {
2417
2917
  // src/compiler/page-renderer.ts
2418
2918
  var RELATED_PAGE_CONTEXT_LIMIT = 5;
2419
2919
  async function renderMergedPageContent(root, entry, schema) {
2420
- const pagePath = path17.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
2920
+ const pagePath = path20.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
2421
2921
  const existingPage = await safeReadFile(pagePath);
2422
2922
  const relatedPages = await loadRelatedPages(root, entry.slug);
2423
2923
  const system = buildPagePrompt(
@@ -2456,7 +2956,7 @@ function buildMergedFrontmatter(entry, existingPage, schema) {
2456
2956
  return buildFrontmatter(frontmatterFields);
2457
2957
  }
2458
2958
  async function loadRelatedPages(root, excludeSlug) {
2459
- const conceptsPath = path17.join(root, CONCEPTS_DIR);
2959
+ const conceptsPath = path20.join(root, CONCEPTS_DIR);
2460
2960
  let files;
2461
2961
  try {
2462
2962
  files = await readdir8(conceptsPath);
@@ -2466,7 +2966,7 @@ async function loadRelatedPages(root, excludeSlug) {
2466
2966
  const related = files.filter((f) => f.endsWith(".md") && f !== `${excludeSlug}.md`).slice(0, RELATED_PAGE_CONTEXT_LIMIT);
2467
2967
  const contents = [];
2468
2968
  for (const f of related) {
2469
- const content = await safeReadFile(path17.join(conceptsPath, f));
2969
+ const content = await safeReadFile(path20.join(conceptsPath, f));
2470
2970
  if (!content) continue;
2471
2971
  const { meta } = parseFrontmatter(content);
2472
2972
  if (meta.orphaned) continue;
@@ -2667,9 +3167,9 @@ function printChangesSummary(changes) {
2667
3167
  }
2668
3168
  async function extractForSource(root, sourceFile) {
2669
3169
  status("*", info(`Extracting: ${sourceFile}`));
2670
- const sourcePath = path18.join(root, SOURCES_DIR, sourceFile);
2671
- const sourceContent = await readFile10(sourcePath, "utf-8");
2672
- const existingIndex = await safeReadFile(path18.join(root, INDEX_FILE));
3170
+ const sourcePath = path21.join(root, SOURCES_DIR, sourceFile);
3171
+ const sourceContent = await readFile14(sourcePath, "utf-8");
3172
+ const existingIndex = await safeReadFile(path21.join(root, INDEX_FILE));
2673
3173
  const concepts = await extractConcepts(sourceContent, existingIndex);
2674
3174
  if (concepts.length > 0) {
2675
3175
  const names = concepts.map((c) => c.concept).join(", ");
@@ -2732,7 +3232,7 @@ async function generateMergedPage(root, entry, schema, options, sourceStates) {
2732
3232
  if (options.review) {
2733
3233
  return await persistReviewCandidate(root, entry, fullPage, sourceStates, schema);
2734
3234
  }
2735
- const pagePath = path18.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
3235
+ const pagePath = path21.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
2736
3236
  const error2 = await writePageIfValid(pagePath, fullPage, entry.concept.concept);
2737
3237
  return { error: error2 ?? void 0 };
2738
3238
  }
@@ -2760,7 +3260,7 @@ async function generateSeedPages(root, schema, generation) {
2760
3260
  }
2761
3261
  async function generateSingleSeedPage(root, schema, seed) {
2762
3262
  const slug = slugify(seed.title);
2763
- const pagePath = path18.join(root, CONCEPTS_DIR, `${slug}.md`);
3263
+ const pagePath = path21.join(root, CONCEPTS_DIR, `${slug}.md`);
2764
3264
  const relatedContent = await loadSeedRelatedPages(root, seed.relatedSlugs ?? []);
2765
3265
  const rule = schema.kinds[seed.kind];
2766
3266
  const system = buildSeedPagePrompt(seed, rule, relatedContent);
@@ -2792,7 +3292,7 @@ async function loadSeedRelatedPages(root, slugs) {
2792
3292
  if (slugs.length === 0) return "";
2793
3293
  const contents = [];
2794
3294
  for (const slug of slugs) {
2795
- const pagePath = path18.join(root, CONCEPTS_DIR, `${slug}.md`);
3295
+ const pagePath = path21.join(root, CONCEPTS_DIR, `${slug}.md`);
2796
3296
  const content = await safeReadFile(pagePath);
2797
3297
  if (content) contents.push(content);
2798
3298
  }
@@ -2847,7 +3347,7 @@ async function compileCommand(options = {}) {
2847
3347
 
2848
3348
  // src/commands/query.ts
2849
3349
  import { existsSync as existsSync8 } from "fs";
2850
- import path19 from "path";
3350
+ import path22 from "path";
2851
3351
  var PAGE_DIRS = [CONCEPTS_DIR, QUERIES_DIR];
2852
3352
  var PAGE_SELECTION_TOOL = {
2853
3353
  name: "select_pages",
@@ -2895,16 +3395,92 @@ ${indexContent}`;
2895
3395
  function buildFilteredIndex(candidates) {
2896
3396
  return candidates.map((entry) => `- **${entry.slug}**: ${entry.title} \u2014 ${entry.summary}`).join("\n");
2897
3397
  }
2898
- async function selectRelevantPages(root, question) {
3398
+ async function selectRelevantPages(root, question, debug) {
3399
+ const chunkSelection = await trySelectViaChunks(root, question, debug);
3400
+ if (chunkSelection) return chunkSelection;
2899
3401
  const candidates = await tryFindRelevantPages(root, question);
2900
3402
  if (candidates.length > 0) {
2901
3403
  const filteredIndex = buildFilteredIndex(candidates);
2902
3404
  const { pages: rawPages2, reasoning: reasoning2 } = await selectPages(question, filteredIndex);
2903
- return { pages: rawPages2, rawPages: rawPages2, reasoning: reasoning2 };
3405
+ return { pages: rawPages2, rawPages: rawPages2, reasoning: reasoning2, chunks: [] };
2904
3406
  }
2905
- const indexContent = await safeReadFile(path19.join(root, INDEX_FILE));
3407
+ const indexContent = await safeReadFile(path22.join(root, INDEX_FILE));
2906
3408
  const { pages: rawPages, reasoning } = await selectPages(question, indexContent);
2907
- return { pages: rawPages.map((p) => slugify(p)), rawPages, reasoning };
3409
+ return { pages: rawPages.map((p) => slugify(p)), rawPages, reasoning, chunks: [] };
3410
+ }
3411
+ async function trySelectViaChunks(root, question, debug) {
3412
+ const ranked = await tryFindRelevantChunks(root, question);
3413
+ if (ranked.length === 0) return null;
3414
+ const reranked = rerankWithBm25(
3415
+ question,
3416
+ ranked.map(({ chunk, score }) => ({ text: chunk.text, baseScore: score, chunk }))
3417
+ );
3418
+ const kept = reranked.slice(0, CHUNK_RERANK_KEEP);
3419
+ const reorderingHappened = wasReordered(ranked, kept.map((k) => k.candidate.chunk));
3420
+ const chunkCitations = toChunkCitations(kept);
3421
+ const pageSlugs = collapseToPages(chunkCitations, QUERY_PAGE_LIMIT);
3422
+ const reasoning = buildChunkReasoning(chunkCitations, pageSlugs);
3423
+ return {
3424
+ pages: pageSlugs,
3425
+ rawPages: pageSlugs,
3426
+ reasoning,
3427
+ chunks: chunkCitations,
3428
+ debug: debug ? buildDebug(chunkCitations, pageSlugs, reorderingHappened) : void 0
3429
+ };
3430
+ }
3431
+ function wasReordered(before, after) {
3432
+ const limit = Math.min(before.length, after.length);
3433
+ for (let i = 0; i < limit; i++) {
3434
+ if (before[i].chunk !== after[i]) return true;
3435
+ }
3436
+ return false;
3437
+ }
3438
+ function toChunkCitations(ranked) {
3439
+ return ranked.map(({ candidate, score }) => ({
3440
+ slug: candidate.chunk.slug,
3441
+ title: candidate.chunk.title,
3442
+ chunkIndex: candidate.chunk.chunkIndex,
3443
+ score,
3444
+ text: candidate.chunk.text
3445
+ }));
3446
+ }
3447
+ function collapseToPages(chunks, limit) {
3448
+ const slugs = [];
3449
+ const seen = /* @__PURE__ */ new Set();
3450
+ for (const chunk of chunks) {
3451
+ if (seen.has(chunk.slug)) continue;
3452
+ seen.add(chunk.slug);
3453
+ slugs.push(chunk.slug);
3454
+ if (slugs.length >= limit) break;
3455
+ }
3456
+ return slugs;
3457
+ }
3458
+ function buildChunkReasoning(chunks, pages) {
3459
+ const top = chunks.slice(0, pages.length);
3460
+ const summary = top.map((c) => `${c.slug}#${c.chunkIndex} (${c.score.toFixed(3)})`).join(", ");
3461
+ return `Selected ${pages.length} page(s) from ${chunks.length} reranked chunks: ${summary}`;
3462
+ }
3463
+ function buildDebug(chunks, pageSlugs, reranked) {
3464
+ const bestPerPage = /* @__PURE__ */ new Map();
3465
+ for (const c of chunks) {
3466
+ const prev = bestPerPage.get(c.slug);
3467
+ if (prev === void 0 || c.score > prev) bestPerPage.set(c.slug, c.score);
3468
+ }
3469
+ return {
3470
+ pages: pageSlugs.map((slug) => ({ slug, score: bestPerPage.get(slug) ?? 0 })),
3471
+ chunks,
3472
+ usedChunks: true,
3473
+ reranked
3474
+ };
3475
+ }
3476
+ async function tryFindRelevantChunks(root, question) {
3477
+ try {
3478
+ return await findRelevantChunks(root, question, CHUNK_TOP_K);
3479
+ } catch (err) {
3480
+ const message = err instanceof Error ? err.message : String(err);
3481
+ status("!", dim(`Chunk pre-filter unavailable (${message}); falling back.`));
3482
+ return [];
3483
+ }
2908
3484
  }
2909
3485
  async function tryFindRelevantPages(root, question) {
2910
3486
  try {
@@ -2920,7 +3496,7 @@ async function loadSelectedPages(root, slugs) {
2920
3496
  for (const slug of slugs) {
2921
3497
  let content = "";
2922
3498
  for (const dir of PAGE_DIRS) {
2923
- const candidate = await safeReadFile(path19.join(root, dir, `${slug}.md`));
3499
+ const candidate = await safeReadFile(path22.join(root, dir, `${slug}.md`));
2924
3500
  if (!candidate) continue;
2925
3501
  const { meta } = parseFrontmatter(candidate);
2926
3502
  if (meta.orphaned) continue;
@@ -2937,11 +3513,12 @@ ${content}`);
2937
3513
  return sections.join("\n\n");
2938
3514
  }
2939
3515
  var ANSWER_SYSTEM_PROMPT = "You are a knowledge assistant. Answer the question using ONLY the wiki content provided. Cite specific pages using [[Page Title]] wikilinks. If the wiki doesn't contain enough information, say so.";
2940
- async function callAnswerLLM(question, pagesContent, onToken) {
3516
+ async function callAnswerLLM(question, pagesContent, chunks, onToken) {
3517
+ const provenance = chunks.length > 0 ? buildChunkProvenance(chunks) : "";
2941
3518
  const userMessage = `Question: ${question}
2942
3519
 
2943
3520
  Relevant wiki pages:
2944
- ${pagesContent}`;
3521
+ ${pagesContent}${provenance}`;
2945
3522
  return callClaude({
2946
3523
  system: ANSWER_SYSTEM_PROMPT,
2947
3524
  messages: [{ role: "user", content: userMessage }],
@@ -2949,6 +3526,16 @@ ${pagesContent}`;
2949
3526
  onToken
2950
3527
  });
2951
3528
  }
3529
+ function buildChunkProvenance(chunks) {
3530
+ const sections = chunks.map(
3531
+ (chunk) => `--- ${chunk.slug} (chunk ${chunk.chunkIndex}) ---
3532
+ ${chunk.text}`
3533
+ );
3534
+ return `
3535
+
3536
+ Most relevant excerpts (from chunk-level retrieval):
3537
+ ${sections.join("\n\n")}`;
3538
+ }
2952
3539
  function summarizeAnswer(answer) {
2953
3540
  const firstLine = answer.trim().split(/\n/)[0] ?? "";
2954
3541
  const firstSentence = firstLine.split(/(?<=[.!?])\s/)[0] ?? firstLine;
@@ -2956,7 +3543,7 @@ function summarizeAnswer(answer) {
2956
3543
  }
2957
3544
  async function saveQueryPage(root, question, answer) {
2958
3545
  const slug = slugify(question);
2959
- const filePath = path19.join(root, QUERIES_DIR, `${slug}.md`);
3546
+ const filePath = path22.join(root, QUERIES_DIR, `${slug}.md`);
2960
3547
  const frontmatter = buildFrontmatter({
2961
3548
  title: question,
2962
3549
  summary: summarizeAnswer(answer),
@@ -2982,30 +3569,42 @@ ${answer}
2982
3569
  return slug;
2983
3570
  }
2984
3571
  async function generateAnswer(root, question, options = {}) {
2985
- if (!existsSync8(path19.join(root, INDEX_FILE))) {
3572
+ if (!existsSync8(path22.join(root, INDEX_FILE))) {
2986
3573
  throw new Error("Wiki index not found. Run `llmwiki compile` first.");
2987
3574
  }
2988
- const { pages, reasoning } = await selectRelevantPages(root, question);
2989
- options.onPageSelection?.(pages, reasoning);
2990
- const pagesContent = await loadSelectedPages(root, pages);
3575
+ const selection = await selectRelevantPages(root, question, Boolean(options.debug));
3576
+ options.onPageSelection?.(selection.pages, selection.reasoning);
3577
+ const pagesContent = await loadSelectedPages(root, selection.pages);
2991
3578
  if (!pagesContent) {
2992
- return { answer: "", selectedPages: pages, reasoning };
2993
- }
2994
- const answer = await callAnswerLLM(question, pagesContent, options.onToken);
2995
- let saved;
2996
- if (options.save) {
2997
- saved = await saveQueryPage(root, question, answer);
3579
+ return buildEmptyResult(selection);
2998
3580
  }
2999
- return { answer, selectedPages: pages, reasoning, saved };
3581
+ const answer = await callAnswerLLM(question, pagesContent, selection.chunks, options.onToken);
3582
+ const saved = options.save ? await saveQueryPage(root, question, answer) : void 0;
3583
+ return {
3584
+ answer,
3585
+ selectedPages: selection.pages,
3586
+ reasoning: selection.reasoning,
3587
+ saved,
3588
+ debug: selection.debug
3589
+ };
3590
+ }
3591
+ function buildEmptyResult(selection) {
3592
+ return {
3593
+ answer: "",
3594
+ selectedPages: selection.pages,
3595
+ reasoning: selection.reasoning,
3596
+ debug: selection.debug
3597
+ };
3000
3598
  }
3001
3599
  async function queryCommand(root, question, options) {
3002
- if (!existsSync8(path19.join(root, INDEX_FILE))) {
3600
+ if (!existsSync8(path22.join(root, INDEX_FILE))) {
3003
3601
  status("!", error("Wiki index not found. Run `llmwiki compile` first."));
3004
3602
  return;
3005
3603
  }
3006
3604
  header("Selecting relevant pages");
3007
3605
  const result = await generateAnswer(root, question, {
3008
3606
  save: options.save,
3607
+ debug: options.debug,
3009
3608
  onToken: (text) => process.stdout.write(text),
3010
3609
  onPageSelection: (pages, reasoning) => {
3011
3610
  status("i", dim(`Reasoning: ${reasoning}`));
@@ -3014,6 +3613,7 @@ async function queryCommand(root, question, options) {
3014
3613
  }
3015
3614
  });
3016
3615
  process.stdout.write("\n");
3616
+ if (result.debug) printDebugSnapshot(result.debug);
3017
3617
  if (!result.answer) {
3018
3618
  status("!", error("No matching pages found. Try refining your question."));
3019
3619
  return;
@@ -3024,14 +3624,34 @@ async function queryCommand(root, question, options) {
3024
3624
  status("\u2192", dim("Tip: use --save to add this answer to your wiki"));
3025
3625
  }
3026
3626
  }
3627
+ function printDebugSnapshot(debug) {
3628
+ header("Retrieval debug");
3629
+ status(
3630
+ "i",
3631
+ dim(
3632
+ `Source: ${debug.usedChunks ? "chunk-level" : "page-level"}; reranked: ${debug.reranked ? "yes" : "no"}`
3633
+ )
3634
+ );
3635
+ for (const page of debug.pages) {
3636
+ status("\u2022", `${page.slug} (best chunk score ${page.score.toFixed(3)})`);
3637
+ }
3638
+ for (const chunk of debug.chunks) {
3639
+ const preview = chunk.text.slice(0, DEBUG_CHUNK_PREVIEW_CHARS).replace(/\s+/g, " ").trim();
3640
+ status(
3641
+ "\xB7",
3642
+ dim(`${chunk.slug}#${chunk.chunkIndex} score=${chunk.score.toFixed(3)} :: ${preview}\u2026`)
3643
+ );
3644
+ }
3645
+ }
3646
+ var DEBUG_CHUNK_PREVIEW_CHARS = 120;
3027
3647
 
3028
3648
  // src/commands/watch.ts
3029
3649
  import { watch as chokidarWatch } from "chokidar";
3030
3650
  import { existsSync as existsSync9 } from "fs";
3031
- import path20 from "path";
3651
+ import path23 from "path";
3032
3652
  var DEBOUNCE_MS = 500;
3033
3653
  async function watchCommand() {
3034
- const sourcesPath = path20.resolve(SOURCES_DIR);
3654
+ const sourcesPath = path23.resolve(SOURCES_DIR);
3035
3655
  if (!existsSync9(sourcesPath)) {
3036
3656
  status(
3037
3657
  "!",
@@ -3066,7 +3686,7 @@ async function watchCommand() {
3066
3686
  const scheduleCompile = (eventPath, event) => {
3067
3687
  status(
3068
3688
  "~",
3069
- dim(`${event}: ${path20.basename(eventPath)}`)
3689
+ dim(`${event}: ${path23.basename(eventPath)}`)
3070
3690
  );
3071
3691
  if (debounceTimer) clearTimeout(debounceTimer);
3072
3692
  debounceTimer = setTimeout(triggerCompile, DEBOUNCE_MS);
@@ -3153,7 +3773,7 @@ async function lintCommand() {
3153
3773
  // src/commands/schema.ts
3154
3774
  import { existsSync as existsSync10 } from "fs";
3155
3775
  import { mkdir as mkdir6, writeFile as writeFile5 } from "fs/promises";
3156
- import path21 from "path";
3776
+ import path24 from "path";
3157
3777
  async function schemaInitCommand() {
3158
3778
  const root = process.cwd();
3159
3779
  const defaults = buildDefaultSchema();
@@ -3162,7 +3782,7 @@ async function schemaInitCommand() {
3162
3782
  status("!", warn(`Schema file already exists at ${targetPath}`));
3163
3783
  return;
3164
3784
  }
3165
- await mkdir6(path21.dirname(targetPath), { recursive: true });
3785
+ await mkdir6(path24.dirname(targetPath), { recursive: true });
3166
3786
  const serializable = {
3167
3787
  version: defaults.version,
3168
3788
  defaultKind: defaults.defaultKind,
@@ -3221,7 +3841,7 @@ async function reviewShowCommand(id) {
3221
3841
  }
3222
3842
 
3223
3843
  // src/commands/review-approve.ts
3224
- import path22 from "path";
3844
+ import path25 from "path";
3225
3845
 
3226
3846
  // src/commands/review-helpers.ts
3227
3847
  async function runReviewUnderLock(id, underLock) {
@@ -3253,7 +3873,7 @@ async function approveUnderLock(root, id) {
3253
3873
  process.exitCode = 1;
3254
3874
  return;
3255
3875
  }
3256
- const pagePath = path22.join(root, CONCEPTS_DIR, `${candidate.slug}.md`);
3876
+ const pagePath = path25.join(root, CONCEPTS_DIR, `${candidate.slug}.md`);
3257
3877
  await atomicWrite(pagePath, candidate.body);
3258
3878
  status("+", success(`Approved \u2192 ${source(pagePath)}`));
3259
3879
  await persistCandidateSourceStates(root, candidate);
@@ -3313,7 +3933,7 @@ import { McpServer as McpServer2 } from "@modelcontextprotocol/sdk/server/mcp.js
3313
3933
  import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
3314
3934
 
3315
3935
  // src/mcp/tools.ts
3316
- import path23 from "path";
3936
+ import path26 from "path";
3317
3937
  import { z } from "zod";
3318
3938
 
3319
3939
  // src/mcp/provider-check.ts
@@ -3406,15 +4026,16 @@ function registerQueryTool(server, root) {
3406
4026
  "query_wiki",
3407
4027
  {
3408
4028
  title: "Query Wiki",
3409
- description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Requires an LLM provider.",
4029
+ description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Set debug=true to include the selected chunks and their scores. Requires an LLM provider.",
3410
4030
  inputSchema: {
3411
4031
  question: z.string().describe("The natural-language question to answer."),
3412
- save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true.")
4032
+ save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true."),
4033
+ debug: z.boolean().optional().describe("Include retrieval debug info (selected chunks/pages + scores).")
3413
4034
  }
3414
4035
  },
3415
- async ({ question, save }) => {
4036
+ async ({ question, save, debug }) => {
3416
4037
  ensureProviderAvailable();
3417
- const result = await generateAnswer(root, question, { save });
4038
+ const result = await generateAnswer(root, question, { save, debug });
3418
4039
  return jsonResult(result);
3419
4040
  }
3420
4041
  );
@@ -3438,15 +4059,30 @@ function registerSearchTool(server, root) {
3438
4059
  );
3439
4060
  }
3440
4061
  async function pickSearchSlugs(root, question) {
4062
+ try {
4063
+ const chunks = await findRelevantChunks(root, question, CHUNK_TOP_K);
4064
+ if (chunks.length > 0) return dedupePreservingOrder(chunks.map((c) => c.chunk.slug));
4065
+ } catch {
4066
+ }
3441
4067
  try {
3442
4068
  const candidates = await findRelevantPages(root, question);
3443
4069
  if (candidates.length > 0) return candidates.map((c) => c.slug);
3444
4070
  } catch {
3445
4071
  }
3446
- const indexContent = await safeReadFile(path23.join(root, INDEX_FILE));
4072
+ const indexContent = await safeReadFile(path26.join(root, INDEX_FILE));
3447
4073
  const { pages } = await selectPages(question, indexContent);
3448
4074
  return pages;
3449
4075
  }
4076
+ function dedupePreservingOrder(slugs) {
4077
+ const seen = /* @__PURE__ */ new Set();
4078
+ const out = [];
4079
+ for (const slug of slugs) {
4080
+ if (seen.has(slug)) continue;
4081
+ seen.add(slug);
4082
+ out.push(slug);
4083
+ }
4084
+ return out;
4085
+ }
3450
4086
  function registerReadTool(server, root) {
3451
4087
  server.registerTool(
3452
4088
  "read_page",
@@ -3492,8 +4128,8 @@ function registerStatusTool(server, root) {
3492
4128
  );
3493
4129
  }
3494
4130
  async function collectStatus(root) {
3495
- const concepts = await collectPageSummaries(path23.join(root, CONCEPTS_DIR));
3496
- const queries = await collectPageSummaries(path23.join(root, QUERIES_DIR));
4131
+ const concepts = await collectPageSummaries(path26.join(root, CONCEPTS_DIR));
4132
+ const queries = await collectPageSummaries(path26.join(root, QUERIES_DIR));
3497
4133
  const state = await readState(root);
3498
4134
  const changes = await detectChanges(root, state);
3499
4135
  const orphans = await findOrphanedSlugs(root);
@@ -3510,7 +4146,7 @@ async function collectStatus(root) {
3510
4146
  };
3511
4147
  }
3512
4148
  async function findOrphanedSlugs(root) {
3513
- const scanned = await scanWikiPages(path23.join(root, CONCEPTS_DIR));
4149
+ const scanned = await scanWikiPages(path26.join(root, CONCEPTS_DIR));
3514
4150
  return scanned.filter(({ meta }) => meta.orphaned).map(({ slug }) => slug);
3515
4151
  }
3516
4152
  async function loadPageRecords(root, slugs) {
@@ -3523,7 +4159,7 @@ async function loadPageRecords(root, slugs) {
3523
4159
  }
3524
4160
  async function readPage(root, slug) {
3525
4161
  for (const dir of PAGE_DIRS2) {
3526
- const content = await safeReadFile(path23.join(root, dir, `${slug}.md`));
4162
+ const content = await safeReadFile(path26.join(root, dir, `${slug}.md`));
3527
4163
  if (!content) continue;
3528
4164
  const { meta, body } = parseFrontmatter(content);
3529
4165
  if (meta.orphaned) continue;
@@ -3538,7 +4174,7 @@ async function readPage(root, slug) {
3538
4174
  }
3539
4175
 
3540
4176
  // src/mcp/resources.ts
3541
- import path24 from "path";
4177
+ import path27 from "path";
3542
4178
  import { readdir as readdir9 } from "fs/promises";
3543
4179
  import { ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
3544
4180
  function jsonContent(uri, payload) {
@@ -3572,7 +4208,7 @@ function registerIndexResource(server, root) {
3572
4208
  mimeType: "text/markdown"
3573
4209
  },
3574
4210
  async (uri) => {
3575
- const content = await safeReadFile(path24.join(root, INDEX_FILE));
4211
+ const content = await safeReadFile(path27.join(root, INDEX_FILE));
3576
4212
  return { contents: [markdownContent(uri, content)] };
3577
4213
  }
3578
4214
  );
@@ -3639,7 +4275,7 @@ function registerQueryResource(server, root) {
3639
4275
  );
3640
4276
  }
3641
4277
  async function listSources(root) {
3642
- const sourcesPath = path24.join(root, SOURCES_DIR);
4278
+ const sourcesPath = path27.join(root, SOURCES_DIR);
3643
4279
  let files;
3644
4280
  try {
3645
4281
  files = await readdir9(sourcesPath);
@@ -3648,14 +4284,14 @@ async function listSources(root) {
3648
4284
  }
3649
4285
  const records = [];
3650
4286
  for (const file of files.filter((f) => f.endsWith(".md"))) {
3651
- const content = await safeReadFile(path24.join(sourcesPath, file));
4287
+ const content = await safeReadFile(path27.join(sourcesPath, file));
3652
4288
  const { meta } = parseFrontmatter(content);
3653
4289
  records.push({ filename: file, ...meta });
3654
4290
  }
3655
4291
  return records;
3656
4292
  }
3657
4293
  async function loadPageWithMeta(root, dir, slug) {
3658
- const filePath = path24.join(root, dir, `${slug}.md`);
4294
+ const filePath = path27.join(root, dir, `${slug}.md`);
3659
4295
  const content = await safeReadFile(filePath);
3660
4296
  if (!content) {
3661
4297
  throw new Error(`Page not found: ${dir}/${slug}.md`);
@@ -3664,7 +4300,7 @@ async function loadPageWithMeta(root, dir, slug) {
3664
4300
  return { slug, meta, body: body.trim() };
3665
4301
  }
3666
4302
  async function listPagesUnder(root, dir, scheme) {
3667
- const pagesPath = path24.join(root, dir);
4303
+ const pagesPath = path27.join(root, dir);
3668
4304
  let files;
3669
4305
  try {
3670
4306
  files = await readdir9(pagesPath);
@@ -3748,7 +4384,7 @@ reviewCommand.command("reject <id>").description("Reject a candidate and archive
3748
4384
  process.exit(1);
3749
4385
  }
3750
4386
  });
3751
- program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").action(async (question, options) => {
4387
+ program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").option("--debug", "Print which pages and chunks were selected and their scores").action(async (question, options) => {
3752
4388
  try {
3753
4389
  requireProvider();
3754
4390
  await queryCommand(process.cwd(), question, options);