llm-wiki-compiler 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -4
- package/dist/cli.js +1008 -371
- package/dist/cli.js.map +1 -1
- package/package.json +6 -3
package/dist/cli.js
CHANGED
|
@@ -6,8 +6,8 @@ import { createRequire } from "module";
|
|
|
6
6
|
import { Command } from "commander";
|
|
7
7
|
|
|
8
8
|
// src/commands/ingest.ts
|
|
9
|
-
import
|
|
10
|
-
import { mkdir as mkdir2, writeFile as writeFile2 } from "fs/promises";
|
|
9
|
+
import path7 from "path";
|
|
10
|
+
import { mkdir as mkdir2, readFile as readFile6, writeFile as writeFile2 } from "fs/promises";
|
|
11
11
|
|
|
12
12
|
// src/utils/markdown.ts
|
|
13
13
|
import { writeFile, rename, readFile, mkdir } from "fs/promises";
|
|
@@ -150,9 +150,17 @@ var LOCK_FILE = ".llmwiki/lock";
|
|
|
150
150
|
var INDEX_FILE = "wiki/index.md";
|
|
151
151
|
var MOC_FILE = "wiki/MOC.md";
|
|
152
152
|
var EMBEDDINGS_FILE = ".llmwiki/embeddings.json";
|
|
153
|
+
var IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".jpg", ".jpeg", ".png", ".gif", ".webp"]);
|
|
154
|
+
var TRANSCRIPT_EXTENSIONS = /* @__PURE__ */ new Set([".vtt", ".srt"]);
|
|
155
|
+
var IMAGE_DESCRIBE_MAX_TOKENS = 2048;
|
|
153
156
|
var CANDIDATES_DIR = ".llmwiki/candidates";
|
|
154
157
|
var CANDIDATES_ARCHIVE_DIR = ".llmwiki/candidates/archive";
|
|
155
158
|
var EMBEDDING_TOP_K = 15;
|
|
159
|
+
var CHUNK_TOP_K = 30;
|
|
160
|
+
var CHUNK_RERANK_KEEP = 12;
|
|
161
|
+
var CHUNK_TARGET_CHARS = 800;
|
|
162
|
+
var CHUNK_MAX_CHARS = 1400;
|
|
163
|
+
var CHUNK_MIN_CHARS = 200;
|
|
156
164
|
var LOW_CONFIDENCE_THRESHOLD = 0.5;
|
|
157
165
|
var MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS = 2;
|
|
158
166
|
var EMBEDDING_MODELS = {
|
|
@@ -237,19 +245,24 @@ async function ingestWeb(url) {
|
|
|
237
245
|
|
|
238
246
|
// src/ingest/file.ts
|
|
239
247
|
import { readFile as readFile2 } from "fs/promises";
|
|
248
|
+
import path3 from "path";
|
|
249
|
+
|
|
250
|
+
// src/ingest/shared.ts
|
|
240
251
|
import path2 from "path";
|
|
241
|
-
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
|
|
242
252
|
/**
 * Derive a human-readable title from a file path.
 *
 * The directory and the extension are dropped, every run of hyphens and
 * underscores becomes a single space, and the result is trimmed.
 *
 * @param {string} filePath - Path (or bare name) of the file.
 * @returns {string} The title text.
 */
function titleFromFilename(filePath) {
  const extension = path2.extname(filePath);
  const stem = path2.basename(filePath, extension);
  const spaced = stem.replace(/[-_]+/g, " ");
  return spaced.trim();
}
|
|
256
|
+
|
|
257
|
+
// src/ingest/file.ts
|
|
258
|
+
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
|
|
246
259
|
/**
 * Wrap plain text in a markdown fenced code block.
 *
 * The fence is three backticks unless the text itself contains a run of three
 * or more backticks; in that case the fence is made one backtick longer than
 * the longest run so the text cannot terminate the block early.
 *
 * @param {string} text - Raw text to wrap.
 * @returns {string} The fenced block (no trailing newline).
 */
function wrapPlainText(text) {
  const backtickRuns = String(text).match(/`+/g) ?? [];
  const longestRun = backtickRuns.reduce((max, run) => Math.max(max, run.length), 0);
  const fence = "`".repeat(Math.max(3, longestRun + 1));
  return `${fence}\n${text}\n${fence}`;
}
|
|
251
264
|
async function ingestFile(filePath) {
|
|
252
|
-
const ext =
|
|
265
|
+
const ext = path3.extname(filePath).toLowerCase();
|
|
253
266
|
if (!SUPPORTED_EXTENSIONS.has(ext)) {
|
|
254
267
|
throw new Error(
|
|
255
268
|
`Unsupported file type "${ext}". Only .md and .txt files are supported.`
|
|
@@ -261,10 +274,440 @@ async function ingestFile(filePath) {
|
|
|
261
274
|
return { title, content };
|
|
262
275
|
}
|
|
263
276
|
|
|
277
|
+
// src/ingest/pdf.ts
|
|
278
|
+
import { readFile as readFile3 } from "fs/promises";
|
|
279
|
+
/**
 * Choose a title for a PDF.
 *
 * Uses the `Title` entry of the PDF info object when it is a non-blank
 * string (trimmed); otherwise falls back to a title derived from the file name.
 *
 * @param {string} filePath - Path of the PDF.
 * @param {unknown} info2 - Info object reported by the PDF parser, if any.
 * @returns {string} The resolved title.
 */
function resolveTitle(filePath, info2) {
  const candidate = info2 && typeof info2 === "object" ? info2["Title"] : void 0;
  if (typeof candidate === "string") {
    const cleaned = candidate.trim();
    if (cleaned.length > 0) {
      return cleaned;
    }
  }
  return titleFromFilename(filePath);
}
|
|
288
|
+
/**
 * Extract the text and title of a PDF file.
 *
 * `pdf-parse` is loaded lazily with a dynamic import. The parser is always
 * destroyed, whether extraction succeeds or throws.
 *
 * @param {string} filePath - Path of the PDF to read.
 * @returns {Promise<{title: string, content: string}>} Title and trimmed text.
 */
async function ingestPdf(filePath) {
  const pdfModule = await import("pdf-parse");
  const fileBytes = await readFile3(filePath);
  const parser = new pdfModule.PDFParse({ data: new Uint8Array(fileBytes) });
  try {
    const extracted = await parser.getText();
    const described = await parser.getInfo();
    return {
      title: resolveTitle(filePath, described.info),
      content: extracted.text.trim()
    };
  } finally {
    await parser.destroy();
  }
}
|
|
302
|
+
|
|
303
|
+
// src/ingest/image.ts
|
|
304
|
+
import { readFile as readFile4 } from "fs/promises";
|
|
305
|
+
import path5 from "path";
|
|
306
|
+
import Anthropic2 from "@anthropic-ai/sdk";
|
|
307
|
+
|
|
308
|
+
// src/providers/anthropic.ts
|
|
309
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
310
|
+
var VOYAGE_EMBEDDINGS_URL = "https://api.voyageai.com/v1/embeddings";
|
|
311
|
+
/**
 * Build the options object handed to the Anthropic SDK constructor.
 *
 * `apiKey`, `authToken` and `baseURL` are trimmed and copied only when
 * non-empty. A single trailing "/" is removed from the base URL unless the
 * whole value is just "/".
 *
 * @param {{apiKey?: string, authToken?: string, baseURL?: string}} [options]
 * @returns {{apiKey?: string, authToken?: string, baseURL?: string}}
 */
function buildAnthropicClientOptions(options = {}) {
  const baseURL = options.baseURL?.trim();
  const apiKey = options.apiKey?.trim();
  const authToken = options.authToken?.trim();
  const clientOptions = {};
  if (apiKey) clientOptions.apiKey = apiKey;
  if (authToken) clientOptions.authToken = authToken;
  if (baseURL) {
    const hasTrailingSlash = baseURL.length > 1 && baseURL.endsWith("/");
    clientOptions.baseURL = hasTrailingSlash ? baseURL.slice(0, -1) : baseURL;
  }
  return clientOptions;
}
|
|
329
|
+
/**
 * LLM provider backed by the Anthropic SDK for completions and by the Voyage
 * HTTP API for embeddings.
 */
var AnthropicProvider = class {
  client;
  model;
  /**
   * @param {string} model - Model identifier sent with every request.
   * @param {object} [options] - Credentials / base URL, filtered through
   *   buildAnthropicClientOptions before reaching the SDK.
   */
  constructor(model, options = {}) {
    this.model = model;
    this.client = new Anthropic(buildAnthropicClientOptions(options));
  }
  /**
   * Send one non-streaming completion request.
   * @returns {Promise<string>} Text of the first text block, or "" when the
   *   response has none.
   */
  async complete(system, messages, maxTokens) {
    const request = {
      model: this.model,
      max_tokens: maxTokens,
      system,
      messages
    };
    const reply = await this.client.messages.create(request);
    const firstText = reply.content.find((block) => block.type === "text");
    if (firstText?.type !== "text") {
      return "";
    }
    return firstText.text;
  }
  /**
   * Stream a completion, calling `onToken` (when given) with every text delta.
   * @returns {Promise<string>} All text deltas concatenated.
   */
  async stream(system, messages, maxTokens, onToken) {
    const events = this.client.messages.stream({
      model: this.model,
      max_tokens: maxTokens,
      system,
      messages
    });
    let collected = "";
    for await (const event of events) {
      if (event.type !== "content_block_delta") continue;
      if (event.delta.type !== "text_delta") continue;
      collected += event.delta.text;
      onToken?.(event.delta.text);
    }
    return collected;
  }
  /**
   * Call the model with tool definitions.
   * @returns {Promise<string>} The first tool_use block's input serialized as
   *   JSON; otherwise the first text block's text; otherwise "".
   */
  async toolCall(system, messages, tools, maxTokens) {
    const toolDefinitions = tools.map((t) => {
      return { name: t.name, description: t.description, input_schema: t.input_schema };
    });
    const reply = await this.client.messages.create({
      model: this.model,
      max_tokens: maxTokens,
      system,
      messages,
      tools: toolDefinitions
    });
    const toolUse = reply.content.find((block) => block.type === "tool_use");
    if (toolUse?.type === "tool_use") {
      return JSON.stringify(toolUse.input);
    }
    const firstText = reply.content.find((block) => block.type === "text");
    return firstText?.type === "text" ? firstText.text : "";
  }
  /**
   * Produce a single embedding vector via the Voyage API.
   *
   * Anthropic does not ship a first-party embeddings endpoint, so we delegate
   * to Voyage (their recommended partner). Requires VOYAGE_API_KEY.
   *
   * @param {string} text - Text to embed.
   * @returns {Promise<number[]>} The embedding of the first result.
   * @throws {Error} When the key is missing, the request is not OK, or the
   *   response carries no vector.
   */
  async embed(text) {
    const voyageKey = process.env.VOYAGE_API_KEY?.trim();
    if (!voyageKey) {
      throw new Error(
        "VOYAGE_API_KEY is not set. Anthropic embeddings use Voyage \u2014 set VOYAGE_API_KEY to enable semantic search."
      );
    }
    const payload = JSON.stringify({ input: text, model: EMBEDDING_MODELS.anthropic });
    const response = await fetch(VOYAGE_EMBEDDINGS_URL, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${voyageKey}`
      },
      body: payload
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`Voyage embeddings request failed (${response.status}): ${detail}`);
    }
    const parsed = await response.json();
    const vector = parsed.data?.[0]?.embedding;
    if (Array.isArray(vector)) {
      return vector;
    }
    throw new Error("Voyage embeddings response did not include a vector.");
  }
};
|
|
418
|
+
|
|
419
|
+
// src/utils/claude-settings.ts
|
|
420
|
+
import { readFileSync } from "fs";
|
|
421
|
+
import { homedir } from "os";
|
|
422
|
+
import path4 from "path";
|
|
423
|
+
var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
|
|
424
|
+
/**
 * Tell whether a value is a non-null object (arrays included).
 * @param {unknown} value
 * @returns {boolean}
 */
function isRecord(value) {
  if (value === null) {
    return false;
  }
  return typeof value === "object";
}
|
|
427
|
+
/**
 * Trim a string setting, treating blank and non-string values as absent.
 * @param {unknown} value
 * @returns {string | undefined} The trimmed string, or undefined.
 */
function normalize(value) {
  if (typeof value === "string") {
    const stripped = value.trim();
    if (stripped !== "") {
      return stripped;
    }
  }
  return void 0;
}
|
|
432
|
+
/**
 * Locate the Claude settings file.
 *
 * The environment variable named by CLAUDE_SETTINGS_PATH_ENV wins whenever it
 * is set (even to an empty string); otherwise the path is
 * `<home>/.claude/settings.json`.
 *
 * @param {Record<string, string | undefined>} env - Environment to consult.
 * @returns {string} Path of the settings file.
 */
function resolveClaudeSettingsPath(env) {
  const override = env[CLAUDE_SETTINGS_PATH_ENV];
  if (override !== void 0 && override !== null) {
    return override;
  }
  return path4.join(homedir(), ".claude", "settings.json");
}
|
|
435
|
+
/**
 * Read the Claude settings file as UTF-8 text.
 *
 * A missing file (ENOENT) is not an error and yields undefined. Any other
 * failure is rethrown with the path in the message and the original error
 * attached as `cause`, so the underlying code and stack stay available.
 *
 * @param {string} settingsPath - Path of the settings file.
 * @returns {string | undefined} File contents, or undefined when absent.
 * @throws {Error} When the file exists but cannot be read.
 */
function readClaudeSettingsFile(settingsPath) {
  try {
    return readFileSync(settingsPath, "utf8");
  } catch (err) {
    if (isRecord(err) && err.code === "ENOENT") {
      return void 0;
    }
    const message = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to read Claude settings at "${settingsPath}": ${message}`, {
      cause: err
    });
  }
}
|
|
446
|
+
/**
 * Load the Anthropic-related entries from the `env` section of the Claude
 * settings file.
 *
 * @param {Record<string, string | undefined>} [env] - Environment used to
 *   locate the settings file.
 * @returns {{ANTHROPIC_API_KEY?: string, ANTHROPIC_AUTH_TOKEN?: string,
 *   ANTHROPIC_BASE_URL?: string, ANTHROPIC_MODEL?: string} | undefined}
 *   The normalized values, or undefined when the file is missing or empty,
 *   has no object-valued `env` section, or none of the four keys is set.
 * @throws {Error} When the file cannot be read or is not valid JSON.
 */
function readClaudeSettingsEnv(env = process.env) {
  const settingsPath = resolveClaudeSettingsPath(env);
  const raw = readClaudeSettingsFile(settingsPath);
  if (!raw) {
    return void 0;
  }
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${message}`);
  }
  const hasEnvSection = isRecord(parsed) && isRecord(parsed.env);
  if (!hasEnvSection) {
    return void 0;
  }
  const wantedKeys = [
    "ANTHROPIC_API_KEY",
    "ANTHROPIC_AUTH_TOKEN",
    "ANTHROPIC_BASE_URL",
    "ANTHROPIC_MODEL"
  ];
  const values = {};
  for (const key of wantedKeys) {
    values[key] = normalize(parsed.env[key]);
  }
  const anyPresent = wantedKeys.some((key) => Boolean(values[key]));
  return anyPresent ? values : void 0;
}
|
|
471
|
+
/**
 * Best-effort variant of readClaudeSettingsEnv: any error while reading or
 * parsing the settings file is deliberately turned into "no settings".
 *
 * @param {Record<string, string | undefined>} env
 * @returns {object | undefined} The settings values, or undefined.
 */
function tryReadClaudeSettingsEnv(env) {
  let settings;
  try {
    settings = readClaudeSettingsEnv(env);
  } catch {
    settings = void 0;
  }
  return settings;
}
|
|
478
|
+
/**
 * Check that a base URL parses and uses http or https.
 *
 * @param {string} value - Candidate base URL.
 * @returns {string} The trimmed value, unchanged otherwise.
 * @throws {Error} "Invalid ANTHROPIC_BASE_URL: ..." when the value does not
 *   parse as a URL or uses another protocol.
 */
function validateAnthropicBaseURL(value) {
  const candidate = value.trim();
  let protocol;
  try {
    protocol = new URL(candidate).protocol;
  } catch (err) {
    const reason = err instanceof Error ? err.message : "Must be a valid http(s) URL.";
    throw new Error(`Invalid ANTHROPIC_BASE_URL: "${candidate}". ${reason}`);
  }
  const isHttp = protocol === "http:" || protocol === "https:";
  if (!isHttp) {
    throw new Error(`Invalid ANTHROPIC_BASE_URL: "${candidate}". Must use http:// or https:// protocol.`);
  }
  return candidate;
}
|
|
491
|
+
/**
 * Resolve Anthropic credentials.
 *
 * Precedence: ANTHROPIC_API_KEY from the environment, ANTHROPIC_AUTH_TOKEN
 * from the environment, then the same two keys (in that order) from the
 * Claude settings file. Errors from reading the settings file propagate.
 *
 * @param {Record<string, string | undefined>} [env]
 * @returns {{apiKey?: string, authToken?: string}} At most one credential.
 */
function resolveAnthropicAuthFromEnv(env = process.env) {
  const envApiKey = normalize(env.ANTHROPIC_API_KEY);
  if (envApiKey) {
    return { apiKey: envApiKey };
  }
  const envAuthToken = normalize(env.ANTHROPIC_AUTH_TOKEN);
  if (envAuthToken) {
    return { authToken: envAuthToken };
  }
  const settings = readClaudeSettingsEnv(env);
  const settingsApiKey = settings?.ANTHROPIC_API_KEY;
  const settingsAuthToken = settings?.ANTHROPIC_AUTH_TOKEN;
  if (settingsApiKey) {
    return { apiKey: settingsApiKey };
  }
  if (settingsAuthToken) {
    return { authToken: settingsAuthToken };
  }
  return {};
}
|
|
501
|
+
/**
 * Resolve the Anthropic model name.
 *
 * A non-blank LLMWIKI_MODEL wins (trimmed). A blank or whitespace-only value
 * is treated as unset, so the lookup continues with ANTHROPIC_MODEL from the
 * Claude settings file and callers' `?? default` fallbacks still apply
 * instead of an empty model name being sent to the API.
 *
 * @param {Record<string, string | undefined>} [env]
 * @returns {string | undefined} The model name, or undefined when none is configured.
 */
function resolveAnthropicModelFromEnv(env = process.env) {
  const explicitModel = env.LLMWIKI_MODEL;
  if (typeof explicitModel === "string") {
    const trimmed = explicitModel.trim();
    if (trimmed.length > 0) {
      return trimmed;
    }
  }
  // Settings lookup is best-effort: an unreadable settings file means "no model".
  return tryReadClaudeSettingsEnv(env)?.ANTHROPIC_MODEL;
}
|
|
506
|
+
/**
 * Resolve and validate the Anthropic base URL.
 *
 * ANTHROPIC_BASE_URL from the environment is preferred; otherwise the value
 * from the Claude settings file is used (read best-effort).
 *
 * @param {Record<string, string | undefined>} [env]
 * @returns {string | undefined} The validated URL, or undefined when unset.
 * @throws {Error} When the chosen value fails validateAnthropicBaseURL.
 */
function resolveAnthropicBaseURLFromEnv(env = process.env) {
  const candidate = normalize(env.ANTHROPIC_BASE_URL) || tryReadClaudeSettingsEnv(env)?.ANTHROPIC_BASE_URL;
  return candidate ? validateAnthropicBaseURL(candidate) : void 0;
}
|
|
513
|
+
|
|
514
|
+
// src/ingest/image.ts
|
|
515
|
+
// Image file extensions (lower-case, with leading dot) and their MIME types.
var EXTENSION_TO_MIME = {
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".png": "image/png",
  ".gif": "image/gif",
  ".webp": "image/webp"
};
/**
 * Map an image file extension to its MIME type (case-insensitive).
 *
 * Only the table's own keys are consulted, so names inherited from
 * Object.prototype (e.g. "constructor") are rejected like any other
 * unsupported extension.
 *
 * @param {string} ext - Extension including the leading dot.
 * @returns {string} The MIME type.
 * @throws {Error} When the extension is not in the table.
 */
function mimeTypeForExtension(ext) {
  const key = ext.toLowerCase();
  if (!Object.hasOwn(EXTENSION_TO_MIME, key)) {
    throw new Error(
      `Unsupported image extension "${ext}". Supported: ${Object.keys(EXTENSION_TO_MIME).join(", ")}`
    );
  }
  return EXTENSION_TO_MIME[key];
}
|
|
531
|
+
/**
 * Create an Anthropic SDK client from the environment: the base URL is
 * resolved first, then the credentials, and both are filtered through
 * buildAnthropicClientOptions.
 */
function buildClient() {
  const clientOptions = buildAnthropicClientOptions({
    baseURL: resolveAnthropicBaseURLFromEnv(),
    ...resolveAnthropicAuthFromEnv()
  });
  return new Anthropic2(clientOptions);
}
|
|
536
|
+
/**
 * Ask the model to transcribe and describe an image.
 *
 * Sends one user message holding the base64 image followed by the instruction
 * text, capped at IMAGE_DESCRIBE_MAX_TOKENS output tokens.
 *
 * @param {object} client - Object exposing `messages.create`.
 * @param {string} model - Model identifier.
 * @param {string} imageData - Base64-encoded image bytes.
 * @param {string} mimeType - MIME type of the image.
 * @returns {Promise<string>} Text of the first text block, or "" when none.
 */
async function describeImageWithVision(client, model, imageData, mimeType) {
  const imagePart = {
    type: "image",
    source: { type: "base64", media_type: mimeType, data: imageData }
  };
  const instructionPart = {
    type: "text",
    text: "Extract and transcribe all text visible in this image. Then provide a detailed description of any non-text visual content. Format your response as markdown."
  };
  const reply = await client.messages.create({
    model,
    max_tokens: IMAGE_DESCRIBE_MAX_TOKENS,
    messages: [{ role: "user", content: [imagePart, instructionPart] }]
  });
  const firstText = reply.content.find((block) => block.type === "text");
  if (firstText?.type !== "text") {
    return "";
  }
  return firstText.text;
}
|
|
559
|
+
/**
 * Ingest an image by having the Anthropic model transcribe/describe it.
 *
 * Only works when LLMWIKI_PROVIDER is unset or "anthropic". The model comes
 * from resolveAnthropicModelFromEnv, falling back to PROVIDER_MODELS.anthropic.
 *
 * @param {string} filePath - Path of the image file.
 * @returns {Promise<{title: string, content: string}>} Title from the file
 *   name and the model's description as content.
 * @throws {Error} When another provider is configured or the extension is
 *   not a supported image type.
 */
async function ingestImage(filePath) {
  const providerName = process.env.LLMWIKI_PROVIDER ?? "anthropic";
  if (providerName !== "anthropic") {
    throw new Error(
      `Image ingest requires the Anthropic provider (vision). Current provider: "${providerName}". Set LLMWIKI_PROVIDER=anthropic and ANTHROPIC_API_KEY to use image ingest.`
    );
  }
  const mimeType = mimeTypeForExtension(path5.extname(filePath).toLowerCase());
  const imageBytes = await readFile4(filePath);
  const encodedImage = imageBytes.toString("base64");
  const client = buildClient();
  const model = resolveAnthropicModelFromEnv() ?? PROVIDER_MODELS.anthropic;
  const description = await describeImageWithVision(client, model, encodedImage, mimeType);
  return { title: titleFromFilename(filePath), content: description };
}
|
|
576
|
+
|
|
577
|
+
// src/ingest/transcript.ts
|
|
578
|
+
import { readFile as readFile5 } from "fs/promises";
|
|
579
|
+
import path6 from "path";
|
|
580
|
+
import { YoutubeTranscript as YoutubeTranscriptUntyped } from "youtube-transcript/dist/youtube-transcript.esm.js";
|
|
581
|
+
var YoutubeTranscript = YoutubeTranscriptUntyped;
|
|
582
|
+
var YOUTUBE_URL_PATTERN = /^https?:\/\/(www\.)?(youtube\.com\/watch|youtu\.be\/)/;
|
|
583
|
+
var SRT_SEQUENCE_PATTERN = /^\d+$/;
|
|
584
|
+
var TIMESTAMP_PATTERN = /\d{2}:\d{2}[:.]\d{2}/;
|
|
585
|
+
var MS_PER_MINUTE = 6e4;
|
|
586
|
+
var MS_PER_SECOND = 1e3;
|
|
587
|
+
/**
 * Tell whether a source string matches YOUTUBE_URL_PATTERN.
 * @param {string} source2 - Source given to the ingest command.
 * @returns {boolean}
 */
function isYoutubeUrl(source2) {
  const matched = YOUTUBE_URL_PATTERN.exec(source2);
  return matched !== null;
}
|
|
590
|
+
/**
 * Extract the video id from a YouTube URL.
 *
 * The URL is parsed properly: for youtu.be links the id is the first path
 * segment, otherwise it is the `v` query parameter. Parsing (rather than
 * searching for the first "v=") avoids picking up parameters whose name
 * merely ends in "v" and keeps `#fragment` text out of the id. Strings that
 * are not parseable URLs fall back to a pattern match.
 *
 * @param {string} url - YouTube watch or youtu.be URL.
 * @returns {string} The video id.
 * @throws {Error} When no id can be found.
 */
function extractVideoId(url) {
  let videoId = "";
  try {
    const parsed = new URL(url);
    const host = parsed.hostname.toLowerCase();
    if (host === "youtu.be" || host.endsWith(".youtu.be")) {
      videoId = parsed.pathname.split("/").find((segment) => segment.length > 0) ?? "";
    } else {
      videoId = parsed.searchParams.get("v") ?? "";
    }
  } catch {
    // Not a parseable URL; the pattern fallback below handles it.
  }
  if (!videoId) {
    const match = url.match(/(?:(?:^|[?&])v=|youtu\.be\/)([^&?#/]+)/);
    videoId = match?.[1] ?? "";
  }
  if (!videoId) {
    throw new Error(`Could not extract video ID from YouTube URL: ${url}`);
  }
  return videoId;
}
|
|
597
|
+
/**
 * Format an offset as "MM:SS", both parts zero-padded to two digits.
 * Minutes are not wrapped into hours, so long offsets give e.g. "75:30".
 *
 * NOTE(review): treats the value as milliseconds — confirm that the
 * transcript library in use reports segment offsets in ms.
 *
 * @param {number} offsetMs - Offset from the start.
 * @returns {string}
 */
function formatOffset(offsetMs) {
  const twoDigits = (value) => String(value).padStart(2, "0");
  const wholeMinutes = Math.floor(offsetMs / MS_PER_MINUTE);
  const leftoverMs = offsetMs % MS_PER_MINUTE;
  const wholeSeconds = Math.floor(leftoverMs / MS_PER_SECOND);
  return `${twoDigits(wholeMinutes)}:${twoDigits(wholeSeconds)}`;
}
|
|
602
|
+
/**
 * Download a YouTube transcript and render it as one "[MM:SS] text" line per
 * segment.
 *
 * @param {string} url - YouTube video URL.
 * @returns {Promise<{title: string, content: string}>} The title is
 *   "YouTube Transcript <videoId>".
 * @throws {Error} When no id can be extracted or no segments are returned.
 */
async function fetchYoutubeTranscript(url) {
  const videoId = extractVideoId(url);
  const segments = await YoutubeTranscript.fetchTranscript(videoId);
  const hasSegments = Boolean(segments) && segments.length !== 0;
  if (!hasSegments) {
    throw new Error(`No transcript available for YouTube video: ${url}`);
  }
  const rendered = [];
  for (const seg of segments) {
    rendered.push(`[${formatOffset(seg.offset)}] ${seg.text}`);
  }
  return { title: `YouTube Transcript ${videoId}`, content: rendered.join("\n") };
}
|
|
614
|
+
/**
 * Tell whether a trimmed line is a cue timing line: it must contain the
 * "-->" arrow and match TIMESTAMP_PATTERN.
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {boolean}
 */
function isCueTimestamp(trimmed) {
  if (!TIMESTAMP_PATTERN.test(trimmed)) {
    return false;
  }
  return trimmed.includes("-->");
}
|
|
617
|
+
/**
 * Convert WebVTT text to markdown.
 *
 * Each cue timing line becomes a bold "**[...]**" heading preceded by a blank
 * line, followed by the cue's text lines. Lines outside a cue are dropped. A
 * blank line or a line that is exactly "WEBVTT" ends the current cue.
 *
 * @param {string} raw - File contents.
 * @param {string} filePath - Path used to derive the title.
 * @returns {{title: string, content: string}}
 */
function parseVtt(raw, filePath) {
  const collected = [];
  let insideCue = false;
  for (const rawLine of raw.split("\n")) {
    const text = rawLine.trim();
    const endsCue = text === "" || text === "WEBVTT";
    if (endsCue) {
      insideCue = false;
    } else if (isCueTimestamp(text)) {
      collected.push(`\n**[${text}]**`);
      insideCue = true;
    } else if (insideCue) {
      collected.push(text);
    }
  }
  const content = collected.join("\n").trim();
  return { title: titleFromFilename(filePath), content };
}
|
|
639
|
+
/**
 * Convert SubRip (.srt) text to markdown.
 *
 * Each cue timing line becomes a bold "**[...]**" heading preceded by a blank
 * line, followed by the cue's text lines. Blank lines and cue sequence
 * numbers are dropped.
 *
 * A digits-only line counts as a sequence number only when the next line is
 * a cue timing line; digits-only caption text (e.g. "42") is therefore kept
 * instead of being silently removed.
 *
 * @param {string} raw - File contents.
 * @param {string} filePath - Path used to derive the title.
 * @returns {{title: string, content: string}}
 */
function parseSrt(raw, filePath) {
  const lines = raw.split("\n").map((line) => line.trim());
  const output = [];
  for (let index = 0; index < lines.length; index += 1) {
    const trimmed = lines[index];
    if (trimmed === "") {
      continue;
    }
    const nextLine = lines[index + 1] ?? "";
    if (SRT_SEQUENCE_PATTERN.test(trimmed) && isCueTimestamp(nextLine)) {
      continue;
    }
    if (isCueTimestamp(trimmed)) {
      output.push(`\n**[${trimmed}]**`);
      continue;
    }
    output.push(trimmed);
  }
  return { title: titleFromFilename(filePath), content: output.join("\n").trim() };
}
|
|
658
|
+
/**
 * Treat a plain-text transcript as-is: the content is the trimmed file text
 * and the title is derived from the file name.
 * @param {string} raw - File contents.
 * @param {string} filePath - Path used to derive the title.
 * @returns {{title: string, content: string}}
 */
function parsePlainTranscript(raw, filePath) {
  const title = titleFromFilename(filePath);
  const content = raw.trim();
  return { title, content };
}
|
|
661
|
+
/**
 * Ingest a transcript from a YouTube URL or a local .vtt/.srt/.txt file.
 *
 * Local files are read as UTF-8 before the extension is dispatched, so a
 * missing file fails with the read error rather than the type error.
 *
 * @param {string} source2 - YouTube URL or file path.
 * @returns {Promise<{title: string, content: string}>}
 * @throws {Error} For local files with any other extension.
 */
async function ingestTranscript(source2) {
  if (isYoutubeUrl(source2)) {
    return fetchYoutubeTranscript(source2);
  }
  const ext = path6.extname(source2).toLowerCase();
  const raw = await readFile5(source2, "utf-8");
  switch (ext) {
    case ".vtt":
      return parseVtt(raw, source2);
    case ".srt":
      return parseSrt(raw, source2);
    case ".txt":
      return parsePlainTranscript(raw, source2);
    default:
      throw new Error(
        `Unsupported transcript file type "${ext}". Supported: .vtt, .srt, .txt`
      );
  }
}
|
|
674
|
+
|
|
264
675
|
// src/commands/ingest.ts
|
|
265
676
|
/**
 * Tell whether a source string starts with "http://" or "https://"
 * (case-sensitive).
 * @param {string} source2
 * @returns {boolean}
 */
function isUrl(source2) {
  return ["http://", "https://"].some((scheme) => source2.startsWith(scheme));
}
|
|
679
|
+
var TXT_SNIFF_BYTES = 2048;
|
|
680
|
+
var SPEAKER_TAG_PATTERN = /^([A-Z][a-zA-Z .'-]{0,40}):\s/gm;
|
|
681
|
+
var TIMESTAMP_PATTERN2 = /^\s*\d{1,2}:\d{2}(:\d{2})?/;
|
|
682
|
+
var MIN_TIMESTAMP_MATCHES = 3;
|
|
683
|
+
var MIN_SPEAKER_REPEAT_COUNT = 2;
|
|
684
|
+
var MIN_DISTINCT_SPEAKERS = 2;
|
|
685
|
+
/**
 * Count how many times each speaker tag (as matched by SPEAKER_TAG_PATTERN)
 * appears in a text sample.
 *
 * @param {string} sample - Text to scan.
 * @returns {Map<string, number>} Trimmed speaker name -> occurrence count.
 */
function countSpeakerOccurrences(sample) {
  const tally = /* @__PURE__ */ new Map();
  // The pattern is a shared global regex; start scanning from the beginning.
  SPEAKER_TAG_PATTERN.lastIndex = 0;
  for (const found of sample.matchAll(SPEAKER_TAG_PATTERN)) {
    const speaker = found[1].trim();
    const seenSoFar = tally.get(speaker) ?? 0;
    tally.set(speaker, seenSoFar + 1);
  }
  return tally;
}
|
|
695
|
+
/**
 * Tell whether a sample looks like speaker-tagged dialogue: at least
 * MIN_DISTINCT_SPEAKERS different speakers, and at least one of them seen
 * MIN_SPEAKER_REPEAT_COUNT times or more.
 *
 * @param {string} sample - Text to inspect.
 * @returns {boolean}
 */
function hasSpeakerDialoguePattern(sample) {
  const tally = countSpeakerOccurrences(sample);
  if (tally.size < MIN_DISTINCT_SPEAKERS) {
    return false;
  }
  for (const occurrences of tally.values()) {
    if (occurrences >= MIN_SPEAKER_REPEAT_COUNT) {
      return true;
    }
  }
  return false;
}
|
|
704
|
+
/**
 * Sniff a .txt file to decide whether it is a transcript.
 *
 * The whole file is read as UTF-8 and the first TXT_SNIFF_BYTES characters
 * are inspected. The sample counts as a transcript when it shows a speaker
 * dialogue pattern, or when at least MIN_TIMESTAMP_MATCHES lines start with a
 * timestamp.
 *
 * @param {string} filePath - Path of the text file.
 * @returns {Promise<boolean>}
 */
async function looksLikeTxtTranscript(filePath) {
  const contents = await readFile6(filePath, "utf-8");
  const head = contents.slice(0, TXT_SNIFF_BYTES);
  if (hasSpeakerDialoguePattern(head)) {
    return true;
  }
  // Rebuild the pattern with g+m flags so every line start is counted.
  const perLineTimestamp = new RegExp(TIMESTAMP_PATTERN2.source, "gm");
  const hits = head.match(perLineTimestamp);
  const hitCount = hits === null ? 0 : hits.length;
  return hitCount >= MIN_TIMESTAMP_MATCHES;
}
|
|
268
711
|
function enforceCharLimit(content) {
|
|
269
712
|
if (content.length <= MAX_SOURCE_CHARS) {
|
|
270
713
|
return { content, truncated: false, originalChars: content.length };
|
|
@@ -297,12 +740,30 @@ function enforceMinContent(content) {
|
|
|
297
740
|
);
|
|
298
741
|
}
|
|
299
742
|
}
|
|
300
|
-
function
|
|
743
|
+
/**
 * Classify an ingest source.
 *
 * URLs are "transcript" when they are YouTube links and "web" otherwise.
 * Local paths are classified by lower-cased extension: ".pdf" -> "pdf", image
 * extensions -> "image", transcript extensions -> "transcript"; ".txt" files
 * are sniffed and become "transcript" or "file"; anything else is "file".
 *
 * @param {string} source2 - URL or file path.
 * @returns {Promise<"web" | "pdf" | "image" | "transcript" | "file">}
 */
async function detectSourceType(source2) {
  if (isUrl(source2)) {
    return isYoutubeUrl(source2) ? "transcript" : "web";
  }
  const ext = path7.extname(source2).toLowerCase();
  if (ext === ".pdf") {
    return "pdf";
  }
  if (IMAGE_EXTENSIONS.has(ext)) {
    return "image";
  }
  if (TRANSCRIPT_EXTENSIONS.has(ext)) {
    return "transcript";
  }
  if (ext !== ".txt") {
    return "file";
  }
  const sniffedAsTranscript = await looksLikeTxtTranscript(source2);
  return sniffedAsTranscript ? "transcript" : "file";
}
|
|
758
|
+
function buildDocument(title, source2, result, sourceType) {
|
|
301
759
|
const meta = {
|
|
302
760
|
title,
|
|
303
761
|
source: source2,
|
|
304
762
|
ingestedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
305
763
|
};
|
|
764
|
+
if (sourceType !== void 0) {
|
|
765
|
+
meta.sourceType = sourceType;
|
|
766
|
+
}
|
|
306
767
|
if (result.truncated) {
|
|
307
768
|
meta.truncated = true;
|
|
308
769
|
meta.originalChars = result.originalChars;
|
|
@@ -313,30 +774,46 @@ function buildDocument(title, source2, result) {
|
|
|
313
774
|
${result.content}
|
|
314
775
|
`;
|
|
315
776
|
}
|
|
777
|
+
/**
 * Fetch the title and content of a source using the ingester that matches
 * its detected type.
 *
 * @param {string} source2 - URL or file path.
 * @param {"web" | "pdf" | "image" | "transcript" | "file"} sourceType
 * @returns {Promise<{title: string, content: string}>}
 * @throws {Error} When `sourceType` is not one of the known types; without
 *   this the function would resolve to undefined and the caller would fail
 *   later with an unrelated destructuring error.
 */
async function fetchContent(source2, sourceType) {
  switch (sourceType) {
    case "web":
      return ingestWeb(source2);
    case "pdf":
      return ingestPdf(source2);
    case "image":
      return ingestImage(source2);
    case "transcript":
      return ingestTranscript(source2);
    case "file":
      return ingestFile(source2);
    default:
      throw new Error(`Unsupported source type "${sourceType}".`);
  }
}
|
|
316
791
|
async function saveSource(title, document) {
|
|
317
792
|
const filename = `${slugify(title)}.md`;
|
|
318
|
-
const destPath =
|
|
793
|
+
const destPath = path7.join(SOURCES_DIR, filename);
|
|
319
794
|
await mkdir2(SOURCES_DIR, { recursive: true });
|
|
320
795
|
await writeFile2(destPath, document, "utf-8");
|
|
321
796
|
return destPath;
|
|
322
797
|
}
|
|
323
798
|
async function ingestSource(source2) {
|
|
324
|
-
|
|
325
|
-
|
|
799
|
+
const sourceType = await detectSourceType(source2);
|
|
800
|
+
status("*", info(`Ingesting [${sourceType}]: ${source2}`));
|
|
801
|
+
const { title, content } = await fetchContent(source2, sourceType);
|
|
326
802
|
const result = enforceCharLimit(content);
|
|
327
803
|
enforceMinContent(result.content);
|
|
328
|
-
const document = buildDocument(title, source2, result);
|
|
804
|
+
const document = buildDocument(title, source2, result, sourceType);
|
|
329
805
|
const savedPath = await saveSource(title, document);
|
|
330
806
|
return {
|
|
331
|
-
filename:
|
|
807
|
+
filename: path7.basename(savedPath),
|
|
332
808
|
charCount: result.content.length,
|
|
333
809
|
truncated: result.truncated,
|
|
334
|
-
source: source2
|
|
810
|
+
source: source2,
|
|
811
|
+
sourceType
|
|
335
812
|
};
|
|
336
813
|
}
|
|
337
814
|
async function ingest(source2) {
|
|
338
815
|
const result = await ingestSource(source2);
|
|
339
|
-
const savedPath =
|
|
816
|
+
const savedPath = path7.join(SOURCES_DIR, result.filename);
|
|
340
817
|
status(
|
|
341
818
|
"+",
|
|
342
819
|
success(`Saved ${bold(result.filename)} \u2192 ${source(savedPath)}`)
|
|
@@ -348,23 +825,23 @@ async function ingest(source2) {
|
|
|
348
825
|
import { existsSync as existsSync7 } from "fs";
|
|
349
826
|
|
|
350
827
|
// src/compiler/index.ts
|
|
351
|
-
import { readFile as
|
|
352
|
-
import
|
|
828
|
+
import { readFile as readFile14 } from "fs/promises";
|
|
829
|
+
import path21 from "path";
|
|
353
830
|
|
|
354
831
|
// src/utils/state.ts
|
|
355
|
-
import { readFile as
|
|
832
|
+
import { readFile as readFile7, writeFile as writeFile3, rename as rename2, mkdir as mkdir3, copyFile } from "fs/promises";
|
|
356
833
|
import { existsSync } from "fs";
|
|
357
|
-
import
|
|
834
|
+
import path8 from "path";
|
|
358
835
|
/**
 * Create a fresh compiler state: version 1, empty index hash, no sources.
 * A new object is returned on every call.
 * @returns {{version: number, indexHash: string, sources: object}}
 */
function emptyState() {
  const state = {
    version: 1,
    indexHash: "",
    sources: {}
  };
  return state;
}
|
|
361
838
|
async function readState(root) {
|
|
362
|
-
const filePath =
|
|
839
|
+
const filePath = path8.join(root, STATE_FILE);
|
|
363
840
|
if (!existsSync(filePath)) {
|
|
364
841
|
return emptyState();
|
|
365
842
|
}
|
|
366
843
|
try {
|
|
367
|
-
const raw = await
|
|
844
|
+
const raw = await readFile7(filePath, "utf-8");
|
|
368
845
|
return JSON.parse(raw);
|
|
369
846
|
} catch {
|
|
370
847
|
const bakPath = filePath + ".bak";
|
|
@@ -374,9 +851,9 @@ async function readState(root) {
|
|
|
374
851
|
}
|
|
375
852
|
}
|
|
376
853
|
async function writeState(root, state) {
|
|
377
|
-
const dir =
|
|
854
|
+
const dir = path8.join(root, LLMWIKI_DIR);
|
|
378
855
|
await mkdir3(dir, { recursive: true });
|
|
379
|
-
const filePath =
|
|
856
|
+
const filePath = path8.join(root, STATE_FILE);
|
|
380
857
|
const tmpPath = filePath + ".tmp";
|
|
381
858
|
await writeFile3(tmpPath, JSON.stringify(state, null, 2), "utf-8");
|
|
382
859
|
await rename2(tmpPath, filePath);
|
|
@@ -393,18 +870,18 @@ async function removeSourceState(root, sourceFile) {
|
|
|
393
870
|
}
|
|
394
871
|
|
|
395
872
|
// src/compiler/source-state.ts
|
|
396
|
-
import
|
|
873
|
+
import path10 from "path";
|
|
397
874
|
|
|
398
875
|
// src/compiler/hasher.ts
|
|
399
876
|
import { createHash } from "crypto";
|
|
400
|
-
import { readFile as
|
|
401
|
-
import
|
|
877
|
+
import { readFile as readFile8, readdir } from "fs/promises";
|
|
878
|
+
import path9 from "path";
|
|
402
879
|
async function hashFile(filePath) {
|
|
403
|
-
const content = await
|
|
880
|
+
const content = await readFile8(filePath, "utf-8");
|
|
404
881
|
return createHash("sha256").update(content).digest("hex");
|
|
405
882
|
}
|
|
406
883
|
async function detectChanges(root, prevState) {
|
|
407
|
-
const sourcesPath =
|
|
884
|
+
const sourcesPath = path9.join(root, SOURCES_DIR);
|
|
408
885
|
const currentFiles = await listSourceFiles(sourcesPath);
|
|
409
886
|
const changes = [];
|
|
410
887
|
for (const file of currentFiles) {
|
|
@@ -424,7 +901,7 @@ async function listSourceFiles(sourcesPath) {
|
|
|
424
901
|
}
|
|
425
902
|
}
|
|
426
903
|
async function classifyFile(root, file, prevState) {
|
|
427
|
-
const filePath =
|
|
904
|
+
const filePath = path9.join(root, SOURCES_DIR, file);
|
|
428
905
|
const hash = await hashFile(filePath);
|
|
429
906
|
const prev = prevState.sources[file];
|
|
430
907
|
if (!prev) return "new";
|
|
@@ -447,133 +924,22 @@ async function buildExtractionSourceStates(root, extractions) {
|
|
|
447
924
|
return snapshot;
|
|
448
925
|
}
|
|
449
926
|
async function buildEntry(root, result, compiledAt) {
|
|
450
|
-
const filePath =
|
|
927
|
+
const filePath = path10.join(root, SOURCES_DIR, result.sourceFile);
|
|
451
928
|
const hash = await hashFile(filePath);
|
|
452
|
-
return {
|
|
453
|
-
hash,
|
|
454
|
-
concepts: result.concepts.map((concept) => slugify(concept.concept)),
|
|
455
|
-
compiledAt
|
|
456
|
-
};
|
|
457
|
-
}
|
|
458
|
-
function pickStatesForSources(allStates, sourceFiles) {
|
|
459
|
-
const picked = {};
|
|
460
|
-
for (const file of sourceFiles) {
|
|
461
|
-
const entry = allStates[file];
|
|
462
|
-
if (entry) picked[file] = entry;
|
|
463
|
-
}
|
|
464
|
-
return picked;
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
// src/providers/anthropic.ts
|
|
468
|
-
import Anthropic from "@anthropic-ai/sdk";
|
|
469
|
-
var VOYAGE_EMBEDDINGS_URL = "https://api.voyageai.com/v1/embeddings";
|
|
470
|
-
function buildAnthropicClientOptions(options = {}) {
|
|
471
|
-
const trimmedBaseURL = options.baseURL?.trim();
|
|
472
|
-
const trimmedApiKey = options.apiKey?.trim();
|
|
473
|
-
const trimmedAuthToken = options.authToken?.trim();
|
|
474
|
-
const result = {};
|
|
475
|
-
if (trimmedApiKey) {
|
|
476
|
-
result.apiKey = trimmedApiKey;
|
|
477
|
-
}
|
|
478
|
-
if (trimmedAuthToken) {
|
|
479
|
-
result.authToken = trimmedAuthToken;
|
|
480
|
-
}
|
|
481
|
-
if (!trimmedBaseURL) {
|
|
482
|
-
return result;
|
|
483
|
-
}
|
|
484
|
-
const normalizedBaseURL = trimmedBaseURL.endsWith("/") && trimmedBaseURL.length > 1 ? trimmedBaseURL.slice(0, -1) : trimmedBaseURL;
|
|
485
|
-
result.baseURL = normalizedBaseURL;
|
|
486
|
-
return result;
|
|
487
|
-
}
|
|
488
|
-
var AnthropicProvider = class {
|
|
489
|
-
client;
|
|
490
|
-
model;
|
|
491
|
-
constructor(model, options = {}) {
|
|
492
|
-
this.model = model;
|
|
493
|
-
this.client = new Anthropic(buildAnthropicClientOptions(options));
|
|
494
|
-
}
|
|
495
|
-
/** Send a single non-streaming completion request. */
|
|
496
|
-
async complete(system, messages, maxTokens) {
|
|
497
|
-
const response = await this.client.messages.create({
|
|
498
|
-
model: this.model,
|
|
499
|
-
max_tokens: maxTokens,
|
|
500
|
-
system,
|
|
501
|
-
messages
|
|
502
|
-
});
|
|
503
|
-
const textBlock = response.content.find((block) => block.type === "text");
|
|
504
|
-
return textBlock?.type === "text" ? textBlock.text : "";
|
|
505
|
-
}
|
|
506
|
-
/** Stream a completion, invoking onToken for each text chunk. */
|
|
507
|
-
async stream(system, messages, maxTokens, onToken) {
|
|
508
|
-
const stream = this.client.messages.stream({
|
|
509
|
-
model: this.model,
|
|
510
|
-
max_tokens: maxTokens,
|
|
511
|
-
system,
|
|
512
|
-
messages
|
|
513
|
-
});
|
|
514
|
-
let fullText = "";
|
|
515
|
-
for await (const event of stream) {
|
|
516
|
-
if (event.type === "content_block_delta" && event.delta.type === "text_delta") {
|
|
517
|
-
fullText += event.delta.text;
|
|
518
|
-
onToken?.(event.delta.text);
|
|
519
|
-
}
|
|
520
|
-
}
|
|
521
|
-
return fullText;
|
|
522
|
-
}
|
|
523
|
-
/** Call Claude with tool definitions and return the parsed tool input as JSON. */
|
|
524
|
-
async toolCall(system, messages, tools, maxTokens) {
|
|
525
|
-
const anthropicTools = tools.map((t) => ({
|
|
526
|
-
name: t.name,
|
|
527
|
-
description: t.description,
|
|
528
|
-
input_schema: t.input_schema
|
|
529
|
-
}));
|
|
530
|
-
const response = await this.client.messages.create({
|
|
531
|
-
model: this.model,
|
|
532
|
-
max_tokens: maxTokens,
|
|
533
|
-
system,
|
|
534
|
-
messages,
|
|
535
|
-
tools: anthropicTools
|
|
536
|
-
});
|
|
537
|
-
const toolBlock = response.content.find((block) => block.type === "tool_use");
|
|
538
|
-
if (toolBlock?.type === "tool_use") {
|
|
539
|
-
return JSON.stringify(toolBlock.input);
|
|
540
|
-
}
|
|
541
|
-
const textBlock = response.content.find((block) => block.type === "text");
|
|
542
|
-
return textBlock?.type === "text" ? textBlock.text : "";
|
|
543
|
-
}
|
|
544
|
-
/**
|
|
545
|
-
* Produce a single embedding vector via the Voyage API.
|
|
546
|
-
*
|
|
547
|
-
* Anthropic does not ship a first-party embeddings endpoint, so we delegate
|
|
548
|
-
* to Voyage (their recommended partner). Requires VOYAGE_API_KEY.
|
|
549
|
-
*/
|
|
550
|
-
async embed(text) {
|
|
551
|
-
const apiKey = process.env.VOYAGE_API_KEY?.trim();
|
|
552
|
-
if (!apiKey) {
|
|
553
|
-
throw new Error(
|
|
554
|
-
"VOYAGE_API_KEY is not set. Anthropic embeddings use Voyage \u2014 set VOYAGE_API_KEY to enable semantic search."
|
|
555
|
-
);
|
|
556
|
-
}
|
|
557
|
-
const response = await fetch(VOYAGE_EMBEDDINGS_URL, {
|
|
558
|
-
method: "POST",
|
|
559
|
-
headers: {
|
|
560
|
-
"Content-Type": "application/json",
|
|
561
|
-
Authorization: `Bearer ${apiKey}`
|
|
562
|
-
},
|
|
563
|
-
body: JSON.stringify({ input: text, model: EMBEDDING_MODELS.anthropic })
|
|
564
|
-
});
|
|
565
|
-
if (!response.ok) {
|
|
566
|
-
const detail = await response.text();
|
|
567
|
-
throw new Error(`Voyage embeddings request failed (${response.status}): ${detail}`);
|
|
568
|
-
}
|
|
569
|
-
const json = await response.json();
|
|
570
|
-
const vector = json.data?.[0]?.embedding;
|
|
571
|
-
if (!Array.isArray(vector)) {
|
|
572
|
-
throw new Error("Voyage embeddings response did not include a vector.");
|
|
573
|
-
}
|
|
574
|
-
return vector;
|
|
929
|
+
return {
|
|
930
|
+
hash,
|
|
931
|
+
concepts: result.concepts.map((concept) => slugify(concept.concept)),
|
|
932
|
+
compiledAt
|
|
933
|
+
};
|
|
934
|
+
}
|
|
935
|
+
/**
 * Select the state entries that belong to the given source files.
 *
 * Only own, truthy entries of `allStates` are copied. A source name that
 * merely matches an inherited Object.prototype member ("constructor",
 * "toString", ...) is ignored instead of leaking that member into the result.
 *
 * @param allStates - plain object mapping source file name to its state entry
 * @param sourceFiles - iterable of source file names to look up
 * @returns a new object holding just the matching entries
 */
function pickStatesForSources(allStates, sourceFiles) {
  const picked = {};
  for (const file of sourceFiles) {
    // Guard against prototype members being mistaken for stored state.
    if (!Object.prototype.hasOwnProperty.call(allStates, file)) continue;
    const entry = allStates[file];
    if (entry) picked[file] = entry;
  }
  return picked;
}
|
|
577
943
|
|
|
578
944
|
// src/providers/openai.ts
|
|
579
945
|
import OpenAI from "openai";
|
|
@@ -704,101 +1070,6 @@ var MiniMaxProvider = class extends OpenAIProvider {
|
|
|
704
1070
|
}
|
|
705
1071
|
};
|
|
706
1072
|
|
|
707
|
-
// src/utils/claude-settings.ts
|
|
708
|
-
import { readFileSync } from "fs";
|
|
709
|
-
import { homedir } from "os";
|
|
710
|
-
import path7 from "path";
|
|
711
|
-
var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
|
|
712
|
-
/**
 * Report whether `value` is a non-null value of type "object".
 * Arrays also satisfy this check.
 */
function isRecord(value) {
  if (value === null) return false;
  return typeof value === "object";
}
|
|
715
|
-
/**
 * Trim a string setting.
 *
 * @param value - candidate value of any type
 * @returns the trimmed string, or undefined when `value` is not a string or
 *   is empty after trimming
 */
function normalize(value) {
  if (typeof value === "string") {
    const trimmed = value.trim();
    if (trimmed.length > 0) return trimmed;
  }
  return undefined;
}
|
|
720
|
-
/**
 * Locate the Claude settings file.
 *
 * @param env - environment-style object
 * @returns the value stored under the CLAUDE_SETTINGS_PATH_ENV key when it is
 *   neither null nor undefined, otherwise `<home>/.claude/settings.json`
 */
function resolveClaudeSettingsPath(env) {
  const override = env[CLAUDE_SETTINGS_PATH_ENV];
  if (override !== null && override !== undefined) return override;
  return path7.join(homedir(), ".claude", "settings.json");
}
|
|
723
|
-
/**
 * Read the settings file as UTF-8 text.
 *
 * @param settingsPath - location passed straight to readFileSync
 * @returns the file contents, or undefined when the read fails with ENOENT
 * @throws {Error} for any other read failure, wrapped with the path and reason
 */
function readClaudeSettingsFile(settingsPath) {
  let contents;
  try {
    contents = readFileSync(settingsPath, "utf8");
  } catch (err) {
    // A missing file is an expected condition, not an error.
    const isMissing = isRecord(err) && err.code === "ENOENT";
    if (isMissing) return undefined;
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to read Claude settings at "${settingsPath}": ${reason}`);
  }
  return contents;
}
|
|
734
|
-
/**
 * Load the Anthropic-related values from the `env` section of the Claude
 * settings file.
 *
 * @param env - environment-style object used to locate the settings file
 * @returns an object with ANTHROPIC_API_KEY, ANTHROPIC_AUTH_TOKEN,
 *   ANTHROPIC_BASE_URL and ANTHROPIC_MODEL (each a trimmed string or
 *   undefined), or undefined when the file is absent/empty, has no object
 *   `env` section, or none of the four values is set
 * @throws {Error} when the file cannot be read or is not valid JSON
 */
function readClaudeSettingsEnv(env = process.env) {
  const settingsPath = resolveClaudeSettingsPath(env);
  const raw = readClaudeSettingsFile(settingsPath);
  if (!raw) return undefined;

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${reason}`);
  }

  const hasEnvSection = isRecord(parsed) && isRecord(parsed.env);
  if (!hasEnvSection) return undefined;

  const keys = ["ANTHROPIC_API_KEY", "ANTHROPIC_AUTH_TOKEN", "ANTHROPIC_BASE_URL", "ANTHROPIC_MODEL"];
  const values = {};
  for (const key of keys) {
    values[key] = normalize(parsed.env[key]);
  }
  // A settings file that sets none of the keys is treated like no file at all.
  const hasAny = keys.some((key) => values[key]);
  return hasAny ? values : undefined;
}
|
|
759
|
-
/**
 * Best-effort variant of readClaudeSettingsEnv: any error it throws is
 * swallowed and reported as undefined.
 *
 * @param env - environment-style object forwarded to readClaudeSettingsEnv
 */
function tryReadClaudeSettingsEnv(env) {
  let settings;
  try {
    settings = readClaudeSettingsEnv(env);
  } catch {
    // Deliberately ignored: callers treat unreadable settings as "not set".
    settings = undefined;
  }
  return settings;
}
|
|
766
|
-
function validateAnthropicBaseURL(value) {
|
|
767
|
-
const normalized = value.trim();
|
|
768
|
-
try {
|
|
769
|
-
const parsed = new URL(normalized);
|
|
770
|
-
if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
|
|
771
|
-
throw new Error("Must use http:// or https:// protocol.");
|
|
772
|
-
}
|
|
773
|
-
} catch (err) {
|
|
774
|
-
const message = err instanceof Error ? err.message : "Must be a valid http(s) URL.";
|
|
775
|
-
throw new Error(`Invalid ANTHROPIC_BASE_URL: "${normalized}". ${message}`);
|
|
776
|
-
}
|
|
777
|
-
return normalized;
|
|
778
|
-
}
|
|
779
|
-
/**
 * Pick Anthropic credentials, preferring explicit environment values over the
 * Claude settings file.
 *
 * Order of precedence: env ANTHROPIC_API_KEY, env ANTHROPIC_AUTH_TOKEN, then
 * the same two keys from readClaudeSettingsEnv.
 *
 * @param env - environment-style object
 * @returns `{ apiKey }`, `{ authToken }`, or `{}` when nothing is configured
 */
function resolveAnthropicAuthFromEnv(env = process.env) {
  const apiKey = normalize(env.ANTHROPIC_API_KEY);
  if (apiKey) return { apiKey };

  const authToken = normalize(env.ANTHROPIC_AUTH_TOKEN);
  if (authToken) return { authToken };

  // Nothing explicit: fall back to the settings file (errors propagate).
  const settings = readClaudeSettingsEnv(env);
  if (settings?.ANTHROPIC_API_KEY) {
    return { apiKey: settings.ANTHROPIC_API_KEY };
  }
  if (settings?.ANTHROPIC_AUTH_TOKEN) {
    return { authToken: settings.ANTHROPIC_AUTH_TOKEN };
  }
  return {};
}
|
|
789
|
-
/**
 * Resolve the model name.
 *
 * @param env - environment-style object
 * @returns `env.LLMWIKI_MODEL` whenever it is defined (even if empty),
 *   otherwise the ANTHROPIC_MODEL value from tryReadClaudeSettingsEnv, which
 *   may be undefined
 */
function resolveAnthropicModelFromEnv(env = process.env) {
  const { LLMWIKI_MODEL: explicitModel } = env;
  if (explicitModel === undefined) {
    return tryReadClaudeSettingsEnv(env)?.ANTHROPIC_MODEL;
  }
  return explicitModel;
}
|
|
794
|
-
/**
 * Resolve and validate the Anthropic base URL.
 *
 * The env value ANTHROPIC_BASE_URL wins; otherwise the value from
 * tryReadClaudeSettingsEnv is used.
 *
 * @param env - environment-style object
 * @returns the validated URL, or undefined when neither source provides one
 * @throws {Error} from validateAnthropicBaseURL when the chosen value is invalid
 */
function resolveAnthropicBaseURLFromEnv(env = process.env) {
  const baseURL =
    normalize(env.ANTHROPIC_BASE_URL) || tryReadClaudeSettingsEnv(env)?.ANTHROPIC_BASE_URL;
  return baseURL ? validateAnthropicBaseURL(baseURL) : undefined;
}
|
|
801
|
-
|
|
802
1073
|
// src/utils/provider.ts
|
|
803
1074
|
var SUPPORTED_PROVIDERS = /* @__PURE__ */ new Set(["anthropic", "openai", "ollama", "minimax"]);
|
|
804
1075
|
function getProvider() {
|
|
@@ -891,8 +1162,8 @@ async function callClaude(options) {
|
|
|
891
1162
|
}
|
|
892
1163
|
|
|
893
1164
|
// src/utils/lock.ts
|
|
894
|
-
import { open, readFile as
|
|
895
|
-
import
|
|
1165
|
+
import { open, readFile as readFile9, unlink, mkdir as mkdir4 } from "fs/promises";
|
|
1166
|
+
import path11 from "path";
|
|
896
1167
|
var RECLAIM_SUFFIX = ".reclaim";
|
|
897
1168
|
var MAX_ACQUIRE_ATTEMPTS = 2;
|
|
898
1169
|
function isProcessAlive(pid) {
|
|
@@ -904,8 +1175,8 @@ function isProcessAlive(pid) {
|
|
|
904
1175
|
}
|
|
905
1176
|
}
|
|
906
1177
|
async function acquireLock(root) {
|
|
907
|
-
const lockPath =
|
|
908
|
-
await mkdir4(
|
|
1178
|
+
const lockPath = path11.join(root, LOCK_FILE);
|
|
1179
|
+
await mkdir4(path11.join(root, LLMWIKI_DIR), { recursive: true });
|
|
909
1180
|
for (let attempt = 0; attempt < MAX_ACQUIRE_ATTEMPTS; attempt++) {
|
|
910
1181
|
const created = await tryCreateLock(lockPath);
|
|
911
1182
|
if (created) return true;
|
|
@@ -968,7 +1239,7 @@ async function tryCreateLock(lockPath) {
|
|
|
968
1239
|
}
|
|
969
1240
|
async function isLockStale(lockPath) {
|
|
970
1241
|
try {
|
|
971
|
-
const content = await
|
|
1242
|
+
const content = await readFile9(lockPath, "utf-8");
|
|
972
1243
|
const pid = parseInt(content.trim(), 10);
|
|
973
1244
|
if (isNaN(pid)) return true;
|
|
974
1245
|
return !isProcessAlive(pid);
|
|
@@ -977,7 +1248,7 @@ async function isLockStale(lockPath) {
|
|
|
977
1248
|
}
|
|
978
1249
|
}
|
|
979
1250
|
async function releaseLock(root) {
|
|
980
|
-
const lockPath =
|
|
1251
|
+
const lockPath = path11.join(root, LOCK_FILE);
|
|
981
1252
|
try {
|
|
982
1253
|
await unlink(lockPath);
|
|
983
1254
|
} catch {
|
|
@@ -1220,8 +1491,8 @@ function buildDefaultSchema() {
|
|
|
1220
1491
|
|
|
1221
1492
|
// src/schema/loader.ts
|
|
1222
1493
|
import { existsSync as existsSync2 } from "fs";
|
|
1223
|
-
import { readFile as
|
|
1224
|
-
import
|
|
1494
|
+
import { readFile as readFile10 } from "fs/promises";
|
|
1495
|
+
import path12 from "path";
|
|
1225
1496
|
import yaml2 from "js-yaml";
|
|
1226
1497
|
var SCHEMA_CANDIDATE_PATHS = [
|
|
1227
1498
|
".llmwiki/schema.json",
|
|
@@ -1232,7 +1503,7 @@ var SCHEMA_CANDIDATE_PATHS = [
|
|
|
1232
1503
|
];
|
|
1233
1504
|
function findSchemaPath(root) {
|
|
1234
1505
|
for (const candidate of SCHEMA_CANDIDATE_PATHS) {
|
|
1235
|
-
const absolute =
|
|
1506
|
+
const absolute = path12.join(root, candidate);
|
|
1236
1507
|
if (existsSync2(absolute)) return absolute;
|
|
1237
1508
|
}
|
|
1238
1509
|
return null;
|
|
@@ -1285,12 +1556,12 @@ async function loadSchema(root) {
|
|
|
1285
1556
|
const defaults = buildDefaultSchema();
|
|
1286
1557
|
const schemaPath = findSchemaPath(root);
|
|
1287
1558
|
if (!schemaPath) return defaults;
|
|
1288
|
-
const raw = await
|
|
1559
|
+
const raw = await readFile10(schemaPath, "utf-8");
|
|
1289
1560
|
const parsed = parseSchemaFile(schemaPath, raw);
|
|
1290
1561
|
return applyOverrides(defaults, parsed, schemaPath);
|
|
1291
1562
|
}
|
|
1292
1563
|
function defaultSchemaInitPath(root) {
|
|
1293
|
-
return
|
|
1564
|
+
return path12.join(root, SCHEMA_CANDIDATE_PATHS[0]);
|
|
1294
1565
|
}
|
|
1295
1566
|
|
|
1296
1567
|
// src/schema/helpers.ts
|
|
@@ -1462,7 +1733,7 @@ async function freezeFailedExtractions(root, results, frozenSlugs) {
|
|
|
1462
1733
|
}
|
|
1463
1734
|
|
|
1464
1735
|
// src/compiler/orphan.ts
|
|
1465
|
-
import
|
|
1736
|
+
import path13 from "path";
|
|
1466
1737
|
async function markOrphaned(root, sourceFile, state) {
|
|
1467
1738
|
const sourceEntry = state.sources[sourceFile];
|
|
1468
1739
|
if (!sourceEntry) return;
|
|
@@ -1488,7 +1759,7 @@ async function orphanUnownedFrozenPages(root, frozenSlugs) {
|
|
|
1488
1759
|
}
|
|
1489
1760
|
}
|
|
1490
1761
|
async function orphanPage(root, slug, reason) {
|
|
1491
|
-
const pagePath =
|
|
1762
|
+
const pagePath = path13.join(root, CONCEPTS_DIR, `${slug}.md`);
|
|
1492
1763
|
const content = await safeReadFile(pagePath);
|
|
1493
1764
|
if (!content) return;
|
|
1494
1765
|
const { meta } = parseFrontmatter(content);
|
|
@@ -1499,18 +1770,18 @@ async function orphanPage(root, slug, reason) {
|
|
|
1499
1770
|
}
|
|
1500
1771
|
|
|
1501
1772
|
// src/compiler/resolver.ts
|
|
1502
|
-
import { readdir as readdir2, readFile as
|
|
1503
|
-
import
|
|
1773
|
+
import { readdir as readdir2, readFile as readFile11 } from "fs/promises";
|
|
1774
|
+
import path14 from "path";
|
|
1504
1775
|
import { existsSync as existsSync3 } from "fs";
|
|
1505
1776
|
async function buildTitleIndex(root) {
|
|
1506
|
-
const conceptsDir =
|
|
1777
|
+
const conceptsDir = path14.join(root, CONCEPTS_DIR);
|
|
1507
1778
|
if (!existsSync3(conceptsDir)) return [];
|
|
1508
1779
|
const files = await readdir2(conceptsDir);
|
|
1509
1780
|
const pages = [];
|
|
1510
1781
|
for (const file of files) {
|
|
1511
1782
|
if (!file.endsWith(".md")) continue;
|
|
1512
|
-
const filePath =
|
|
1513
|
-
const content = await
|
|
1783
|
+
const filePath = path14.join(conceptsDir, file);
|
|
1784
|
+
const content = await readFile11(filePath, "utf-8");
|
|
1514
1785
|
const { meta } = parseFrontmatter(content);
|
|
1515
1786
|
if (meta.title && typeof meta.title === "string" && !meta.orphaned) {
|
|
1516
1787
|
pages.push({
|
|
@@ -1596,7 +1867,7 @@ async function resolveInboundLinks(titleIndex, newSlugs) {
|
|
|
1596
1867
|
let count = 0;
|
|
1597
1868
|
for (const page of titleIndex) {
|
|
1598
1869
|
if (newSlugs.includes(page.slug)) continue;
|
|
1599
|
-
const content = await
|
|
1870
|
+
const content = await readFile11(page.filePath, "utf-8");
|
|
1600
1871
|
const { body } = parseFrontmatter(content);
|
|
1601
1872
|
const linked = addWikilinks(body, newTitles, page.title);
|
|
1602
1873
|
if (linked !== body) {
|
|
@@ -1608,7 +1879,7 @@ async function resolveInboundLinks(titleIndex, newSlugs) {
|
|
|
1608
1879
|
return count;
|
|
1609
1880
|
}
|
|
1610
1881
|
async function linkPage(page, titleIndex) {
|
|
1611
|
-
const content = await
|
|
1882
|
+
const content = await readFile11(page.filePath, "utf-8");
|
|
1612
1883
|
const { body } = parseFrontmatter(content);
|
|
1613
1884
|
const linked = addWikilinks(body, titleIndex, page.title);
|
|
1614
1885
|
if (linked === body) return false;
|
|
@@ -1619,17 +1890,17 @@ async function linkPage(page, titleIndex) {
|
|
|
1619
1890
|
|
|
1620
1891
|
// src/compiler/indexgen.ts
|
|
1621
1892
|
import { readdir as readdir3 } from "fs/promises";
|
|
1622
|
-
import
|
|
1893
|
+
import path15 from "path";
|
|
1623
1894
|
async function generateIndex(root) {
|
|
1624
1895
|
status("*", info("Generating index..."));
|
|
1625
|
-
const conceptsPath =
|
|
1626
|
-
const queriesPath =
|
|
1896
|
+
const conceptsPath = path15.join(root, CONCEPTS_DIR);
|
|
1897
|
+
const queriesPath = path15.join(root, QUERIES_DIR);
|
|
1627
1898
|
const concepts = await collectPageSummaries(conceptsPath);
|
|
1628
1899
|
const queries = await collectPageSummaries(queriesPath);
|
|
1629
1900
|
concepts.sort((a, b) => a.title.localeCompare(b.title));
|
|
1630
1901
|
queries.sort((a, b) => a.title.localeCompare(b.title));
|
|
1631
1902
|
const indexContent = buildIndexContent(concepts, queries);
|
|
1632
|
-
const indexPath =
|
|
1903
|
+
const indexPath = path15.join(root, INDEX_FILE);
|
|
1633
1904
|
await atomicWrite(indexPath, indexContent);
|
|
1634
1905
|
const total = concepts.length + queries.length;
|
|
1635
1906
|
status("+", success(`Index updated with ${total} pages.`));
|
|
@@ -1643,7 +1914,7 @@ async function scanWikiPages(dirPath) {
|
|
|
1643
1914
|
}
|
|
1644
1915
|
const scanned = [];
|
|
1645
1916
|
for (const file of files.filter((f) => f.endsWith(".md"))) {
|
|
1646
|
-
const content = await safeReadFile(
|
|
1917
|
+
const content = await safeReadFile(path15.join(dirPath, file));
|
|
1647
1918
|
const { meta } = parseFrontmatter(content);
|
|
1648
1919
|
scanned.push({ slug: file.replace(/\.md$/, ""), meta });
|
|
1649
1920
|
}
|
|
@@ -1680,7 +1951,7 @@ function buildIndexContent(concepts, queries) {
|
|
|
1680
1951
|
|
|
1681
1952
|
// src/compiler/obsidian.ts
|
|
1682
1953
|
import { readdir as readdir4 } from "fs/promises";
|
|
1683
|
-
import
|
|
1954
|
+
import path16 from "path";
|
|
1684
1955
|
var ABBREVIATION_MIN_WORDS = 3;
|
|
1685
1956
|
var SWAP_CONJUNCTIONS = [" and ", " or "];
|
|
1686
1957
|
function addObsidianMeta(frontmatter, conceptTitle, tags) {
|
|
@@ -1722,11 +1993,11 @@ function generateAbbreviation(title) {
|
|
|
1722
1993
|
return abbreviation;
|
|
1723
1994
|
}
|
|
1724
1995
|
async function generateMOC(root) {
|
|
1725
|
-
const conceptsPath =
|
|
1996
|
+
const conceptsPath = path16.join(root, CONCEPTS_DIR);
|
|
1726
1997
|
const pages = await loadConceptPages(conceptsPath);
|
|
1727
1998
|
const tagGroups = groupPagesByTag(pages);
|
|
1728
1999
|
const content = buildMOCContent(tagGroups);
|
|
1729
|
-
await atomicWrite(
|
|
2000
|
+
await atomicWrite(path16.join(root, MOC_FILE), content);
|
|
1730
2001
|
}
|
|
1731
2002
|
async function loadConceptPages(conceptsPath) {
|
|
1732
2003
|
let files;
|
|
@@ -1738,7 +2009,7 @@ async function loadConceptPages(conceptsPath) {
|
|
|
1738
2009
|
const pages = [];
|
|
1739
2010
|
for (const file of files) {
|
|
1740
2011
|
if (!file.endsWith(".md")) continue;
|
|
1741
|
-
const content = await safeReadFile(
|
|
2012
|
+
const content = await safeReadFile(path16.join(conceptsPath, file));
|
|
1742
2013
|
if (!content) continue;
|
|
1743
2014
|
const { meta } = parseFrontmatter(content);
|
|
1744
2015
|
if (meta.orphaned) continue;
|
|
@@ -1789,9 +2060,143 @@ function buildMOCContent(tagGroups) {
|
|
|
1789
2060
|
}
|
|
1790
2061
|
|
|
1791
2062
|
// src/utils/embeddings.ts
|
|
1792
|
-
import { readFile as
|
|
2063
|
+
import { readFile as readFile12, readdir as readdir5 } from "fs/promises";
|
|
1793
2064
|
import { existsSync as existsSync4 } from "fs";
|
|
1794
|
-
import
|
|
2065
|
+
import path17 from "path";
|
|
2066
|
+
|
|
2067
|
+
// src/utils/retrieval.ts
|
|
2068
|
+
import { createHash as createHash2 } from "crypto";
|
|
2069
|
+
/**
 * Fingerprint a chunk of text.
 *
 * @param {string} text - chunk content, hashed as UTF-8
 * @returns {string} first 16 hex characters of the SHA-256 digest
 */
function hashChunkText(text) {
  const hasher = createHash2("sha256");
  hasher.update(text, "utf8");
  const hex = hasher.digest("hex");
  return hex.slice(0, 16);
}
|
|
2072
|
+
/**
 * Split a page body into chunk strings.
 *
 * Paragraphs come from extractParagraphs, oversized ones are broken up by
 * splitOversizedParagraph, the pieces are packed with appendParagraph, and a
 * short final chunk may be folded into its predecessor by
 * mergeTrailingFragment.
 *
 * @param {string} body - page text
 * @returns {string[]} chunks in document order; [] when there are no paragraphs
 */
function splitIntoChunks(body) {
  const paragraphs = extractParagraphs(body);
  if (paragraphs.length === 0) return [];

  const completed = [];
  const pieces = paragraphs.flatMap((paragraph) => splitOversizedParagraph(paragraph));
  // appendParagraph pushes finished chunks into `completed` and hands back
  // whatever is still pending.
  const pending = pieces.reduce(
    (buffer, piece) => appendParagraph(buffer, piece, completed),
    ""
  );
  if (pending.length > 0) completed.push(pending);
  return mergeTrailingFragment(completed);
}
|
|
2085
|
+
/**
 * Add one paragraph to the pending buffer, flushing to `chunks` when the
 * combined text would exceed CHUNK_TARGET_CHARS.
 *
 * @param {string} buffer - text accumulated so far ("" when nothing is pending)
 * @param {string} paragraph - next piece of text
 * @param {string[]} chunks - output list; mutated when a chunk is completed
 * @returns {string} the new pending buffer
 */
function appendParagraph(buffer, paragraph, chunks) {
  const joined = buffer ? `${buffer}\n\n${paragraph}` : paragraph;
  if (joined.length <= CHUNK_TARGET_CHARS) return joined;

  if (buffer.length === 0) {
    // A lone paragraph already over target becomes its own chunk.
    chunks.push(joined);
    return "";
  }
  // Flush what was pending and start over with the new paragraph.
  chunks.push(buffer);
  return paragraph;
}
|
|
2097
|
+
/**
 * Fold a short final chunk into the chunk before it.
 *
 * The merge happens only when the last chunk is shorter than CHUNK_MIN_CHARS
 * and the joined text (with a blank-line separator) stays within
 * CHUNK_MAX_CHARS.
 *
 * @param {string[]} chunks
 * @returns {string[]} the same array when nothing is merged, otherwise a new
 *   array whose last element is the merged chunk
 */
function mergeTrailingFragment(chunks) {
  const count = chunks.length;
  if (count < 2) return chunks;

  const separator = "\n\n";
  const [previous, tail] = chunks.slice(-2);
  const tailIsFragment = tail.length < CHUNK_MIN_CHARS;
  const fits = previous.length + tail.length + separator.length <= CHUNK_MAX_CHARS;
  if (!tailIsFragment || !fits) return chunks;

  return [...chunks.slice(0, count - 2), previous + separator + tail];
}
|
|
2109
|
+
/**
 * Split text into trimmed, non-empty paragraphs.
 *
 * A paragraph break is a line break followed by one or more blank lines.
 * Both LF and CRLF line endings are recognised, and a line containing only
 * spaces or tabs counts as blank, so Windows-style files and bodies with
 * stray indentation on empty lines are still split.
 *
 * @param {string} body
 * @returns {string[]} paragraphs in document order
 */
function extractParagraphs(body) {
  return body
    .split(/\r?\n(?:[ \t]*\r?\n)+/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);
}
|
|
2112
|
+
/**
 * Break a paragraph longer than CHUNK_MAX_CHARS into smaller pieces.
 *
 * The paragraph is split after sentence punctuation (. ! ?) followed by
 * whitespace, sentences are re-joined with single spaces while the result
 * stays within CHUNK_MAX_CHARS, and each piece is finally passed through
 * hardCut.
 *
 * @param {string} paragraph
 * @returns {string[]} `[paragraph]` when it already fits, otherwise the pieces
 */
function splitOversizedParagraph(paragraph) {
  if (paragraph.length <= CHUNK_MAX_CHARS) return [paragraph];

  const pieces = [];
  let pending = "";
  for (const sentence of paragraph.split(/(?<=[.!?])\s+/)) {
    if (pending.length === 0) {
      pending = sentence;
      continue;
    }
    // +1 accounts for the joining space.
    const joinedLength = pending.length + 1 + sentence.length;
    if (joinedLength > CHUNK_MAX_CHARS) {
      pieces.push(pending.trim());
      pending = sentence;
    } else {
      pending = `${pending} ${sentence}`;
    }
  }
  if (pending.length > 0) pieces.push(pending.trim());
  return pieces.flatMap((piece) => hardCut(piece));
}
|
|
2128
|
+
/**
 * Cut text into consecutive pieces of at most CHUNK_MAX_CHARS UTF-16 code
 * units, used when there is no natural boundary left to split on.
 *
 * A cut is moved one unit earlier when it would otherwise fall between the
 * two halves of a surrogate pair, so no piece starts or ends with a lone
 * surrogate.
 *
 * @param {string} text
 * @returns {string[]} `[text]` when it already fits, otherwise the pieces in
 *   order; joining them with "" reproduces `text`
 */
function hardCut(text) {
  if (text.length <= CHUNK_MAX_CHARS) return [text];
  const pieces = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + CHUNK_MAX_CHARS, text.length);
    if (end < text.length && end - 1 > start) {
      const before = text.charCodeAt(end - 1);
      const after = text.charCodeAt(end);
      const splitsPair =
        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
      // Keep the pair together by ending this piece just before it.
      if (splitsPair) end -= 1;
    }
    pieces.push(text.slice(start, end));
    start = end;
  }
  return pieces;
}
|
|
2136
|
+
/**
 * Re-order retrieval candidates by blending a BM25 lexical score with each
 * candidate's existing `baseScore`.
 *
 * @param {string} query - user question
 * @param {Array<{ text: string, baseScore: number }>} candidates
 * @returns {Array<{ candidate: object, score: number }>} [] for no candidates;
 *   candidates in their original order scored by `baseScore` alone when the
 *   query yields no terms; otherwise the ranking from rankByBm25Score
 */
function rerankWithBm25(query, candidates) {
  if (candidates.length === 0) return [];

  const queryTerms = tokenize(query);
  const hasTerms = queryTerms.length !== 0;
  if (!hasTerms) {
    // Nothing lexical to match on: keep the incoming scores and order.
    return candidates.map((candidate) => ({ candidate, score: candidate.baseScore }));
  }

  const tokenizedDocs = candidates.map(({ text }) => tokenize(text));
  const corpusStats = buildCorpusStats(tokenizedDocs);
  return rankByBm25Score(candidates, tokenizedDocs, queryTerms, corpusStats);
}
|
|
2146
|
+
/**
 * Score every candidate and sort best-first.
 *
 * The score is the candidate's bm25Score plus its `baseScore` multiplied by
 * BASE_SCORE_WEIGHT.
 *
 * @param candidates - candidate objects carrying `baseScore`
 * @param docs - token lists, index-aligned with `candidates`
 * @param queryTerms - tokenised query
 * @param stats - corpus statistics from buildCorpusStats
 * @returns {Array<{ candidate: object, score: number }>} sorted by descending score
 */
function rankByBm25Score(candidates, docs, queryTerms, stats) {
  const ranked = [];
  candidates.forEach((candidate, position) => {
    const lexicalScore = bm25Score(queryTerms, docs[position], stats);
    ranked.push({
      candidate,
      score: lexicalScore + candidate.baseScore * BASE_SCORE_WEIGHT
    });
  });
  return ranked.sort((left, right) => right.score - left.score);
}
|
|
2154
|
+
/**
 * Lower-case text and split it into word tokens.
 *
 * A token is a maximal run of Unicode letters, combining marks and digits,
 * so accented and non-Latin words are kept whole rather than being dropped
 * or truncated at the first non-ASCII character. Output for ASCII-only input
 * is unchanged: runs of a-z and 0-9.
 *
 * @param {string} text
 * @returns {string[]} tokens in order of appearance; [] when there are none
 */
function tokenize(text) {
  return text.toLowerCase().match(/[\p{L}\p{M}\p{N}]+/gu) ?? [];
}
|
|
2157
|
+
/**
 * Collect the corpus statistics needed for BM25 scoring.
 *
 * @param {string[][]} docs - one token list per document
 * @returns {{ docFreq: Map<string, number>, avgDocLen: number, totalDocs: number }}
 *   `docFreq` counts the documents containing each term at least once;
 *   `avgDocLen` is the mean token count (0 for an empty corpus)
 */
function buildCorpusStats(docs) {
  const docFreq = /* @__PURE__ */ new Map();
  let tokenCount = 0;
  docs.forEach((tokens) => {
    tokenCount += tokens.length;
    // De-duplicate so each document contributes at most once per term.
    new Set(tokens).forEach((term) => {
      docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
    });
  });
  const totalDocs = docs.length;
  return {
    docFreq,
    avgDocLen: totalDocs > 0 ? tokenCount / totalDocs : 0,
    totalDocs
  };
}
|
|
2169
|
+
var BM25_K1 = 1.5;
|
|
2170
|
+
var BM25_B = 0.75;
|
|
2171
|
+
var BASE_SCORE_WEIGHT = 0.5;
|
|
2172
|
+
/**
 * Compute the BM25 score of one document for a query.
 *
 * Uses BM25_K1 for term-frequency saturation and BM25_B for length
 * normalisation against `stats.avgDocLen` (treated as 1 when it is 0).
 * Query terms absent from the document contribute nothing; a repeated query
 * term is counted each time it appears.
 *
 * @param {string[]} queryTerms
 * @param {string[]} docTokens
 * @param {{ docFreq: Map<string, number>, avgDocLen: number, totalDocs: number }} stats
 * @returns {number} 0 for an empty document or empty corpus
 */
function bm25Score(queryTerms, docTokens, stats) {
  if (docTokens.length === 0 || stats.totalDocs === 0) return 0;

  const termFreq = countTerms(docTokens);
  const lengthRatio = docTokens.length / (stats.avgDocLen || 1);
  // Identical for every term in this document, so compute it once.
  const lengthPenalty = BM25_K1 * (1 - BM25_B + BM25_B * lengthRatio);

  return queryTerms.reduce((sum, term) => {
    const tf = termFreq.get(term) ?? 0;
    if (tf === 0) return sum;
    const idf = idfWeight(stats.docFreq.get(term) ?? 0, stats.totalDocs);
    const numerator = tf * (BM25_K1 + 1);
    const denominator = tf + lengthPenalty;
    return sum + idf * (numerator / denominator);
  }, 0);
}
|
|
2187
|
+
/**
 * Inverse document frequency weight:
 * ln(1 + (totalDocs - docFrequency + 0.5) / (docFrequency + 0.5)).
 *
 * @param {number} docFrequency - number of documents containing the term
 * @param {number} totalDocs - number of documents in the corpus
 * @returns {number}
 */
function idfWeight(docFrequency, totalDocs) {
  const ratio = (totalDocs - docFrequency + 0.5) / (docFrequency + 0.5);
  return Math.log(1 + ratio);
}
|
|
2192
|
+
/**
 * Count how many times each token occurs.
 *
 * @param {Iterable<string>} tokens
 * @returns {Map<string, number>} token → occurrences, in first-seen order
 */
function countTerms(tokens) {
  const tally = /* @__PURE__ */ new Map();
  for (const token of tokens) {
    const seen = tally.get(token) ?? 0;
    tally.set(token, seen + 1);
  }
  return tally;
}
|
|
2197
|
+
|
|
2198
|
+
// src/utils/embeddings.ts
|
|
2199
|
+
var STORE_VERSION = 2;
|
|
1795
2200
|
function cosineSimilarity(a, b) {
|
|
1796
2201
|
if (a.length !== b.length || a.length === 0) return 0;
|
|
1797
2202
|
let dot = 0;
|
|
@@ -1813,24 +2218,27 @@ function findTopK(queryVec, store, k) {
|
|
|
1813
2218
|
scored.sort((left, right) => right.score - left.score);
|
|
1814
2219
|
return scored.slice(0, k).map((item) => item.entry);
|
|
1815
2220
|
}
|
|
2221
|
+
/**
 * Rank chunks by cosine similarity to the query vector and keep the best k.
 *
 * @param queryVec - query embedding
 * @param chunks - chunk records, each with a `vector`
 * @param {number} k - maximum number of results
 * @returns {Array<{ chunk: object, score: number }>} sorted by descending score
 */
function findTopKChunks(queryVec, chunks, k) {
  const ranked = chunks.map((chunk) => {
    const score = cosineSimilarity(queryVec, chunk.vector);
    return { chunk, score };
  });
  ranked.sort((left, right) => right.score - left.score);
  return ranked.slice(0, k);
}
|
|
1816
2229
|
async function readEmbeddingStore(root) {
|
|
1817
|
-
const filePath =
|
|
2230
|
+
const filePath = path17.join(root, EMBEDDINGS_FILE);
|
|
1818
2231
|
if (!existsSync4(filePath)) return null;
|
|
1819
|
-
const raw = await
|
|
2232
|
+
const raw = await readFile12(filePath, "utf-8");
|
|
1820
2233
|
return JSON.parse(raw);
|
|
1821
2234
|
}
|
|
1822
2235
|
async function writeEmbeddingStore(root, store) {
|
|
1823
|
-
const filePath =
|
|
2236
|
+
const filePath = path17.join(root, EMBEDDINGS_FILE);
|
|
1824
2237
|
await atomicWrite(filePath, JSON.stringify(store, null, 2));
|
|
1825
2238
|
}
|
|
1826
2239
|
async function findRelevantPages(root, question) {
|
|
1827
|
-
const store = await
|
|
1828
|
-
if (!store
|
|
1829
|
-
const activeModel = resolveEmbeddingModel();
|
|
1830
|
-
if (store.model !== activeModel) {
|
|
1831
|
-
warnStaleEmbeddingStore(store.model, activeModel);
|
|
1832
|
-
return [];
|
|
1833
|
-
}
|
|
2240
|
+
const store = await loadActiveStore(root, (s) => s.entries.length > 0);
|
|
2241
|
+
if (!store) return [];
|
|
1834
2242
|
const queryVec = await getProvider().embed(question);
|
|
1835
2243
|
return findTopK(queryVec, store, EMBEDDING_TOP_K).map((entry) => ({
|
|
1836
2244
|
slug: entry.slug,
|
|
@@ -1838,10 +2246,26 @@ async function findRelevantPages(root, question) {
|
|
|
1838
2246
|
summary: entry.summary
|
|
1839
2247
|
}));
|
|
1840
2248
|
}
|
|
2249
|
+
/**
 * Find the stored chunks most similar to a question.
 *
 * @param root - project root forwarded to loadActiveStore
 * @param {string} question - text embedded with the active provider
 * @param {number} k - maximum number of chunks to return
 * @returns {Promise<Array<{ chunk: object, score: number }>>} [] when
 *   loadActiveStore yields no store with chunks
 */
async function findRelevantChunks(root, question, k) {
  const hasChunks = (s) => Boolean(s.chunks && s.chunks.length > 0);
  const store = await loadActiveStore(root, hasChunks);
  if (!store) return [];

  const provider = getProvider();
  const queryVec = await provider.embed(question);
  return findTopKChunks(queryVec, store.chunks ?? [], k);
}
|
|
2255
|
+
/**
 * Load the embedding store if it is usable for the current embedding model.
 *
 * @param root - project root forwarded to readEmbeddingStore
 * @param {(store: object) => boolean} hasContent - predicate deciding whether
 *   the store holds the data the caller needs
 * @returns {Promise<object|null>} the store, or null when it is missing,
 *   fails `hasContent`, or was built with a different model (in which case
 *   warnStaleEmbeddingStore is called first)
 */
async function loadActiveStore(root, hasContent) {
  const candidate = await readEmbeddingStore(root);
  const usable = Boolean(candidate) && hasContent(candidate);
  if (!usable) return null;

  const expectedModel = resolveEmbeddingModel();
  if (candidate.model === expectedModel) return candidate;

  warnStaleEmbeddingStore(candidate.model, expectedModel);
  return null;
}
|
|
1841
2265
|
async function collectPageRecords(root) {
|
|
1842
2266
|
const records = [];
|
|
1843
2267
|
for (const dir of [CONCEPTS_DIR, QUERIES_DIR]) {
|
|
1844
|
-
const absDir =
|
|
2268
|
+
const absDir = path17.join(root, dir);
|
|
1845
2269
|
let files;
|
|
1846
2270
|
try {
|
|
1847
2271
|
files = await readdir5(absDir);
|
|
@@ -1849,18 +2273,23 @@ async function collectPageRecords(root) {
|
|
|
1849
2273
|
continue;
|
|
1850
2274
|
}
|
|
1851
2275
|
for (const file of files.filter((f) => f.endsWith(".md"))) {
|
|
1852
|
-
const
|
|
1853
|
-
|
|
1854
|
-
if (meta.orphaned || typeof meta.title !== "string") continue;
|
|
1855
|
-
records.push({
|
|
1856
|
-
slug: file.replace(/\.md$/, ""),
|
|
1857
|
-
title: meta.title,
|
|
1858
|
-
summary: typeof meta.summary === "string" ? meta.summary : ""
|
|
1859
|
-
});
|
|
2276
|
+
const record = await readPageRecord(absDir, file);
|
|
2277
|
+
if (record) records.push(record);
|
|
1860
2278
|
}
|
|
1861
2279
|
}
|
|
1862
2280
|
return records;
|
|
1863
2281
|
}
|
|
2282
|
+
/**
 * Read one markdown page and turn it into an embedding record.
 *
 * @param {string} absDir - directory containing the page
 * @param {string} file - file name; a trailing ".md" is removed to form the slug
 * @returns {Promise<{ slug: string, title: string, summary: string, body: string }|null>}
 *   null when the frontmatter marks the page as orphaned or its title is not
 *   a string; `summary` is "" when the frontmatter summary is not a string
 */
async function readPageRecord(absDir, file) {
  const pagePath = path17.join(absDir, file);
  const { meta, body } = parseFrontmatter(await safeReadFile(pagePath));

  const hasTitle = typeof meta.title === "string";
  if (meta.orphaned || !hasTitle) return null;

  const summary = typeof meta.summary === "string" ? meta.summary : "";
  return {
    slug: file.replace(/\.md$/, ""),
    title: meta.title,
    summary,
    body
  };
}
|
|
1864
2293
|
function buildEmbeddingText(record) {
|
|
1865
2294
|
return record.summary ? `${record.title}
|
|
1866
2295
|
|
|
@@ -1913,6 +2342,56 @@ function mergeEntries(existing, fresh, liveSlugs) {
|
|
|
1913
2342
|
}
|
|
1914
2343
|
return Array.from(bySlug.values());
|
|
1915
2344
|
}
|
|
2345
|
+
/**
 * Produce the chunk embeddings for every page record.
 *
 * Existing chunks whose slug is no longer among `records` are ignored.
 * Records are processed one after another, in order.
 *
 * @param records - live page records
 * @param existing - previously stored chunk entries
 * @param {boolean} forceAll - forwarded to embedRecordChunks
 * @returns {Promise<object[]>} chunk entries for all records, in record order
 */
async function refreshChunkEmbeddings(records, existing, forceAll) {
  const liveSlugs = new Set(records.map((record) => record.slug));
  const stillLive = existing.filter((chunk) => liveSlugs.has(chunk.slug));
  const existingByKey = indexChunksByKey(stillLive);
  // One timestamp for the whole refresh so new chunks share `updatedAt`.
  const timestamp = (/* @__PURE__ */ new Date()).toISOString();

  const refreshed = [];
  for (const record of records) {
    const pageChunks = await embedRecordChunks(record, existingByKey, forceAll, timestamp);
    for (const chunk of pageChunks) refreshed.push(chunk);
  }
  return refreshed;
}
|
|
2356
|
+
/**
 * Build the chunk entries for one page.
 *
 * The body is split with splitIntoChunks. A chunk accepted by
 * pickReusableChunk is copied with the record's current title; every other
 * chunk is embedded through the active provider and stamped with `now`.
 *
 * @param record - page record with `slug`, `title` and `body`
 * @param existingByKey - map of previously stored chunks
 * @param {boolean} forceAll - forwarded to pickReusableChunk
 * @param {string} now - timestamp stored as `updatedAt` on new chunks
 * @returns {Promise<object[]>} chunk entries in chunk order
 */
async function embedRecordChunks(record, existingByKey, forceAll, now) {
  const provider = getProvider();
  const pieces = splitIntoChunks(record.body);
  const embedded = [];
  for (const [position, text] of pieces.entries()) {
    const contentHash = hashChunkText(text);
    const cached = pickReusableChunk(existingByKey, record.slug, position, contentHash, forceAll);
    if (cached) {
      // Text is unchanged; only the title may have moved on.
      embedded.push({ ...cached, title: record.title });
    } else {
      embedded.push({
        slug: record.slug,
        title: record.title,
        chunkIndex: position,
        contentHash,
        text,
        vector: await provider.embed(text),
        updatedAt: now
      });
    }
  }
  return embedded;
}
|
|
2381
|
+
/**
 * Index chunk entries by their chunkKey (slug plus chunk index).
 * When two chunks share a key, the later one wins.
 *
 * @param {Iterable<object>} chunks
 * @returns {Map<string, object>}
 */
function indexChunksByKey(chunks) {
  const pairs = Array.from(chunks, (chunk) => [chunkKey(chunk.slug, chunk.chunkIndex), chunk]);
  return new Map(pairs);
}
|
|
2386
|
+
/**
 * Build the lookup key for a chunk: the slug and chunk index joined by "#".
 *
 * @param {string} slug
 * @param {number} chunkIndex
 * @returns {string}
 */
function chunkKey(slug, chunkIndex) {
  return "".concat(slug, "#", chunkIndex);
}
|
|
2389
|
+
/**
 * Return the stored chunk at this position when it can be reused.
 *
 * @param {Map<string, object>} byKey - stored chunks indexed by chunkKey
 * @param {string} slug
 * @param {number} chunkIndex
 * @param {string} contentHash - hash of the chunk's current text
 * @param {boolean} forceAll - when truthy nothing is ever reused
 * @returns {object|null} the stored chunk when one exists with the same
 *   content hash, otherwise null
 */
function pickReusableChunk(byKey, slug, chunkIndex, contentHash, forceAll) {
  if (forceAll) return null;
  const cached = byKey.get(chunkKey(slug, chunkIndex));
  const unchanged = Boolean(cached) && cached.contentHash === contentHash;
  return unchanged ? cached : null;
}
|
|
1916
2395
|
async function updateEmbeddings(root, changedSlugs) {
|
|
1917
2396
|
const records = await collectPageRecords(root);
|
|
1918
2397
|
const liveSlugs = new Set(records.map((r) => r.slug));
|
|
@@ -1921,29 +2400,51 @@ async function updateEmbeddings(root, changedSlugs) {
|
|
|
1921
2400
|
const modelChanged = Boolean(existingStore && existingStore.model !== embeddingModel);
|
|
1922
2401
|
const toEmbed = new Set(changedSlugs.filter((slug) => liveSlugs.has(slug)));
|
|
1923
2402
|
const previousEntries = modelChanged ? [] : existingStore?.entries ?? [];
|
|
1924
|
-
|
|
2403
|
+
const previousChunks = modelChanged ? [] : existingStore?.chunks ?? [];
|
|
2404
|
+
const isEmptyStore = isStoreEmpty(existingStore);
|
|
2405
|
+
if (!existingStore || modelChanged || isEmptyStore && liveSlugs.size > 0) {
|
|
1925
2406
|
for (const record of records) toEmbed.add(record.slug);
|
|
1926
2407
|
}
|
|
1927
|
-
if (!modelChanged
|
|
2408
|
+
if (!shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs)) {
|
|
1928
2409
|
return;
|
|
1929
2410
|
}
|
|
1930
2411
|
const freshEntries = await embedPages(records, toEmbed);
|
|
1931
2412
|
const mergedEntries = mergeEntries(previousEntries, freshEntries, liveSlugs);
|
|
1932
|
-
const
|
|
2413
|
+
const mergedChunks = await refreshChunkEmbeddings(records, previousChunks, modelChanged);
|
|
2414
|
+
await persistRefreshedStore(root, embeddingModel, mergedEntries, mergedChunks);
|
|
2415
|
+
}
|
|
2416
|
+
/**
 * Write the refreshed embedding store (page entries plus chunks) and report
 * the resulting counts.
 *
 * `dimensions` is taken from the first page entry's vector, falling back to
 * the first chunk's vector, and finally to 0 when both lists are empty.
 *
 * @param {string} root - Project root passed through to `writeEmbeddingStore`.
 * @param {string} embeddingModel - Model name recorded in the store.
 * @param {Array<{vector: number[]}>} entries - Page-level embedding entries.
 * @param {Array<{vector: number[]}>} chunks - Chunk-level embedding entries.
 * @returns {Promise<void>}
 */
async function persistRefreshedStore(root, embeddingModel, entries, chunks) {
  const pageDimensions = entries[0]?.vector.length;
  const dimensions = pageDimensions ?? chunks[0]?.vector.length ?? 0;
  await writeEmbeddingStore(root, {
    version: STORE_VERSION,
    model: embeddingModel,
    dimensions,
    entries,
    chunks
  });
  const summary = `Embeddings updated (${entries.length} pages, ${chunks.length} chunks).`;
  status("*", dim(summary));
}
|
|
2431
|
+
/**
 * Tell whether an existing store holds no embeddings at all.
 * A missing store is NOT considered empty (returns false).
 *
 * @param {{entries: Array, chunks?: Array}|null|undefined} store - Loaded store, if any.
 * @returns {boolean} True when the store exists and has no entries and no chunks.
 */
function isStoreEmpty(store) {
  if (!store) return false;
  const hasEntries = store.entries.length !== 0;
  if (hasEntries) return false;
  // `chunks` may be absent on the store object; absent counts as empty.
  const hasChunks = Boolean(store.chunks) && store.chunks.length !== 0;
  return !hasChunks;
}
|
|
2435
|
+
/**
 * Decide whether the embedding store needs to be rewritten.
 *
 * A run is needed when any of the following holds:
 *  - the embedding model changed;
 *  - at least one slug is queued for embedding;
 *  - a stored entry or chunk refers to a slug that is no longer live;
 *  - page entries exist but no chunks do, while live pages exist.
 *
 * @param {boolean} modelChanged
 * @param {Set<string>} toEmbed - Slugs queued for (re-)embedding.
 * @param {Array<{slug: string}>} previousEntries - Stored page entries.
 * @param {Array<{slug: string}>} previousChunks - Stored chunk entries.
 * @param {Set<string>} liveSlugs - Slugs of pages that currently exist.
 * @returns {boolean}
 */
function shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs) {
  const isStale = (item) => !liveSlugs.has(item.slug);
  return (
    Boolean(modelChanged) ||
    toEmbed.size > 0 ||
    previousEntries.some(isStale) ||
    previousChunks.some(isStale) ||
    (previousEntries.length > 0 && previousChunks.length === 0 && liveSlugs.size > 0)
  );
}
|
|
1942
2443
|
|
|
1943
2444
|
// src/compiler/candidates.ts
|
|
1944
2445
|
import { readdir as readdir6, rename as rename3, unlink as unlink2, writeFile as writeFile4, mkdir as mkdir5 } from "fs/promises";
|
|
1945
2446
|
import { existsSync as existsSync5 } from "fs";
|
|
1946
|
-
import
|
|
2447
|
+
import path18 from "path";
|
|
1947
2448
|
import { randomBytes } from "crypto";
|
|
1948
2449
|
var ID_SUFFIX_BYTES = 4;
|
|
1949
2450
|
var CANDIDATE_EXT = ".json";
|
|
@@ -1952,10 +2453,10 @@ function buildCandidateId(slug) {
|
|
|
1952
2453
|
return `${slug}-${suffix}`;
|
|
1953
2454
|
}
|
|
1954
2455
|
/**
 * Path of the pending candidate file for `id` under the candidates directory.
 *
 * @param {string} root - Project root.
 * @param {string} id - Candidate id (file name without extension).
 * @returns {string}
 */
function candidatePath(root, id) {
  const fileName = `${id}${CANDIDATE_EXT}`;
  return path18.join(root, CANDIDATES_DIR, fileName);
}
|
|
1957
2458
|
/**
 * Path of the archived candidate file for `id` under the archive directory.
 *
 * @param {string} root - Project root.
 * @param {string} id - Candidate id (file name without extension).
 * @returns {string}
 */
function archivePath(root, id) {
  const fileName = `${id}${CANDIDATE_EXT}`;
  return path18.join(root, CANDIDATES_ARCHIVE_DIR, fileName);
}
|
|
1960
2461
|
async function writeCandidate(root, draft) {
|
|
1961
2462
|
const candidate = {
|
|
@@ -2006,7 +2507,7 @@ function isValidCandidate(value) {
|
|
|
2006
2507
|
return typeof candidate.id === "string" && typeof candidate.title === "string" && typeof candidate.slug === "string" && typeof candidate.body === "string" && Array.isArray(candidate.sources);
|
|
2007
2508
|
}
|
|
2008
2509
|
async function listCandidates(root) {
|
|
2009
|
-
const dir =
|
|
2510
|
+
const dir = path18.join(root, CANDIDATES_DIR);
|
|
2010
2511
|
if (!existsSync5(dir)) return [];
|
|
2011
2512
|
const entries = await readdir6(dir, { withFileTypes: true });
|
|
2012
2513
|
const candidates = [];
|
|
@@ -2033,7 +2534,7 @@ async function archiveCandidate(root, id) {
|
|
|
2033
2534
|
const sourcePath = candidatePath(root, id);
|
|
2034
2535
|
if (!existsSync5(sourcePath)) return false;
|
|
2035
2536
|
const target = archivePath(root, id);
|
|
2036
|
-
await mkdir5(
|
|
2537
|
+
await mkdir5(path18.dirname(target), { recursive: true });
|
|
2037
2538
|
try {
|
|
2038
2539
|
await rename3(sourcePath, target);
|
|
2039
2540
|
} catch {
|
|
@@ -2045,9 +2546,9 @@ async function archiveCandidate(root, id) {
|
|
|
2045
2546
|
}
|
|
2046
2547
|
|
|
2047
2548
|
// src/linter/rules.ts
|
|
2048
|
-
import { readdir as readdir7, readFile as
|
|
2549
|
+
import { readdir as readdir7, readFile as readFile13 } from "fs/promises";
|
|
2049
2550
|
import { existsSync as existsSync6 } from "fs";
|
|
2050
|
-
import
|
|
2551
|
+
import path19 from "path";
|
|
2051
2552
|
var MIN_BODY_LENGTH = 50;
|
|
2052
2553
|
var WIKILINK_PATTERN2 = /\[\[([^\]]+)\]\]/g;
|
|
2053
2554
|
var CITATION_PATTERN = /\^\[([^\]]+)\]/g;
|
|
@@ -2068,22 +2569,22 @@ async function readMarkdownFiles(dirPath) {
|
|
|
2068
2569
|
const mdFiles = entries.filter((f) => f.endsWith(".md"));
|
|
2069
2570
|
const results = await Promise.all(
|
|
2070
2571
|
mdFiles.map(async (fileName) => {
|
|
2071
|
-
const filePath =
|
|
2072
|
-
const content = await
|
|
2572
|
+
const filePath = path19.join(dirPath, fileName);
|
|
2573
|
+
const content = await readFile13(filePath, "utf-8");
|
|
2073
2574
|
return { filePath, content };
|
|
2074
2575
|
})
|
|
2075
2576
|
);
|
|
2076
2577
|
return results;
|
|
2077
2578
|
}
|
|
2078
2579
|
/**
 * Read every markdown page from the concepts and queries directories.
 *
 * The two directory reads are independent, so they are started together
 * rather than one after the other. Result order is unchanged: concept pages
 * first, then query pages.
 *
 * @param {string} root - Project root.
 * @returns {Promise<Array<{filePath: string, content: string}>>}
 */
async function collectAllPages(root) {
  const [conceptPages, queryPages] = await Promise.all([
    readMarkdownFiles(path19.join(root, CONCEPTS_DIR)),
    readMarkdownFiles(path19.join(root, QUERIES_DIR))
  ]);
  return [...conceptPages, ...queryPages];
}
|
|
2083
2584
|
function buildPageSlugSet(pages) {
|
|
2084
2585
|
const slugs = /* @__PURE__ */ new Set();
|
|
2085
2586
|
for (const page of pages) {
|
|
2086
|
-
const baseName =
|
|
2587
|
+
const baseName = path19.basename(page.filePath, ".md");
|
|
2087
2588
|
slugs.add(baseName.toLowerCase());
|
|
2088
2589
|
}
|
|
2089
2590
|
return slugs;
|
|
@@ -2318,7 +2819,7 @@ function countLines(content) {
|
|
|
2318
2819
|
}
|
|
2319
2820
|
async function checkBrokenCitations(root) {
|
|
2320
2821
|
const pages = await collectAllPages(root);
|
|
2321
|
-
const sourcesDir =
|
|
2822
|
+
const sourcesDir = path19.join(root, SOURCES_DIR);
|
|
2322
2823
|
const results = [];
|
|
2323
2824
|
const lineCountCache = /* @__PURE__ */ new Map();
|
|
2324
2825
|
for (const page of pages) {
|
|
@@ -2333,7 +2834,7 @@ async function collectBrokenForMarker(captured, line, pageFile, sourcesDir, line
|
|
|
2333
2834
|
const trimmed = part.trim();
|
|
2334
2835
|
if (trimmed.length === 0) continue;
|
|
2335
2836
|
const filename = stripSpanSuffix(trimmed);
|
|
2336
|
-
const citedPath =
|
|
2837
|
+
const citedPath = path19.join(sourcesDir, filename);
|
|
2337
2838
|
if (!existsSync6(citedPath)) {
|
|
2338
2839
|
out.push({
|
|
2339
2840
|
rule: "broken-citation",
|
|
@@ -2387,7 +2888,7 @@ async function checkMalformedClaimCitations(root) {
|
|
|
2387
2888
|
|
|
2388
2889
|
// src/compiler/page-renderer.ts
|
|
2389
2890
|
import { readdir as readdir8 } from "fs/promises";
|
|
2390
|
-
import
|
|
2891
|
+
import path20 from "path";
|
|
2391
2892
|
|
|
2392
2893
|
// src/compiler/provenance.ts
|
|
2393
2894
|
function addProvenanceMeta(fields, concept) {
|
|
@@ -2417,7 +2918,7 @@ function reportContradictionWarnings(conceptTitle, concept) {
|
|
|
2417
2918
|
// src/compiler/page-renderer.ts
|
|
2418
2919
|
var RELATED_PAGE_CONTEXT_LIMIT = 5;
|
|
2419
2920
|
async function renderMergedPageContent(root, entry, schema) {
|
|
2420
|
-
const pagePath =
|
|
2921
|
+
const pagePath = path20.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
|
|
2421
2922
|
const existingPage = await safeReadFile(pagePath);
|
|
2422
2923
|
const relatedPages = await loadRelatedPages(root, entry.slug);
|
|
2423
2924
|
const system = buildPagePrompt(
|
|
@@ -2456,7 +2957,7 @@ function buildMergedFrontmatter(entry, existingPage, schema) {
|
|
|
2456
2957
|
return buildFrontmatter(frontmatterFields);
|
|
2457
2958
|
}
|
|
2458
2959
|
async function loadRelatedPages(root, excludeSlug) {
|
|
2459
|
-
const conceptsPath =
|
|
2960
|
+
const conceptsPath = path20.join(root, CONCEPTS_DIR);
|
|
2460
2961
|
let files;
|
|
2461
2962
|
try {
|
|
2462
2963
|
files = await readdir8(conceptsPath);
|
|
@@ -2466,7 +2967,7 @@ async function loadRelatedPages(root, excludeSlug) {
|
|
|
2466
2967
|
const related = files.filter((f) => f.endsWith(".md") && f !== `${excludeSlug}.md`).slice(0, RELATED_PAGE_CONTEXT_LIMIT);
|
|
2467
2968
|
const contents = [];
|
|
2468
2969
|
for (const f of related) {
|
|
2469
|
-
const content = await safeReadFile(
|
|
2970
|
+
const content = await safeReadFile(path20.join(conceptsPath, f));
|
|
2470
2971
|
if (!content) continue;
|
|
2471
2972
|
const { meta } = parseFrontmatter(content);
|
|
2472
2973
|
if (meta.orphaned) continue;
|
|
@@ -2667,9 +3168,9 @@ function printChangesSummary(changes) {
|
|
|
2667
3168
|
}
|
|
2668
3169
|
async function extractForSource(root, sourceFile) {
|
|
2669
3170
|
status("*", info(`Extracting: ${sourceFile}`));
|
|
2670
|
-
const sourcePath =
|
|
2671
|
-
const sourceContent = await
|
|
2672
|
-
const existingIndex = await safeReadFile(
|
|
3171
|
+
const sourcePath = path21.join(root, SOURCES_DIR, sourceFile);
|
|
3172
|
+
const sourceContent = await readFile14(sourcePath, "utf-8");
|
|
3173
|
+
const existingIndex = await safeReadFile(path21.join(root, INDEX_FILE));
|
|
2673
3174
|
const concepts = await extractConcepts(sourceContent, existingIndex);
|
|
2674
3175
|
if (concepts.length > 0) {
|
|
2675
3176
|
const names = concepts.map((c) => c.concept).join(", ");
|
|
@@ -2732,7 +3233,7 @@ async function generateMergedPage(root, entry, schema, options, sourceStates) {
|
|
|
2732
3233
|
if (options.review) {
|
|
2733
3234
|
return await persistReviewCandidate(root, entry, fullPage, sourceStates, schema);
|
|
2734
3235
|
}
|
|
2735
|
-
const pagePath =
|
|
3236
|
+
const pagePath = path21.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
|
|
2736
3237
|
const error2 = await writePageIfValid(pagePath, fullPage, entry.concept.concept);
|
|
2737
3238
|
return { error: error2 ?? void 0 };
|
|
2738
3239
|
}
|
|
@@ -2760,7 +3261,7 @@ async function generateSeedPages(root, schema, generation) {
|
|
|
2760
3261
|
}
|
|
2761
3262
|
async function generateSingleSeedPage(root, schema, seed) {
|
|
2762
3263
|
const slug = slugify(seed.title);
|
|
2763
|
-
const pagePath =
|
|
3264
|
+
const pagePath = path21.join(root, CONCEPTS_DIR, `${slug}.md`);
|
|
2764
3265
|
const relatedContent = await loadSeedRelatedPages(root, seed.relatedSlugs ?? []);
|
|
2765
3266
|
const rule = schema.kinds[seed.kind];
|
|
2766
3267
|
const system = buildSeedPagePrompt(seed, rule, relatedContent);
|
|
@@ -2792,7 +3293,7 @@ async function loadSeedRelatedPages(root, slugs) {
|
|
|
2792
3293
|
if (slugs.length === 0) return "";
|
|
2793
3294
|
const contents = [];
|
|
2794
3295
|
for (const slug of slugs) {
|
|
2795
|
-
const pagePath =
|
|
3296
|
+
const pagePath = path21.join(root, CONCEPTS_DIR, `${slug}.md`);
|
|
2796
3297
|
const content = await safeReadFile(pagePath);
|
|
2797
3298
|
if (content) contents.push(content);
|
|
2798
3299
|
}
|
|
@@ -2847,7 +3348,7 @@ async function compileCommand(options = {}) {
|
|
|
2847
3348
|
|
|
2848
3349
|
// src/commands/query.ts
|
|
2849
3350
|
import { existsSync as existsSync8 } from "fs";
|
|
2850
|
-
import
|
|
3351
|
+
import path22 from "path";
|
|
2851
3352
|
var PAGE_DIRS = [CONCEPTS_DIR, QUERIES_DIR];
|
|
2852
3353
|
var PAGE_SELECTION_TOOL = {
|
|
2853
3354
|
name: "select_pages",
|
|
@@ -2895,16 +3396,92 @@ ${indexContent}`;
|
|
|
2895
3396
|
/**
 * Render candidate pages as a markdown bullet list, one line per candidate:
 * `- **slug**: title — summary`.
 *
 * @param {Array<{slug: string, title: string, summary: string}>} candidates
 * @returns {string} Newline-joined list; empty string for no candidates.
 */
function buildFilteredIndex(candidates) {
  const lines = [];
  for (const { slug, title, summary } of candidates) {
    lines.push(`- **${slug}**: ${title} \u2014 ${summary}`);
  }
  return lines.join("\n");
}
|
|
2898
|
-
async function selectRelevantPages(root, question) {
|
|
3399
|
+
/**
 * Choose the wiki pages used to answer `question`, trying three strategies
 * in order:
 *  1. chunk-level retrieval (`trySelectViaChunks`);
 *  2. page pre-filter (`tryFindRelevantPages`) followed by `selectPages`
 *     over the filtered index;
 *  3. `selectPages` over the full index file, with results slugified.
 *
 * @param {string} root - Project root.
 * @param {string} question - The user's question.
 * @param {boolean} debug - Forwarded to the chunk-level path.
 * @returns {Promise<{pages: string[], rawPages: string[], reasoning: string, chunks: Array, debug?: object}>}
 */
async function selectRelevantPages(root, question, debug) {
  const viaChunks = await trySelectViaChunks(root, question, debug);
  if (viaChunks) return viaChunks;

  const candidates = await tryFindRelevantPages(root, question);
  const hasPrefilter = candidates.length > 0;
  const indexText = hasPrefilter
    ? buildFilteredIndex(candidates)
    : await safeReadFile(path22.join(root, INDEX_FILE));

  const { pages: rawPages, reasoning } = await selectPages(question, indexText);
  // Only the full-index path normalises the selected names to slugs.
  const pages = hasPrefilter ? rawPages : rawPages.map((p) => slugify(p));
  return { pages, rawPages, reasoning, chunks: [] };
}
|
|
3412
|
+
/**
 * Chunk-level page selection: fetch candidate chunks, rerank them with
 * `rerankWithBm25`, keep the top `CHUNK_RERANK_KEEP`, and collapse them to at
 * most `QUERY_PAGE_LIMIT` distinct page slugs.
 *
 * @param {string} root - Project root.
 * @param {string} question - The user's question.
 * @param {boolean} debug - When truthy, attach a debug snapshot to the result.
 * @returns {Promise<object|null>} Selection result, or null when no chunks
 *   were found so the caller can fall back to another strategy.
 */
async function trySelectViaChunks(root, question, debug) {
  const hits = await tryFindRelevantChunks(root, question);
  if (hits.length === 0) return null;

  const rerankInput = hits.map((hit) => ({
    text: hit.chunk.text,
    baseScore: hit.score,
    chunk: hit.chunk
  }));
  const kept = rerankWithBm25(question, rerankInput).slice(0, CHUNK_RERANK_KEEP);
  const keptChunks = kept.map((item) => item.candidate.chunk);
  const reorderingHappened = wasReordered(hits, keptChunks);

  const chunks = toChunkCitations(kept);
  const pages = collapseToPages(chunks, QUERY_PAGE_LIMIT);
  const reasoning = buildChunkReasoning(chunks, pages);
  return {
    pages,
    rawPages: pages,
    reasoning,
    chunks,
    debug: debug ? buildDebug(chunks, pages, reorderingHappened) : void 0
  };
}
|
|
3432
|
+
/**
 * Report whether reranking changed the order of the leading chunks.
 * Only the overlapping prefix of the two lists is compared, by identity.
 *
 * @param {Array<{chunk: object}>} before - Ranked hits prior to reranking.
 * @param {Array<object>} after - Chunk objects after reranking.
 * @returns {boolean} True when any compared position holds a different chunk.
 */
function wasReordered(before, after) {
  const compared = Math.min(before.length, after.length);
  return after
    .slice(0, compared)
    .some((chunk, position) => before[position].chunk !== chunk);
}
|
|
3439
|
+
/**
 * Flatten reranked items into plain citation records.
 * The score is the reranked score of the item, not the chunk's base score.
 *
 * @param {Array<{candidate: {chunk: object}, score: number}>} ranked
 * @returns {Array<{slug: string, title: string, chunkIndex: number, score: number, text: string}>}
 */
function toChunkCitations(ranked) {
  const citations = [];
  for (const item of ranked) {
    const { slug, title, chunkIndex, text } = item.candidate.chunk;
    citations.push({ slug, title, chunkIndex, score: item.score, text });
  }
  return citations;
}
|
|
3448
|
+
/**
 * Collapse ranked chunks into a list of distinct page slugs, keeping the
 * order in which each slug first appears and stopping at `limit` slugs.
 *
 * A non-positive `limit` yields an empty list. Previously the limit was only
 * checked after the first slug had been pushed, so `limit <= 0` still
 * returned one slug.
 *
 * @param {Array<{slug: string}>} chunks - Chunks in ranked order.
 * @param {number} limit - Maximum number of slugs to return.
 * @returns {string[]} Distinct slugs in first-seen order.
 */
function collapseToPages(chunks, limit) {
  if (limit <= 0) return [];
  // A Set iterates in insertion order, so it serves as both the "seen"
  // filter and the ordered result.
  const seen = new Set();
  for (const { slug } of chunks) {
    seen.add(slug);
    if (seen.size >= limit) break;
  }
  return [...seen];
}
|
|
3459
|
+
/**
 * Produce the human-readable reasoning line for chunk-based selection.
 * The summary lists the first `pages.length` chunks as
 * `slug#index (score)` with the score fixed to three decimals.
 *
 * @param {Array<{slug: string, chunkIndex: number, score: number}>} chunks
 * @param {string[]} pages - Selected page slugs.
 * @returns {string}
 */
function buildChunkReasoning(chunks, pages) {
  const pageCount = pages.length;
  const labels = [];
  for (const hit of chunks.slice(0, pageCount)) {
    labels.push(`${hit.slug}#${hit.chunkIndex} (${hit.score.toFixed(3)})`);
  }
  return `Selected ${pageCount} page(s) from ${chunks.length} reranked chunks: ${labels.join(", ")}`;
}
|
|
3464
|
+
/**
 * Assemble the retrieval debug snapshot for the chunk-level path.
 * Each selected page is annotated with the highest score among its chunks
 * (0 when the page has no chunk in `chunks`).
 *
 * @param {Array<{slug: string, score: number}>} chunks - Selected chunk citations.
 * @param {string[]} pageSlugs - Selected page slugs.
 * @param {boolean} reranked - Whether reranking changed the chunk order.
 * @returns {{pages: Array<{slug: string, score: number}>, chunks: Array, usedChunks: true, reranked: boolean}}
 */
function buildDebug(chunks, pageSlugs, reranked) {
  const topScore = new Map();
  for (const { slug, score } of chunks) {
    const best = topScore.get(slug);
    if (best === void 0 || score > best) topScore.set(slug, score);
  }
  const pages = pageSlugs.map((slug) => {
    const score = topScore.get(slug) ?? 0;
    return { slug, score };
  });
  return { pages, chunks, usedChunks: true, reranked };
}
|
|
3477
|
+
/**
 * Best-effort wrapper around `findRelevantChunks`. Any failure is reported
 * through `status` and converted into an empty result so the caller can fall
 * back to another selection strategy.
 *
 * @param {string} root - Project root.
 * @param {string} question - The user's question.
 * @returns {Promise<Array>} Ranked chunk hits, or [] when retrieval failed.
 */
async function tryFindRelevantChunks(root, question) {
  let hits = [];
  try {
    hits = await findRelevantChunks(root, question, CHUNK_TOP_K);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    status("!", dim(`Chunk pre-filter unavailable (${reason}); falling back.`));
  }
  return hits;
}
|
|
2909
3486
|
async function tryFindRelevantPages(root, question) {
|
|
2910
3487
|
try {
|
|
@@ -2920,7 +3497,7 @@ async function loadSelectedPages(root, slugs) {
|
|
|
2920
3497
|
for (const slug of slugs) {
|
|
2921
3498
|
let content = "";
|
|
2922
3499
|
for (const dir of PAGE_DIRS) {
|
|
2923
|
-
const candidate = await safeReadFile(
|
|
3500
|
+
const candidate = await safeReadFile(path22.join(root, dir, `${slug}.md`));
|
|
2924
3501
|
if (!candidate) continue;
|
|
2925
3502
|
const { meta } = parseFrontmatter(candidate);
|
|
2926
3503
|
if (meta.orphaned) continue;
|
|
@@ -2937,11 +3514,12 @@ ${content}`);
|
|
|
2937
3514
|
return sections.join("\n\n");
|
|
2938
3515
|
}
|
|
2939
3516
|
var ANSWER_SYSTEM_PROMPT = "You are a knowledge assistant. Answer the question using ONLY the wiki content provided. Cite specific pages using [[Page Title]] wikilinks. If the wiki doesn't contain enough information, say so.";
|
|
2940
|
-
async function callAnswerLLM(question, pagesContent, onToken) {
|
|
3517
|
+
async function callAnswerLLM(question, pagesContent, chunks, onToken) {
|
|
3518
|
+
const provenance = chunks.length > 0 ? buildChunkProvenance(chunks) : "";
|
|
2941
3519
|
const userMessage = `Question: ${question}
|
|
2942
3520
|
|
|
2943
3521
|
Relevant wiki pages:
|
|
2944
|
-
${pagesContent}`;
|
|
3522
|
+
${pagesContent}${provenance}`;
|
|
2945
3523
|
return callClaude({
|
|
2946
3524
|
system: ANSWER_SYSTEM_PROMPT,
|
|
2947
3525
|
messages: [{ role: "user", content: userMessage }],
|
|
@@ -2949,6 +3527,16 @@ ${pagesContent}`;
|
|
|
2949
3527
|
onToken
|
|
2950
3528
|
});
|
|
2951
3529
|
}
|
|
3530
|
+
/**
 * Format the selected chunks as an "excerpts" appendix for the answer prompt.
 * The result starts with a blank line so it can be appended directly after
 * the page content.
 *
 * @param {Array<{slug: string, chunkIndex: number, text: string}>} chunks
 * @returns {string}
 */
function buildChunkProvenance(chunks) {
  const excerpts = [];
  for (const { slug, chunkIndex, text } of chunks) {
    excerpts.push(`--- ${slug} (chunk ${chunkIndex}) ---\n${text}`);
  }
  return `\n\nMost relevant excerpts (from chunk-level retrieval):\n${excerpts.join("\n\n")}`;
}
|
|
2952
3540
|
function summarizeAnswer(answer) {
|
|
2953
3541
|
const firstLine = answer.trim().split(/\n/)[0] ?? "";
|
|
2954
3542
|
const firstSentence = firstLine.split(/(?<=[.!?])\s/)[0] ?? firstLine;
|
|
@@ -2956,7 +3544,7 @@ function summarizeAnswer(answer) {
|
|
|
2956
3544
|
}
|
|
2957
3545
|
async function saveQueryPage(root, question, answer) {
|
|
2958
3546
|
const slug = slugify(question);
|
|
2959
|
-
const filePath =
|
|
3547
|
+
const filePath = path22.join(root, QUERIES_DIR, `${slug}.md`);
|
|
2960
3548
|
const frontmatter = buildFrontmatter({
|
|
2961
3549
|
title: question,
|
|
2962
3550
|
summary: summarizeAnswer(answer),
|
|
@@ -2982,30 +3570,42 @@ ${answer}
|
|
|
2982
3570
|
return slug;
|
|
2983
3571
|
}
|
|
2984
3572
|
/**
 * Answer `question` from the wiki: select pages, load them, call the answer
 * LLM, and optionally save the answer as a query page.
 *
 * @param {string} root - Project root.
 * @param {string} question - The user's question.
 * @param {object} [options]
 * @param {boolean} [options.save] - Persist the answer via `saveQueryPage`.
 * @param {boolean} [options.debug] - Request a retrieval debug snapshot.
 * @param {Function} [options.onToken] - Forwarded to `callAnswerLLM`.
 * @param {Function} [options.onPageSelection] - Called with (pages, reasoning)
 *   once selection is done, before any page is loaded.
 * @returns {Promise<{answer: string, selectedPages: string[], reasoning: string, saved?: string, debug?: object}>}
 *   `answer` is "" when none of the selected pages yielded content.
 * @throws {Error} When the wiki index file does not exist.
 */
async function generateAnswer(root, question, options = {}) {
  const indexPath = path22.join(root, INDEX_FILE);
  if (!existsSync8(indexPath)) {
    throw new Error("Wiki index not found. Run `llmwiki compile` first.");
  }

  const selection = await selectRelevantPages(root, question, Boolean(options.debug));
  const { pages, reasoning, chunks, debug } = selection;
  options.onPageSelection?.(pages, reasoning);

  const pagesContent = await loadSelectedPages(root, pages);
  if (!pagesContent) return buildEmptyResult(selection);

  const answer = await callAnswerLLM(question, pagesContent, chunks, options.onToken);
  let saved;
  if (options.save) {
    saved = await saveQueryPage(root, question, answer);
  }
  return { answer, selectedPages: pages, reasoning, saved, debug };
}
|
|
3592
|
+
/**
 * Build the result returned when no page content could be loaded: an empty
 * answer that still carries the selection metadata.
 *
 * @param {{pages: string[], reasoning: string, debug?: object}} selection
 * @returns {{answer: string, selectedPages: string[], reasoning: string, debug: object|undefined}}
 */
function buildEmptyResult(selection) {
  const { pages: selectedPages, reasoning, debug } = selection;
  return { answer: "", selectedPages, reasoning, debug };
}
|
|
3001
3600
|
async function queryCommand(root, question, options) {
|
|
3002
|
-
if (!existsSync8(
|
|
3601
|
+
if (!existsSync8(path22.join(root, INDEX_FILE))) {
|
|
3003
3602
|
status("!", error("Wiki index not found. Run `llmwiki compile` first."));
|
|
3004
3603
|
return;
|
|
3005
3604
|
}
|
|
3006
3605
|
header("Selecting relevant pages");
|
|
3007
3606
|
const result = await generateAnswer(root, question, {
|
|
3008
3607
|
save: options.save,
|
|
3608
|
+
debug: options.debug,
|
|
3009
3609
|
onToken: (text) => process.stdout.write(text),
|
|
3010
3610
|
onPageSelection: (pages, reasoning) => {
|
|
3011
3611
|
status("i", dim(`Reasoning: ${reasoning}`));
|
|
@@ -3014,6 +3614,7 @@ async function queryCommand(root, question, options) {
|
|
|
3014
3614
|
}
|
|
3015
3615
|
});
|
|
3016
3616
|
process.stdout.write("\n");
|
|
3617
|
+
if (result.debug) printDebugSnapshot(result.debug);
|
|
3017
3618
|
if (!result.answer) {
|
|
3018
3619
|
status("!", error("No matching pages found. Try refining your question."));
|
|
3019
3620
|
return;
|
|
@@ -3024,14 +3625,34 @@ async function queryCommand(root, question, options) {
|
|
|
3024
3625
|
status("\u2192", dim("Tip: use --save to add this answer to your wiki"));
|
|
3025
3626
|
}
|
|
3026
3627
|
}
|
|
3628
|
+
/**
 * Print the retrieval debug snapshot: which retrieval source was used,
 * whether reranking changed the order, the best chunk score per selected
 * page, and a one-line preview of every selected chunk.
 *
 * The trailing ellipsis is only appended when the chunk text is longer than
 * the preview window. Previously it was appended unconditionally, which made
 * short chunks look truncated.
 *
 * @param {{usedChunks: boolean, reranked: boolean, pages: Array<{slug: string, score: number}>, chunks: Array<{slug: string, chunkIndex: number, score: number, text: string}>}} debug
 * @returns {void}
 */
function printDebugSnapshot(debug) {
  header("Retrieval debug");
  const sourceLabel = debug.usedChunks ? "chunk-level" : "page-level";
  const rerankLabel = debug.reranked ? "yes" : "no";
  status("i", dim(`Source: ${sourceLabel}; reranked: ${rerankLabel}`));

  for (const page of debug.pages) {
    status("\u2022", `${page.slug} (best chunk score ${page.score.toFixed(3)})`);
  }

  for (const chunk of debug.chunks) {
    const wasTruncated = chunk.text.length > DEBUG_CHUNK_PREVIEW_CHARS;
    // Slice first, then collapse whitespace, so the preview never exceeds the window.
    const preview = chunk.text.slice(0, DEBUG_CHUNK_PREVIEW_CHARS).replace(/\s+/g, " ").trim();
    const ellipsis = wasTruncated ? "\u2026" : "";
    status(
      "\xB7",
      dim(`${chunk.slug}#${chunk.chunkIndex} score=${chunk.score.toFixed(3)} :: ${preview}${ellipsis}`)
    );
  }
}
|
|
3647
|
+
var DEBUG_CHUNK_PREVIEW_CHARS = 120;
|
|
3027
3648
|
|
|
3028
3649
|
// src/commands/watch.ts
|
|
3029
3650
|
import { watch as chokidarWatch } from "chokidar";
|
|
3030
3651
|
import { existsSync as existsSync9 } from "fs";
|
|
3031
|
-
import
|
|
3652
|
+
import path23 from "path";
|
|
3032
3653
|
var DEBOUNCE_MS = 500;
|
|
3033
3654
|
async function watchCommand() {
|
|
3034
|
-
const sourcesPath =
|
|
3655
|
+
const sourcesPath = path23.resolve(SOURCES_DIR);
|
|
3035
3656
|
if (!existsSync9(sourcesPath)) {
|
|
3036
3657
|
status(
|
|
3037
3658
|
"!",
|
|
@@ -3066,7 +3687,7 @@ async function watchCommand() {
|
|
|
3066
3687
|
const scheduleCompile = (eventPath, event) => {
|
|
3067
3688
|
status(
|
|
3068
3689
|
"~",
|
|
3069
|
-
dim(`${event}: ${
|
|
3690
|
+
dim(`${event}: ${path23.basename(eventPath)}`)
|
|
3070
3691
|
);
|
|
3071
3692
|
if (debounceTimer) clearTimeout(debounceTimer);
|
|
3072
3693
|
debounceTimer = setTimeout(triggerCompile, DEBOUNCE_MS);
|
|
@@ -3153,7 +3774,7 @@ async function lintCommand() {
|
|
|
3153
3774
|
// src/commands/schema.ts
|
|
3154
3775
|
import { existsSync as existsSync10 } from "fs";
|
|
3155
3776
|
import { mkdir as mkdir6, writeFile as writeFile5 } from "fs/promises";
|
|
3156
|
-
import
|
|
3777
|
+
import path24 from "path";
|
|
3157
3778
|
async function schemaInitCommand() {
|
|
3158
3779
|
const root = process.cwd();
|
|
3159
3780
|
const defaults = buildDefaultSchema();
|
|
@@ -3162,7 +3783,7 @@ async function schemaInitCommand() {
|
|
|
3162
3783
|
status("!", warn(`Schema file already exists at ${targetPath}`));
|
|
3163
3784
|
return;
|
|
3164
3785
|
}
|
|
3165
|
-
await mkdir6(
|
|
3786
|
+
await mkdir6(path24.dirname(targetPath), { recursive: true });
|
|
3166
3787
|
const serializable = {
|
|
3167
3788
|
version: defaults.version,
|
|
3168
3789
|
defaultKind: defaults.defaultKind,
|
|
@@ -3221,7 +3842,7 @@ async function reviewShowCommand(id) {
|
|
|
3221
3842
|
}
|
|
3222
3843
|
|
|
3223
3844
|
// src/commands/review-approve.ts
|
|
3224
|
-
import
|
|
3845
|
+
import path25 from "path";
|
|
3225
3846
|
|
|
3226
3847
|
// src/commands/review-helpers.ts
|
|
3227
3848
|
async function runReviewUnderLock(id, underLock) {
|
|
@@ -3253,7 +3874,7 @@ async function approveUnderLock(root, id) {
|
|
|
3253
3874
|
process.exitCode = 1;
|
|
3254
3875
|
return;
|
|
3255
3876
|
}
|
|
3256
|
-
const pagePath =
|
|
3877
|
+
const pagePath = path25.join(root, CONCEPTS_DIR, `${candidate.slug}.md`);
|
|
3257
3878
|
await atomicWrite(pagePath, candidate.body);
|
|
3258
3879
|
status("+", success(`Approved \u2192 ${source(pagePath)}`));
|
|
3259
3880
|
await persistCandidateSourceStates(root, candidate);
|
|
@@ -3313,7 +3934,7 @@ import { McpServer as McpServer2 } from "@modelcontextprotocol/sdk/server/mcp.js
|
|
|
3313
3934
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
3314
3935
|
|
|
3315
3936
|
// src/mcp/tools.ts
|
|
3316
|
-
import
|
|
3937
|
+
import path26 from "path";
|
|
3317
3938
|
import { z } from "zod";
|
|
3318
3939
|
|
|
3319
3940
|
// src/mcp/provider-check.ts
|
|
@@ -3406,15 +4027,16 @@ function registerQueryTool(server, root) {
|
|
|
3406
4027
|
"query_wiki",
|
|
3407
4028
|
{
|
|
3408
4029
|
title: "Query Wiki",
|
|
3409
|
-
description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Requires an LLM provider.",
|
|
4030
|
+
description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Set debug=true to include the selected chunks and their scores. Requires an LLM provider.",
|
|
3410
4031
|
inputSchema: {
|
|
3411
4032
|
question: z.string().describe("The natural-language question to answer."),
|
|
3412
|
-
save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true.")
|
|
4033
|
+
save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true."),
|
|
4034
|
+
debug: z.boolean().optional().describe("Include retrieval debug info (selected chunks/pages + scores).")
|
|
3413
4035
|
}
|
|
3414
4036
|
},
|
|
3415
|
-
async ({ question, save }) => {
|
|
4037
|
+
async ({ question, save, debug }) => {
|
|
3416
4038
|
ensureProviderAvailable();
|
|
3417
|
-
const result = await generateAnswer(root, question, { save });
|
|
4039
|
+
const result = await generateAnswer(root, question, { save, debug });
|
|
3418
4040
|
return jsonResult(result);
|
|
3419
4041
|
}
|
|
3420
4042
|
);
|
|
@@ -3438,15 +4060,30 @@ function registerSearchTool(server, root) {
|
|
|
3438
4060
|
);
|
|
3439
4061
|
}
|
|
3440
4062
|
/**
 * Pick page slugs for the search tool, trying progressively broader
 * strategies:
 *  1. chunk-level retrieval, de-duplicated to page slugs in ranked order;
 *  2. page-level retrieval via `findRelevantPages`;
 *  3. `selectPages` over the full index file.
 *
 * The first two tiers are best-effort: an error in either one is ignored and
 * the next tier is tried.
 *
 * @param {string} root - Project root.
 * @param {string} question - The search query.
 * @returns {Promise<string[]>}
 */
async function pickSearchSlugs(root, question) {
  try {
    const hits = await findRelevantChunks(root, question, CHUNK_TOP_K);
    if (hits.length > 0) {
      const slugsInRankOrder = hits.map((hit) => hit.chunk.slug);
      return dedupePreservingOrder(slugsInRankOrder);
    }
  } catch {
    // Chunk retrieval unavailable; fall through to page-level retrieval.
  }

  try {
    const pageHits = await findRelevantPages(root, question);
    if (pageHits.length > 0) {
      return pageHits.map((hit) => hit.slug);
    }
  } catch {
    // Page retrieval unavailable; fall through to the full index.
  }

  const indexContent = await safeReadFile(path26.join(root, INDEX_FILE));
  const selection = await selectPages(question, indexContent);
  return selection.pages;
}
|
|
4077
|
+
/**
 * Remove duplicate slugs while keeping the position of each first occurrence.
 *
 * @param {Iterable<string>} slugs
 * @returns {string[]} A new array of distinct slugs in first-seen order.
 */
function dedupePreservingOrder(slugs) {
  // A Set iterates in insertion order, so spreading it yields first-seen order.
  return [...new Set(slugs)];
}
|
|
3450
4087
|
function registerReadTool(server, root) {
|
|
3451
4088
|
server.registerTool(
|
|
3452
4089
|
"read_page",
|
|
@@ -3492,8 +4129,8 @@ function registerStatusTool(server, root) {
|
|
|
3492
4129
|
);
|
|
3493
4130
|
}
|
|
3494
4131
|
async function collectStatus(root) {
|
|
3495
|
-
const concepts = await collectPageSummaries(
|
|
3496
|
-
const queries = await collectPageSummaries(
|
|
4132
|
+
const concepts = await collectPageSummaries(path26.join(root, CONCEPTS_DIR));
|
|
4133
|
+
const queries = await collectPageSummaries(path26.join(root, QUERIES_DIR));
|
|
3497
4134
|
const state = await readState(root);
|
|
3498
4135
|
const changes = await detectChanges(root, state);
|
|
3499
4136
|
const orphans = await findOrphanedSlugs(root);
|
|
@@ -3510,7 +4147,7 @@ async function collectStatus(root) {
|
|
|
3510
4147
|
};
|
|
3511
4148
|
}
|
|
3512
4149
|
/**
 * List the slugs of concept pages whose frontmatter marks them as orphaned.
 *
 * @param {string} root - Project root.
 * @returns {Promise<string[]>}
 */
async function findOrphanedSlugs(root) {
  const conceptPages = await scanWikiPages(path26.join(root, CONCEPTS_DIR));
  const orphaned = [];
  for (const page of conceptPages) {
    if (page.meta.orphaned) orphaned.push(page.slug);
  }
  return orphaned;
}
|
|
3516
4153
|
async function loadPageRecords(root, slugs) {
|
|
@@ -3523,7 +4160,7 @@ async function loadPageRecords(root, slugs) {
|
|
|
3523
4160
|
}
|
|
3524
4161
|
async function readPage(root, slug) {
|
|
3525
4162
|
for (const dir of PAGE_DIRS2) {
|
|
3526
|
-
const content = await safeReadFile(
|
|
4163
|
+
const content = await safeReadFile(path26.join(root, dir, `${slug}.md`));
|
|
3527
4164
|
if (!content) continue;
|
|
3528
4165
|
const { meta, body } = parseFrontmatter(content);
|
|
3529
4166
|
if (meta.orphaned) continue;
|
|
@@ -3538,7 +4175,7 @@ async function readPage(root, slug) {
|
|
|
3538
4175
|
}
|
|
3539
4176
|
|
|
3540
4177
|
// src/mcp/resources.ts
|
|
3541
|
-
import
|
|
4178
|
+
import path27 from "path";
|
|
3542
4179
|
import { readdir as readdir9 } from "fs/promises";
|
|
3543
4180
|
import { ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
3544
4181
|
function jsonContent(uri, payload) {
|
|
@@ -3572,7 +4209,7 @@ function registerIndexResource(server, root) {
|
|
|
3572
4209
|
mimeType: "text/markdown"
|
|
3573
4210
|
},
|
|
3574
4211
|
async (uri) => {
|
|
3575
|
-
const content = await safeReadFile(
|
|
4212
|
+
const content = await safeReadFile(path27.join(root, INDEX_FILE));
|
|
3576
4213
|
return { contents: [markdownContent(uri, content)] };
|
|
3577
4214
|
}
|
|
3578
4215
|
);
|
|
@@ -3639,7 +4276,7 @@ function registerQueryResource(server, root) {
|
|
|
3639
4276
|
);
|
|
3640
4277
|
}
|
|
3641
4278
|
async function listSources(root) {
|
|
3642
|
-
const sourcesPath =
|
|
4279
|
+
const sourcesPath = path27.join(root, SOURCES_DIR);
|
|
3643
4280
|
let files;
|
|
3644
4281
|
try {
|
|
3645
4282
|
files = await readdir9(sourcesPath);
|
|
@@ -3648,14 +4285,14 @@ async function listSources(root) {
|
|
|
3648
4285
|
}
|
|
3649
4286
|
const records = [];
|
|
3650
4287
|
for (const file of files.filter((f) => f.endsWith(".md"))) {
|
|
3651
|
-
const content = await safeReadFile(
|
|
4288
|
+
const content = await safeReadFile(path27.join(sourcesPath, file));
|
|
3652
4289
|
const { meta } = parseFrontmatter(content);
|
|
3653
4290
|
records.push({ filename: file, ...meta });
|
|
3654
4291
|
}
|
|
3655
4292
|
return records;
|
|
3656
4293
|
}
|
|
3657
4294
|
async function loadPageWithMeta(root, dir, slug) {
|
|
3658
|
-
const filePath =
|
|
4295
|
+
const filePath = path27.join(root, dir, `${slug}.md`);
|
|
3659
4296
|
const content = await safeReadFile(filePath);
|
|
3660
4297
|
if (!content) {
|
|
3661
4298
|
throw new Error(`Page not found: ${dir}/${slug}.md`);
|
|
@@ -3664,7 +4301,7 @@ async function loadPageWithMeta(root, dir, slug) {
|
|
|
3664
4301
|
return { slug, meta, body: body.trim() };
|
|
3665
4302
|
}
|
|
3666
4303
|
async function listPagesUnder(root, dir, scheme) {
|
|
3667
|
-
const pagesPath =
|
|
4304
|
+
const pagesPath = path27.join(root, dir);
|
|
3668
4305
|
let files;
|
|
3669
4306
|
try {
|
|
3670
4307
|
files = await readdir9(pagesPath);
|
|
@@ -3748,7 +4385,7 @@ reviewCommand.command("reject <id>").description("Reject a candidate and archive
|
|
|
3748
4385
|
process.exit(1);
|
|
3749
4386
|
}
|
|
3750
4387
|
});
|
|
3751
|
-
program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").action(async (question, options) => {
|
|
4388
|
+
program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").option("--debug", "Print which pages and chunks were selected and their scores").action(async (question, options) => {
|
|
3752
4389
|
try {
|
|
3753
4390
|
requireProvider();
|
|
3754
4391
|
await queryCommand(process.cwd(), question, options);
|