llm-wiki-compiler 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -4
- package/dist/cli.js +1007 -371
- package/dist/cli.js.map +1 -1
- package/package.json +6 -3
package/dist/cli.js
CHANGED
|
@@ -6,8 +6,8 @@ import { createRequire } from "module";
|
|
|
6
6
|
import { Command } from "commander";
|
|
7
7
|
|
|
8
8
|
// src/commands/ingest.ts
|
|
9
|
-
import
|
|
10
|
-
import { mkdir as mkdir2, writeFile as writeFile2 } from "fs/promises";
|
|
9
|
+
import path7 from "path";
|
|
10
|
+
import { mkdir as mkdir2, readFile as readFile6, writeFile as writeFile2 } from "fs/promises";
|
|
11
11
|
|
|
12
12
|
// src/utils/markdown.ts
|
|
13
13
|
import { writeFile, rename, readFile, mkdir } from "fs/promises";
|
|
@@ -150,9 +150,17 @@ var LOCK_FILE = ".llmwiki/lock";
|
|
|
150
150
|
var INDEX_FILE = "wiki/index.md";
|
|
151
151
|
var MOC_FILE = "wiki/MOC.md";
|
|
152
152
|
var EMBEDDINGS_FILE = ".llmwiki/embeddings.json";
|
|
153
|
+
var IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".jpg", ".jpeg", ".png", ".gif", ".webp"]);
|
|
154
|
+
var TRANSCRIPT_EXTENSIONS = /* @__PURE__ */ new Set([".vtt", ".srt"]);
|
|
155
|
+
var IMAGE_DESCRIBE_MAX_TOKENS = 2048;
|
|
153
156
|
var CANDIDATES_DIR = ".llmwiki/candidates";
|
|
154
157
|
var CANDIDATES_ARCHIVE_DIR = ".llmwiki/candidates/archive";
|
|
155
158
|
var EMBEDDING_TOP_K = 15;
|
|
159
|
+
var CHUNK_TOP_K = 30;
|
|
160
|
+
var CHUNK_RERANK_KEEP = 12;
|
|
161
|
+
var CHUNK_TARGET_CHARS = 800;
|
|
162
|
+
var CHUNK_MAX_CHARS = 1400;
|
|
163
|
+
var CHUNK_MIN_CHARS = 200;
|
|
156
164
|
var LOW_CONFIDENCE_THRESHOLD = 0.5;
|
|
157
165
|
var MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS = 2;
|
|
158
166
|
var EMBEDDING_MODELS = {
|
|
@@ -237,19 +245,24 @@ async function ingestWeb(url) {
|
|
|
237
245
|
|
|
238
246
|
// src/ingest/file.ts
|
|
239
247
|
import { readFile as readFile2 } from "fs/promises";
|
|
248
|
+
import path3 from "path";
|
|
249
|
+
|
|
250
|
+
// src/ingest/shared.ts
|
|
240
251
|
import path2 from "path";
|
|
241
|
-
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
|
|
242
252
|
/**
 * Derive a human-readable title from a file path.
 *
 * Drops the directory and the extension, then collapses every run of
 * hyphens/underscores into a single space.
 *
 * @param {string} filePath - Path (or bare file name) to derive the title from.
 * @returns {string} The trimmed, space-separated title.
 */
function titleFromFilename(filePath) {
  const extension = path2.extname(filePath);
  const stem = path2.basename(filePath, extension);
  const spaced = stem.replace(/[-_]+/g, " ");
  return spaced.trim();
}
|
|
256
|
+
|
|
257
|
+
// src/ingest/file.ts
|
|
258
|
+
var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([".md", ".txt"]);
|
|
246
259
|
/**
 * Wrap raw text in a fenced markdown code block.
 *
 * @param {string} text - Text to place between the fences.
 * @returns {string} The fence line, the text, and the closing fence, newline-separated.
 */
function wrapPlainText(text) {
  const fence = "```";
  return `${fence}\n${text}\n${fence}`;
}
|
|
251
264
|
async function ingestFile(filePath) {
|
|
252
|
-
const ext =
|
|
265
|
+
const ext = path3.extname(filePath).toLowerCase();
|
|
253
266
|
if (!SUPPORTED_EXTENSIONS.has(ext)) {
|
|
254
267
|
throw new Error(
|
|
255
268
|
`Unsupported file type "${ext}". Only .md and .txt files are supported.`
|
|
@@ -261,10 +274,439 @@ async function ingestFile(filePath) {
|
|
|
261
274
|
return { title, content };
|
|
262
275
|
}
|
|
263
276
|
|
|
277
|
+
// src/ingest/pdf.ts
|
|
278
|
+
import { readFile as readFile3 } from "fs/promises";
|
|
279
|
+
/**
 * Choose a title for a PDF: the embedded "Title" metadata when it is a
 * non-blank string, otherwise a title derived from the file name.
 *
 * @param {string} filePath - Path of the PDF, used for the fallback title.
 * @param {unknown} info2 - Metadata dictionary from the parser (may be absent).
 * @returns {string} The resolved title.
 */
function resolveTitle(filePath, info2) {
  const embedded = info2 !== null && typeof info2 === "object" ? info2["Title"] : undefined;
  const cleaned = typeof embedded === "string" ? embedded.trim() : "";
  return cleaned.length > 0 ? cleaned : titleFromFilename(filePath);
}
|
|
288
|
+
/**
 * Extract the text and a title from a PDF file.
 *
 * The parser module is loaded lazily via dynamic import, and the parser is
 * always destroyed, even when extraction throws.
 *
 * @param {string} filePath - Path of the PDF to read.
 * @returns {Promise<{title: string, content: string}>} Title and trimmed text.
 */
async function ingestPdf(filePath) {
  const pdfModule = await import("pdf-parse");
  const bytes = new Uint8Array(await readFile3(filePath));
  const parser = new pdfModule.PDFParse({ data: bytes });
  try {
    const extracted = await parser.getText();
    const metadata = await parser.getInfo();
    return {
      title: resolveTitle(filePath, metadata.info),
      content: extracted.text.trim()
    };
  } finally {
    // Release parser resources regardless of the outcome.
    await parser.destroy();
  }
}
|
|
302
|
+
|
|
303
|
+
// src/ingest/image.ts
|
|
304
|
+
import { readFile as readFile4 } from "fs/promises";
|
|
305
|
+
import path5 from "path";
|
|
306
|
+
import Anthropic2 from "@anthropic-ai/sdk";
|
|
307
|
+
|
|
308
|
+
// src/providers/anthropic.ts
|
|
309
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
310
|
+
var VOYAGE_EMBEDDINGS_URL = "https://api.voyageai.com/v1/embeddings";
|
|
311
|
+
/**
 * Build the options object for the Anthropic SDK client.
 *
 * Each value is trimmed and only copied when non-empty. A single trailing
 * slash is removed from the base URL (a lone "/" is kept as-is).
 *
 * @param {{baseURL?: string, apiKey?: string, authToken?: string}} [options]
 * @returns {{apiKey?: string, authToken?: string, baseURL?: string}} Options containing only the populated fields.
 */
function buildAnthropicClientOptions(options = {}) {
  const baseURL = options.baseURL?.trim();
  const apiKey = options.apiKey?.trim();
  const authToken = options.authToken?.trim();
  const clientOptions = {};
  if (apiKey) clientOptions.apiKey = apiKey;
  if (authToken) clientOptions.authToken = authToken;
  if (baseURL) {
    const hasTrailingSlash = baseURL.endsWith("/") && baseURL.length > 1;
    clientOptions.baseURL = hasTrailingSlash ? baseURL.slice(0, -1) : baseURL;
  }
  return clientOptions;
}
|
|
329
|
+
/**
 * Provider backed by the Anthropic Messages API, with embeddings delegated
 * to the Voyage HTTP API.
 */
var AnthropicProvider = class {
  client;
  model;
  /**
   * @param {string} model - Model identifier sent with every request.
   * @param {object} [options] - Raw client options (baseURL / apiKey / authToken).
   */
  constructor(model, options = {}) {
    this.model = model;
    this.client = new Anthropic(buildAnthropicClientOptions(options));
  }
  /**
   * Send one non-streaming completion request.
   * @returns {Promise<string>} Text of the first text block, or "" when there is none.
   */
  async complete(system, messages, maxTokens) {
    const request = { model: this.model, max_tokens: maxTokens, system, messages };
    const { content } = await this.client.messages.create(request);
    const firstText = content.find((block) => block.type === "text");
    return firstText ? firstText.text : "";
  }
  /**
   * Stream a completion, forwarding every text delta to onToken (when given).
   * @returns {Promise<string>} The concatenation of all text deltas.
   */
  async stream(system, messages, maxTokens, onToken) {
    const events = this.client.messages.stream({
      model: this.model,
      max_tokens: maxTokens,
      system,
      messages
    });
    let collected = "";
    for await (const event of events) {
      if (event.type !== "content_block_delta") continue;
      if (event.delta.type !== "text_delta") continue;
      collected += event.delta.text;
      onToken?.(event.delta.text);
    }
    return collected;
  }
  /**
   * Call the model with tool definitions.
   * @returns {Promise<string>} JSON of the first tool_use input; otherwise the
   *   first text block's text, or "" when neither is present.
   */
  async toolCall(system, messages, tools, maxTokens) {
    // Forward only the fields the API expects from each tool definition.
    const anthropicTools = tools.map(({ name, description, input_schema }) => ({
      name,
      description,
      input_schema
    }));
    const { content } = await this.client.messages.create({
      model: this.model,
      max_tokens: maxTokens,
      system,
      messages,
      tools: anthropicTools
    });
    const toolUse = content.find((block) => block.type === "tool_use");
    if (toolUse) {
      return JSON.stringify(toolUse.input);
    }
    const firstText = content.find((block) => block.type === "text");
    return firstText ? firstText.text : "";
  }
  /**
   * Produce a single embedding vector via the Voyage API.
   *
   * Anthropic does not ship a first-party embeddings endpoint, so this
   * delegates to Voyage. Requires VOYAGE_API_KEY.
   *
   * @param {string} text - Text to embed.
   * @returns {Promise<number[]>} The embedding of the first result.
   * @throws {Error} When the key is missing, the request fails, or no vector is returned.
   */
  async embed(text) {
    const voyageKey = process.env.VOYAGE_API_KEY?.trim();
    if (!voyageKey) {
      throw new Error(
        "VOYAGE_API_KEY is not set. Anthropic embeddings use Voyage \u2014 set VOYAGE_API_KEY to enable semantic search."
      );
    }
    const payload = JSON.stringify({ input: text, model: EMBEDDING_MODELS.anthropic });
    const response = await fetch(VOYAGE_EMBEDDINGS_URL, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${voyageKey}`
      },
      body: payload
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`Voyage embeddings request failed (${response.status}): ${detail}`);
    }
    const parsedBody = await response.json();
    const vector = parsedBody.data?.[0]?.embedding;
    if (Array.isArray(vector)) {
      return vector;
    }
    throw new Error("Voyage embeddings response did not include a vector.");
  }
};
|
|
418
|
+
|
|
419
|
+
// src/utils/claude-settings.ts
|
|
420
|
+
import { readFileSync } from "fs";
|
|
421
|
+
import { homedir } from "os";
|
|
422
|
+
import path4 from "path";
|
|
423
|
+
var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
|
|
424
|
+
/**
 * Report whether a value is a non-null object (arrays included).
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} True for any object other than null.
 */
function isRecord(value) {
  return value !== null && typeof value === "object";
}
|
|
427
|
+
/**
 * Trim a string setting and treat blank or non-string values as absent.
 *
 * @param {unknown} value - Raw setting value.
 * @returns {string | undefined} The trimmed string, or undefined when the
 *   value is not a string or is empty after trimming.
 */
function normalize(value) {
  if (typeof value === "string") {
    const stripped = value.trim();
    if (stripped.length > 0) {
      return stripped;
    }
  }
  return undefined;
}
|
|
432
|
+
/**
 * Locate the Claude settings file.
 *
 * Uses the override stored under the CLAUDE_SETTINGS_PATH_ENV key of `env`
 * when it is set (even to an empty string); otherwise falls back to
 * `.claude/settings.json` inside the user's home directory.
 *
 * @param {Record<string, string | undefined>} env - Environment to read the override from.
 * @returns {string} Path of the settings file.
 */
function resolveClaudeSettingsPath(env) {
  const override = env[CLAUDE_SETTINGS_PATH_ENV];
  if (override !== undefined && override !== null) {
    return override;
  }
  return path4.join(homedir(), ".claude", "settings.json");
}
|
|
435
|
+
/**
 * Read the Claude settings file as UTF-8 text.
 *
 * @param {string} settingsPath - Path of the settings file.
 * @returns {string | undefined} File contents, or undefined when the file does not exist.
 * @throws {Error} For any read failure other than ENOENT, with the path in the message.
 */
function readClaudeSettingsFile(settingsPath) {
  let contents;
  try {
    contents = readFileSync(settingsPath, "utf8");
  } catch (err) {
    // A missing file just means there is nothing to fall back on.
    const isMissingFile = typeof err === "object" && err !== null && err.code === "ENOENT";
    if (isMissingFile) {
      return undefined;
    }
    const message = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to read Claude settings at "${settingsPath}": ${message}`);
  }
  return contents;
}
|
|
446
|
+
/**
 * Load the Anthropic-related entries from the `env` section of the Claude
 * settings file.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment used to locate the file.
 * @returns {{ANTHROPIC_API_KEY?: string, ANTHROPIC_AUTH_TOKEN?: string, ANTHROPIC_BASE_URL?: string, ANTHROPIC_MODEL?: string} | undefined}
 *   The normalized values, or undefined when the file is missing or empty, has
 *   no object-valued `env` section, or none of the four keys is set.
 * @throws {Error} When the file cannot be read or is not valid JSON.
 */
function readClaudeSettingsEnv(env = process.env) {
  const settingsPath = resolveClaudeSettingsPath(env);
  const raw = readClaudeSettingsFile(settingsPath);
  if (!raw) {
    return undefined;
  }
  let settings;
  try {
    settings = JSON.parse(raw);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${message}`);
  }
  if (!(isRecord(settings) && isRecord(settings.env))) {
    return undefined;
  }
  const wantedKeys = [
    "ANTHROPIC_API_KEY",
    "ANTHROPIC_AUTH_TOKEN",
    "ANTHROPIC_BASE_URL",
    "ANTHROPIC_MODEL"
  ];
  const values = {};
  for (const key of wantedKeys) {
    values[key] = normalize(settings.env[key]);
  }
  // Report "nothing configured" rather than an object of undefineds.
  const hasAnyValue = wantedKeys.some((key) => values[key]);
  return hasAnyValue ? values : undefined;
}
|
|
471
|
+
/**
 * Best-effort variant of readClaudeSettingsEnv: any failure while reading
 * or parsing the settings is deliberately treated as "no settings".
 *
 * @param {Record<string, string | undefined>} [env] - Environment forwarded to readClaudeSettingsEnv.
 * @returns {object | undefined} The settings values, or undefined on absence or error.
 */
function tryReadClaudeSettingsEnv(env) {
  let settings;
  try {
    settings = readClaudeSettingsEnv(env);
  } catch {
    settings = undefined;
  }
  return settings;
}
|
|
478
|
+
/**
 * Validate that a base URL parses and uses http or https.
 *
 * @param {string} value - Candidate base URL.
 * @returns {string} The trimmed URL.
 * @throws {Error} "Invalid ANTHROPIC_BASE_URL: ..." when the value does not
 *   parse as a URL or uses another protocol.
 */
function validateAnthropicBaseURL(value) {
  const candidate = value.trim();
  let reason;
  try {
    const { protocol } = new URL(candidate);
    if (protocol !== "http:" && protocol !== "https:") {
      reason = "Must use http:// or https:// protocol.";
    }
  } catch (err) {
    reason = err instanceof Error ? err.message : "Must be a valid http(s) URL.";
  }
  if (reason !== undefined) {
    throw new Error(`Invalid ANTHROPIC_BASE_URL: "${candidate}". ${reason}`);
  }
  return candidate;
}
|
|
491
|
+
/**
 * Resolve Anthropic credentials.
 *
 * Precedence: ANTHROPIC_API_KEY, then ANTHROPIC_AUTH_TOKEN from `env`; if
 * neither is set, the same two keys from the Claude settings file. The
 * settings file is only read when the environment provides nothing.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 * @returns {{apiKey?: string, authToken?: string}} One credential, or {} when none is found.
 */
function resolveAnthropicAuthFromEnv(env = process.env) {
  // An API key always wins over an auth token from the same origin.
  const toCredentials = (apiKey, authToken) => {
    if (apiKey) return { apiKey };
    if (authToken) return { authToken };
    return undefined;
  };
  const explicit = toCredentials(
    normalize(env.ANTHROPIC_API_KEY),
    normalize(env.ANTHROPIC_AUTH_TOKEN)
  );
  if (explicit) {
    return explicit;
  }
  const settings = readClaudeSettingsEnv(env);
  return toCredentials(settings?.ANTHROPIC_API_KEY, settings?.ANTHROPIC_AUTH_TOKEN) ?? {};
}
|
|
501
|
+
/**
 * Resolve the model name: LLMWIKI_MODEL from `env` when defined, otherwise
 * ANTHROPIC_MODEL from the Claude settings file (read best-effort).
 *
 * NOTE(review): the explicit value is returned untrimmed, so an empty
 * LLMWIKI_MODEL yields "" rather than falling back. Confirm this is intended.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 * @returns {string | undefined} The model name, or undefined when none is configured.
 */
function resolveAnthropicModelFromEnv(env = process.env) {
  const { LLMWIKI_MODEL: explicitModel } = env;
  if (explicitModel === undefined) {
    const settings = tryReadClaudeSettingsEnv(env);
    return settings?.ANTHROPIC_MODEL;
  }
  return explicitModel;
}
|
|
506
|
+
/**
 * Resolve and validate the Anthropic base URL.
 *
 * ANTHROPIC_BASE_URL from `env` takes precedence; otherwise the value from
 * the Claude settings file (read best-effort) is used.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 * @returns {string | undefined} The validated URL, or undefined when none is configured.
 * @throws {Error} When the chosen value fails validation.
 */
function resolveAnthropicBaseURLFromEnv(env = process.env) {
  let candidate = normalize(env.ANTHROPIC_BASE_URL);
  if (!candidate) {
    // Only consult the settings file when the environment has no value.
    candidate = tryReadClaudeSettingsEnv(env)?.ANTHROPIC_BASE_URL;
  }
  return candidate ? validateAnthropicBaseURL(candidate) : undefined;
}
|
|
513
|
+
|
|
514
|
+
// src/ingest/image.ts
|
|
515
|
+
/** Supported image extensions (lower-case, with leading dot) and their MIME types. */
var EXTENSION_TO_MIME = {
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".png": "image/png",
  ".gif": "image/gif",
  ".webp": "image/webp"
};
/**
 * Map an image file extension to its MIME type (case-insensitive).
 *
 * Only own keys of EXTENSION_TO_MIME are consulted, so inherited names such
 * as "constructor" or "__proto__" are rejected like any other unsupported
 * extension instead of leaking a prototype member to the caller.
 *
 * @param {string} ext - Extension including the leading dot, e.g. ".png".
 * @returns {string} The matching MIME type.
 * @throws {Error} When the extension is not one of the supported image types.
 */
function mimeTypeForExtension(ext) {
  const key = ext.toLowerCase();
  if (!Object.hasOwn(EXTENSION_TO_MIME, key)) {
    throw new Error(
      `Unsupported image extension "${ext}". Supported: ${Object.keys(EXTENSION_TO_MIME).join(", ")}`
    );
  }
  return EXTENSION_TO_MIME[key];
}
|
|
531
|
+
/**
 * Create an Anthropic SDK client configured from the environment, with the
 * Claude settings file as the fallback used by the resolvers.
 *
 * @returns {object} A configured Anthropic client instance.
 */
function buildClient() {
  // The base URL is resolved (and validated) before the credentials.
  const clientOptions = buildAnthropicClientOptions({
    baseURL: resolveAnthropicBaseURLFromEnv(),
    ...resolveAnthropicAuthFromEnv()
  });
  return new Anthropic2(clientOptions);
}
|
|
536
|
+
/**
 * Ask a vision-capable model to transcribe and describe an image.
 *
 * @param {object} client - Client exposing `messages.create`.
 * @param {string} model - Model identifier.
 * @param {string} imageData - Base64-encoded image bytes.
 * @param {string} mimeType - MIME type of the image.
 * @returns {Promise<string>} Text of the first text block in the response, or "".
 */
async function describeImageWithVision(client, model, imageData, mimeType) {
  const imagePart = {
    type: "image",
    source: { type: "base64", media_type: mimeType, data: imageData }
  };
  const instructionPart = {
    type: "text",
    text: "Extract and transcribe all text visible in this image. Then provide a detailed description of any non-text visual content. Format your response as markdown."
  };
  const { content } = await client.messages.create({
    model,
    max_tokens: IMAGE_DESCRIBE_MAX_TOKENS,
    messages: [{ role: "user", content: [imagePart, instructionPart] }]
  });
  const firstText = content.find((block) => block.type === "text");
  return firstText ? firstText.text : "";
}
|
|
559
|
+
/**
 * Ingest an image file by asking the Anthropic vision model to transcribe
 * and describe it.
 *
 * @param {string} filePath - Path of the image file.
 * @returns {Promise<{title: string, content: string}>} Title from the file name and the model's description.
 * @throws {Error} When LLMWIKI_PROVIDER names another provider or the extension is unsupported.
 */
async function ingestImage(filePath) {
  const providerName = process.env.LLMWIKI_PROVIDER ?? "anthropic";
  const isAnthropic = providerName === "anthropic";
  if (!isAnthropic) {
    throw new Error(
      `Image ingest requires the Anthropic provider (vision). Current provider: "${providerName}". Set LLMWIKI_PROVIDER=anthropic and ANTHROPIC_API_KEY to use image ingest.`
    );
  }
  // Reject unsupported extensions before touching the file.
  const mimeType = mimeTypeForExtension(path5.extname(filePath).toLowerCase());
  const imageData = (await readFile4(filePath)).toString("base64");
  const client = buildClient();
  const model = resolveAnthropicModelFromEnv() ?? PROVIDER_MODELS.anthropic;
  const content = await describeImageWithVision(client, model, imageData, mimeType);
  return { title: titleFromFilename(filePath), content };
}
|
|
576
|
+
|
|
577
|
+
// src/ingest/transcript.ts
|
|
578
|
+
import { readFile as readFile5 } from "fs/promises";
|
|
579
|
+
import path6 from "path";
|
|
580
|
+
import { YoutubeTranscript } from "youtube-transcript";
|
|
581
|
+
var YOUTUBE_URL_PATTERN = /^https?:\/\/(www\.)?(youtube\.com\/watch|youtu\.be\/)/;
|
|
582
|
+
var SRT_SEQUENCE_PATTERN = /^\d+$/;
|
|
583
|
+
var TIMESTAMP_PATTERN = /\d{2}:\d{2}[:.]\d{2}/;
|
|
584
|
+
var MS_PER_MINUTE = 6e4;
|
|
585
|
+
var MS_PER_SECOND = 1e3;
|
|
586
|
+
/**
 * Report whether a source string matches YOUTUBE_URL_PATTERN.
 *
 * @param {string} source2 - Source string supplied by the user.
 * @returns {boolean} True when the pattern matches.
 */
function isYoutubeUrl(source2) {
  const hit = YOUTUBE_URL_PATTERN.exec(source2);
  return hit !== null;
}
|
|
589
|
+
/**
 * Extract the video ID from a YouTube watch URL or a youtu.be short link.
 *
 * The `v` parameter is only recognized when it starts a query parameter
 * (preceded by `?` or `&`), so names that merely end in "v" (e.g. `nav=`)
 * are not mistaken for it. The ID stops at `&`, `?`, `/` or `#`, so a
 * trailing fragment such as `#t=30` is not included.
 *
 * @param {string} url - YouTube URL.
 * @returns {string} The video ID.
 * @throws {Error} When no ID can be found in the URL.
 */
function extractVideoId(url) {
  const match = url.match(/(?:[?&]v=|youtu\.be\/)([^&?/#]+)/);
  if (!match) {
    throw new Error(`Could not extract video ID from YouTube URL: ${url}`);
  }
  return match[1];
}
|
|
596
|
+
/**
 * Format an offset as zero-padded "MM:SS".
 *
 * The offset is divided by MS_PER_MINUTE / MS_PER_SECOND, i.e. it is treated
 * as milliseconds. Minutes are not wrapped into hours, so one hour renders
 * as "60:00".
 *
 * @param {number} offsetMs - Offset to format.
 * @returns {string} The formatted offset.
 */
function formatOffset(offsetMs) {
  const twoDigits = (n) => String(n).padStart(2, "0");
  const wholeMinutes = Math.floor(offsetMs / MS_PER_MINUTE);
  const leftoverMs = offsetMs % MS_PER_MINUTE;
  const wholeSeconds = Math.floor(leftoverMs / MS_PER_SECOND);
  return `${twoDigits(wholeMinutes)}:${twoDigits(wholeSeconds)}`;
}
|
|
601
|
+
/**
 * Fetch a YouTube video's transcript and render it one segment per line,
 * each prefixed with its formatted offset.
 *
 * @param {string} url - YouTube URL.
 * @returns {Promise<{title: string, content: string}>} Title containing the video ID plus the rendered transcript.
 * @throws {Error} When the video ID cannot be extracted or no segments are returned.
 */
async function fetchYoutubeTranscript(url) {
  const videoId = extractVideoId(url);
  const segments = await YoutubeTranscript.fetchTranscript(videoId);
  const isEmpty = !segments || segments.length === 0;
  if (isEmpty) {
    throw new Error(`No transcript available for YouTube video: ${url}`);
  }
  const content = segments
    .map((seg) => `[${formatOffset(seg.offset)}] ${seg.text}`)
    .join("\n");
  return { title: `YouTube Transcript ${videoId}`, content };
}
|
|
613
|
+
/**
 * Report whether a trimmed line is a cue timing line: it must match
 * TIMESTAMP_PATTERN and contain the "-->" separator.
 *
 * @param {string} trimmed - Line with surrounding whitespace removed.
 * @returns {boolean} True for cue timing lines.
 */
function isCueTimestamp(trimmed) {
  if (!TIMESTAMP_PATTERN.test(trimmed)) {
    return false;
  }
  return trimmed.includes("-->");
}
|
|
616
|
+
/**
 * Convert WebVTT text to markdown: each cue timing line becomes a bold
 * "**[...]**" marker preceded by a blank line, followed by the cue's text
 * lines. Lines outside a cue (header, identifiers, notes) are dropped.
 *
 * @param {string} raw - Contents of the .vtt file.
 * @param {string} filePath - Path used to derive the title.
 * @returns {{title: string, content: string}} Title and rendered transcript.
 */
function parseVtt(raw, filePath) {
  const collected = [];
  let insideCue = false;
  for (const rawLine of raw.split("\n")) {
    const text = rawLine.trim();
    if (text === "WEBVTT" || text === "") {
      // The header and blank lines both end the current cue.
      insideCue = false;
    } else if (isCueTimestamp(text)) {
      collected.push(`\n**[${text}]**`);
      insideCue = true;
    } else if (insideCue) {
      collected.push(text);
    }
  }
  return { title: titleFromFilename(filePath), content: collected.join("\n").trim() };
}
|
|
638
|
+
/**
 * Convert SubRip (.srt) text to markdown: each cue timing line becomes a
 * bold "**[...]**" marker preceded by a blank line, followed by the cue's
 * text lines.
 *
 * A digits-only line is treated as a cue sequence number (and dropped) only
 * when the line right after it is a cue timing line. Caption text that
 * happens to be purely numeric (e.g. "42") is therefore kept instead of
 * being silently discarded.
 *
 * @param {string} raw - Contents of the .srt file.
 * @param {string} filePath - Path used to derive the title.
 * @returns {{title: string, content: string}} Title and rendered transcript.
 */
function parseSrt(raw, filePath) {
  const lines = raw.split("\n").map((line) => line.trim());
  const output = [];
  for (let i = 0; i < lines.length; i += 1) {
    const trimmed = lines[i];
    if (trimmed === "") {
      continue;
    }
    // Sequence numbers always sit directly above their timing line.
    const isSequenceNumber = SRT_SEQUENCE_PATTERN.test(trimmed) && isCueTimestamp(lines[i + 1] ?? "");
    if (isSequenceNumber) {
      continue;
    }
    output.push(isCueTimestamp(trimmed) ? `\n**[${trimmed}]**` : trimmed);
  }
  return { title: titleFromFilename(filePath), content: output.join("\n").trim() };
}
|
|
657
|
+
/**
 * Treat a plain-text transcript as-is: only surrounding whitespace is removed.
 *
 * @param {string} raw - File contents.
 * @param {string} filePath - Path used to derive the title.
 * @returns {{title: string, content: string}} Title and trimmed contents.
 */
function parsePlainTranscript(raw, filePath) {
  const title = titleFromFilename(filePath);
  const content = raw.trim();
  return { title, content };
}
|
|
660
|
+
/**
 * Ingest a transcript from a YouTube URL or a local .vtt / .srt / .txt file.
 *
 * The file extension is validated before the file is read, so an unsupported
 * type fails fast with the "Unsupported transcript file type" error instead
 * of paying for the read (or surfacing an unrelated read error first).
 *
 * @param {string} source2 - YouTube URL or path of a transcript file.
 * @returns {Promise<{title: string, content: string}>} Title and transcript text.
 * @throws {Error} When the extension is unsupported or the file cannot be read.
 */
async function ingestTranscript(source2) {
  if (isYoutubeUrl(source2)) {
    return fetchYoutubeTranscript(source2);
  }
  const ext = path6.extname(source2).toLowerCase();
  let parse;
  switch (ext) {
    case ".vtt":
      parse = parseVtt;
      break;
    case ".srt":
      parse = parseSrt;
      break;
    case ".txt":
      parse = parsePlainTranscript;
      break;
    default:
      throw new Error(
        `Unsupported transcript file type "${ext}". Supported: .vtt, .srt, .txt`
      );
  }
  const raw = await readFile5(source2, "utf-8");
  return parse(raw, source2);
}
|
|
673
|
+
|
|
264
674
|
// src/commands/ingest.ts
|
|
265
675
|
/**
 * Report whether a source string starts with an http:// or https:// scheme.
 *
 * @param {string} source2 - Source string supplied by the user.
 * @returns {boolean} True for http(s) URLs.
 */
function isUrl(source2) {
  const schemes = ["http://", "https://"];
  return schemes.some((scheme) => source2.startsWith(scheme));
}
|
|
678
|
+
var TXT_SNIFF_BYTES = 2048;
|
|
679
|
+
var SPEAKER_TAG_PATTERN = /^([A-Z][a-zA-Z .'-]{0,40}):\s/gm;
|
|
680
|
+
var TIMESTAMP_PATTERN2 = /^\s*\d{1,2}:\d{2}(:\d{2})?/;
|
|
681
|
+
var MIN_TIMESTAMP_MATCHES = 3;
|
|
682
|
+
var MIN_SPEAKER_REPEAT_COUNT = 2;
|
|
683
|
+
var MIN_DISTINCT_SPEAKERS = 2;
|
|
684
|
+
/**
 * Count how many times each speaker tag matched by SPEAKER_TAG_PATTERN
 * occurs in a text sample.
 *
 * @param {string} sample - Text to scan.
 * @returns {Map<string, number>} Occurrence count per trimmed speaker name.
 */
function countSpeakerOccurrences(sample) {
  const tally = new Map();
  // The pattern is a shared global regex, so rewind it before scanning.
  SPEAKER_TAG_PATTERN.lastIndex = 0;
  for (
    let hit = SPEAKER_TAG_PATTERN.exec(sample);
    hit !== null;
    hit = SPEAKER_TAG_PATTERN.exec(sample)
  ) {
    const speaker = hit[1].trim();
    const seenSoFar = tally.get(speaker) ?? 0;
    tally.set(speaker, seenSoFar + 1);
  }
  return tally;
}
|
|
694
|
+
/**
 * Decide whether a sample looks like multi-speaker dialogue: at least
 * MIN_DISTINCT_SPEAKERS different speakers, and at least one of them
 * speaking MIN_SPEAKER_REPEAT_COUNT times or more.
 *
 * @param {string} sample - Text to inspect.
 * @returns {boolean} True when both conditions hold.
 */
function hasSpeakerDialoguePattern(sample) {
  const tally = countSpeakerOccurrences(sample);
  if (tally.size < MIN_DISTINCT_SPEAKERS) {
    return false;
  }
  for (const turns of tally.values()) {
    if (turns >= MIN_SPEAKER_REPEAT_COUNT) {
      return true;
    }
  }
  return false;
}
|
|
703
|
+
/**
 * Sniff the beginning of a .txt file to guess whether it is a transcript.
 *
 * The first TXT_SNIFF_BYTES characters count as a transcript when they show a
 * speaker-dialogue pattern, or when at least MIN_TIMESTAMP_MATCHES lines
 * start with a timestamp.
 *
 * @param {string} filePath - Path of the text file.
 * @returns {Promise<boolean>} True when the sample looks like a transcript.
 */
async function looksLikeTxtTranscript(filePath) {
  const contents = await readFile6(filePath, "utf-8");
  const head = contents.slice(0, TXT_SNIFF_BYTES);
  if (hasSpeakerDialoguePattern(head)) {
    return true;
  }
  // Rebuild the pattern with g+m flags so every line of the sample is checked.
  const perLineTimestamp = new RegExp(TIMESTAMP_PATTERN2.source, "gm");
  const hits = head.match(perLineTimestamp) ?? [];
  return hits.length >= MIN_TIMESTAMP_MATCHES;
}
|
|
268
710
|
function enforceCharLimit(content) {
|
|
269
711
|
if (content.length <= MAX_SOURCE_CHARS) {
|
|
270
712
|
return { content, truncated: false, originalChars: content.length };
|
|
@@ -297,12 +739,30 @@ function enforceMinContent(content) {
|
|
|
297
739
|
);
|
|
298
740
|
}
|
|
299
741
|
}
|
|
300
|
-
function
|
|
742
|
+
/**
 * Classify an ingest source.
 *
 * URLs are "transcript" for YouTube links and "web" otherwise. Local paths
 * are classified by extension: "pdf", "image", "transcript", or "file";
 * a .txt file is sniffed to choose between "transcript" and "file".
 *
 * @param {string} source2 - URL or file path.
 * @returns {Promise<"web" | "pdf" | "image" | "transcript" | "file">} The detected type.
 */
async function detectSourceType(source2) {
  if (isUrl(source2)) {
    return isYoutubeUrl(source2) ? "transcript" : "web";
  }
  const ext = path7.extname(source2).toLowerCase();
  if (ext === ".pdf") return "pdf";
  if (IMAGE_EXTENSIONS.has(ext)) return "image";
  if (TRANSCRIPT_EXTENSIONS.has(ext)) return "transcript";
  if (ext !== ".txt") return "file";
  // Plain text is ambiguous, so inspect its contents.
  const isTranscript = await looksLikeTxtTranscript(source2);
  return isTranscript ? "transcript" : "file";
}
|
|
757
|
+
function buildDocument(title, source2, result, sourceType) {
|
|
301
758
|
const meta = {
|
|
302
759
|
title,
|
|
303
760
|
source: source2,
|
|
304
761
|
ingestedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
305
762
|
};
|
|
763
|
+
if (sourceType !== void 0) {
|
|
764
|
+
meta.sourceType = sourceType;
|
|
765
|
+
}
|
|
306
766
|
if (result.truncated) {
|
|
307
767
|
meta.truncated = true;
|
|
308
768
|
meta.originalChars = result.originalChars;
|
|
@@ -313,30 +773,46 @@ function buildDocument(title, source2, result) {
|
|
|
313
773
|
${result.content}
|
|
314
774
|
`;
|
|
315
775
|
}
|
|
776
|
+
/**
 * Dispatch to the ingester that matches the detected source type.
 *
 * @param {string} source2 - URL or file path.
 * @param {"web" | "pdf" | "image" | "transcript" | "file"} sourceType - Detected type.
 * @returns {Promise<{title: string, content: string} | undefined>} The
 *   ingester's result; undefined for an unrecognized type.
 */
async function fetchContent(source2, sourceType) {
  if (sourceType === "web") return ingestWeb(source2);
  if (sourceType === "pdf") return ingestPdf(source2);
  if (sourceType === "image") return ingestImage(source2);
  if (sourceType === "transcript") return ingestTranscript(source2);
  if (sourceType === "file") return ingestFile(source2);
  return undefined;
}
|
|
316
790
|
/**
 * Write an ingested document into SOURCES_DIR as "<slug-of-title>.md",
 * creating the directory when needed.
 *
 * NOTE(review): nothing here checks for an existing file, so two sources
 * whose titles slugify identically presumably overwrite each other. Confirm.
 *
 * @param {string} title - Document title, slugified to form the file name.
 * @param {string} document - Full document text to write.
 * @returns {Promise<string>} Path of the written file.
 */
async function saveSource(title, document) {
  const destPath = path7.join(SOURCES_DIR, `${slugify(title)}.md`);
  await mkdir2(SOURCES_DIR, { recursive: true });
  await writeFile2(destPath, document, "utf-8");
  return destPath;
}
|
|
323
797
|
/**
 * Run the full ingest pipeline for one source: detect its type, fetch the
 * content, apply the size limits, build the document and save it.
 *
 * @param {string} source2 - URL or file path to ingest.
 * @returns {Promise<{filename: string, charCount: number, truncated: boolean, source: string, sourceType: string}>}
 *   Summary of the saved source.
 */
async function ingestSource(source2) {
  const sourceType = await detectSourceType(source2);
  status("*", info(`Ingesting [${sourceType}]: ${source2}`));
  const fetched = await fetchContent(source2, sourceType);
  const limited = enforceCharLimit(fetched.content);
  // Validate the (possibly truncated) content before anything is written.
  enforceMinContent(limited.content);
  const document = buildDocument(fetched.title, source2, limited, sourceType);
  const savedPath = await saveSource(fetched.title, document);
  return {
    filename: path7.basename(savedPath),
    charCount: limited.content.length,
    truncated: limited.truncated,
    source: source2,
    sourceType
  };
}
|
|
337
813
|
async function ingest(source2) {
|
|
338
814
|
const result = await ingestSource(source2);
|
|
339
|
-
const savedPath =
|
|
815
|
+
const savedPath = path7.join(SOURCES_DIR, result.filename);
|
|
340
816
|
status(
|
|
341
817
|
"+",
|
|
342
818
|
success(`Saved ${bold(result.filename)} \u2192 ${source(savedPath)}`)
|
|
@@ -348,23 +824,23 @@ async function ingest(source2) {
|
|
|
348
824
|
import { existsSync as existsSync7 } from "fs";
|
|
349
825
|
|
|
350
826
|
// src/compiler/index.ts
|
|
351
|
-
import { readFile as
|
|
352
|
-
import
|
|
827
|
+
import { readFile as readFile14 } from "fs/promises";
|
|
828
|
+
import path21 from "path";
|
|
353
829
|
|
|
354
830
|
// src/utils/state.ts
|
|
355
|
-
import { readFile as
|
|
831
|
+
import { readFile as readFile7, writeFile as writeFile3, rename as rename2, mkdir as mkdir3, copyFile } from "fs/promises";
|
|
356
832
|
import { existsSync } from "fs";
|
|
357
|
-
import
|
|
833
|
+
import path8 from "path";
|
|
358
834
|
/**
 * Create a fresh, empty state object.
 *
 * @returns {{version: number, indexHash: string, sources: object}} A new
 *   state with version 1, an empty index hash and no sources.
 */
function emptyState() {
  const state = {
    version: 1,
    indexHash: "",
    sources: {}
  };
  return state;
}
|
|
361
837
|
async function readState(root) {
|
|
362
|
-
const filePath =
|
|
838
|
+
const filePath = path8.join(root, STATE_FILE);
|
|
363
839
|
if (!existsSync(filePath)) {
|
|
364
840
|
return emptyState();
|
|
365
841
|
}
|
|
366
842
|
try {
|
|
367
|
-
const raw = await
|
|
843
|
+
const raw = await readFile7(filePath, "utf-8");
|
|
368
844
|
return JSON.parse(raw);
|
|
369
845
|
} catch {
|
|
370
846
|
const bakPath = filePath + ".bak";
|
|
@@ -374,9 +850,9 @@ async function readState(root) {
|
|
|
374
850
|
}
|
|
375
851
|
}
|
|
376
852
|
async function writeState(root, state) {
|
|
377
|
-
const dir =
|
|
853
|
+
const dir = path8.join(root, LLMWIKI_DIR);
|
|
378
854
|
await mkdir3(dir, { recursive: true });
|
|
379
|
-
const filePath =
|
|
855
|
+
const filePath = path8.join(root, STATE_FILE);
|
|
380
856
|
const tmpPath = filePath + ".tmp";
|
|
381
857
|
await writeFile3(tmpPath, JSON.stringify(state, null, 2), "utf-8");
|
|
382
858
|
await rename2(tmpPath, filePath);
|
|
@@ -393,18 +869,18 @@ async function removeSourceState(root, sourceFile) {
|
|
|
393
869
|
}
|
|
394
870
|
|
|
395
871
|
// src/compiler/source-state.ts
|
|
396
|
-
import
|
|
872
|
+
import path10 from "path";
|
|
397
873
|
|
|
398
874
|
// src/compiler/hasher.ts
|
|
399
875
|
import { createHash } from "crypto";
|
|
400
|
-
import { readFile as
|
|
401
|
-
import
|
|
876
|
+
import { readFile as readFile8, readdir } from "fs/promises";
|
|
877
|
+
import path9 from "path";
|
|
402
878
|
/**
 * Compute the SHA-256 hex digest of a file's contents, read as UTF-8 text.
 *
 * @param {string} filePath - Path of the file to hash.
 * @returns {Promise<string>} Lower-case hexadecimal digest.
 */
async function hashFile(filePath) {
  const text = await readFile8(filePath, "utf-8");
  const hasher = createHash("sha256");
  hasher.update(text);
  return hasher.digest("hex");
}
|
|
406
882
|
async function detectChanges(root, prevState) {
|
|
407
|
-
const sourcesPath =
|
|
883
|
+
const sourcesPath = path9.join(root, SOURCES_DIR);
|
|
408
884
|
const currentFiles = await listSourceFiles(sourcesPath);
|
|
409
885
|
const changes = [];
|
|
410
886
|
for (const file of currentFiles) {
|
|
@@ -424,7 +900,7 @@ async function listSourceFiles(sourcesPath) {
|
|
|
424
900
|
}
|
|
425
901
|
}
|
|
426
902
|
async function classifyFile(root, file, prevState) {
|
|
427
|
-
const filePath =
|
|
903
|
+
const filePath = path9.join(root, SOURCES_DIR, file);
|
|
428
904
|
const hash = await hashFile(filePath);
|
|
429
905
|
const prev = prevState.sources[file];
|
|
430
906
|
if (!prev) return "new";
|
|
@@ -447,133 +923,22 @@ async function buildExtractionSourceStates(root, extractions) {
|
|
|
447
923
|
return snapshot;
|
|
448
924
|
}
|
|
449
925
|
async function buildEntry(root, result, compiledAt) {
|
|
450
|
-
const filePath =
|
|
926
|
+
const filePath = path10.join(root, SOURCES_DIR, result.sourceFile);
|
|
451
927
|
const hash = await hashFile(filePath);
|
|
452
|
-
return {
|
|
453
|
-
hash,
|
|
454
|
-
concepts: result.concepts.map((concept) => slugify(concept.concept)),
|
|
455
|
-
compiledAt
|
|
456
|
-
};
|
|
457
|
-
}
|
|
458
|
-
/**
 * Select the state entries that belong to the given source files.
 *
 * @param {Record<string, object>} allStates - State entries keyed by source file.
 * @param {Iterable<string>} sourceFiles - Source files to pick.
 * @returns {Record<string, object>} A new object holding only the entries
 *   that exist (are truthy) in allStates.
 */
function pickStatesForSources(allStates, sourceFiles) {
  const selection = {};
  for (const name of sourceFiles) {
    const entry = allStates[name];
    if (!entry) continue;
    selection[name] = entry;
  }
  return selection;
}
|
|
466
|
-
|
|
467
|
-
// src/providers/anthropic.ts
|
|
468
|
-
import Anthropic from "@anthropic-ai/sdk";
|
|
469
|
-
var VOYAGE_EMBEDDINGS_URL = "https://api.voyageai.com/v1/embeddings";
|
|
470
|
-
/**
 * Build the options object for the Anthropic SDK client.
 *
 * Each value is trimmed and only copied when non-empty. A single trailing
 * slash is removed from the base URL (a lone "/" is kept as-is).
 *
 * @param {{baseURL?: string, apiKey?: string, authToken?: string}} [options]
 * @returns {{apiKey?: string, authToken?: string, baseURL?: string}} Options containing only the populated fields.
 */
function buildAnthropicClientOptions(options = {}) {
  const baseURL = options.baseURL?.trim();
  const apiKey = options.apiKey?.trim();
  const authToken = options.authToken?.trim();
  const clientOptions = {};
  if (apiKey) clientOptions.apiKey = apiKey;
  if (authToken) clientOptions.authToken = authToken;
  if (baseURL) {
    const hasTrailingSlash = baseURL.endsWith("/") && baseURL.length > 1;
    clientOptions.baseURL = hasTrailingSlash ? baseURL.slice(0, -1) : baseURL;
  }
  return clientOptions;
}
|
|
488
|
-
var AnthropicProvider = class {
|
|
489
|
-
client;
|
|
490
|
-
model;
|
|
491
|
-
constructor(model, options = {}) {
|
|
492
|
-
this.model = model;
|
|
493
|
-
this.client = new Anthropic(buildAnthropicClientOptions(options));
|
|
494
|
-
}
|
|
495
|
-
/** Send a single non-streaming completion request. */
|
|
496
|
-
async complete(system, messages, maxTokens) {
|
|
497
|
-
const response = await this.client.messages.create({
|
|
498
|
-
model: this.model,
|
|
499
|
-
max_tokens: maxTokens,
|
|
500
|
-
system,
|
|
501
|
-
messages
|
|
502
|
-
});
|
|
503
|
-
const textBlock = response.content.find((block) => block.type === "text");
|
|
504
|
-
return textBlock?.type === "text" ? textBlock.text : "";
|
|
505
|
-
}
|
|
506
|
-
/** Stream a completion, invoking onToken for each text chunk. */
|
|
507
|
-
async stream(system, messages, maxTokens, onToken) {
|
|
508
|
-
const stream = this.client.messages.stream({
|
|
509
|
-
model: this.model,
|
|
510
|
-
max_tokens: maxTokens,
|
|
511
|
-
system,
|
|
512
|
-
messages
|
|
513
|
-
});
|
|
514
|
-
let fullText = "";
|
|
515
|
-
for await (const event of stream) {
|
|
516
|
-
if (event.type === "content_block_delta" && event.delta.type === "text_delta") {
|
|
517
|
-
fullText += event.delta.text;
|
|
518
|
-
onToken?.(event.delta.text);
|
|
519
|
-
}
|
|
520
|
-
}
|
|
521
|
-
return fullText;
|
|
522
|
-
}
|
|
523
|
-
/** Call Claude with tool definitions and return the parsed tool input as JSON. */
|
|
524
|
-
async toolCall(system, messages, tools, maxTokens) {
|
|
525
|
-
const anthropicTools = tools.map((t) => ({
|
|
526
|
-
name: t.name,
|
|
527
|
-
description: t.description,
|
|
528
|
-
input_schema: t.input_schema
|
|
529
|
-
}));
|
|
530
|
-
const response = await this.client.messages.create({
|
|
531
|
-
model: this.model,
|
|
532
|
-
max_tokens: maxTokens,
|
|
533
|
-
system,
|
|
534
|
-
messages,
|
|
535
|
-
tools: anthropicTools
|
|
536
|
-
});
|
|
537
|
-
const toolBlock = response.content.find((block) => block.type === "tool_use");
|
|
538
|
-
if (toolBlock?.type === "tool_use") {
|
|
539
|
-
return JSON.stringify(toolBlock.input);
|
|
540
|
-
}
|
|
541
|
-
const textBlock = response.content.find((block) => block.type === "text");
|
|
542
|
-
return textBlock?.type === "text" ? textBlock.text : "";
|
|
543
|
-
}
|
|
544
|
-
/**
|
|
545
|
-
* Produce a single embedding vector via the Voyage API.
|
|
546
|
-
*
|
|
547
|
-
* Anthropic does not ship a first-party embeddings endpoint, so we delegate
|
|
548
|
-
* to Voyage (their recommended partner). Requires VOYAGE_API_KEY.
|
|
549
|
-
*/
|
|
550
|
-
async embed(text) {
|
|
551
|
-
const apiKey = process.env.VOYAGE_API_KEY?.trim();
|
|
552
|
-
if (!apiKey) {
|
|
553
|
-
throw new Error(
|
|
554
|
-
"VOYAGE_API_KEY is not set. Anthropic embeddings use Voyage \u2014 set VOYAGE_API_KEY to enable semantic search."
|
|
555
|
-
);
|
|
556
|
-
}
|
|
557
|
-
const response = await fetch(VOYAGE_EMBEDDINGS_URL, {
|
|
558
|
-
method: "POST",
|
|
559
|
-
headers: {
|
|
560
|
-
"Content-Type": "application/json",
|
|
561
|
-
Authorization: `Bearer ${apiKey}`
|
|
562
|
-
},
|
|
563
|
-
body: JSON.stringify({ input: text, model: EMBEDDING_MODELS.anthropic })
|
|
564
|
-
});
|
|
565
|
-
if (!response.ok) {
|
|
566
|
-
const detail = await response.text();
|
|
567
|
-
throw new Error(`Voyage embeddings request failed (${response.status}): ${detail}`);
|
|
568
|
-
}
|
|
569
|
-
const json = await response.json();
|
|
570
|
-
const vector = json.data?.[0]?.embedding;
|
|
571
|
-
if (!Array.isArray(vector)) {
|
|
572
|
-
throw new Error("Voyage embeddings response did not include a vector.");
|
|
573
|
-
}
|
|
574
|
-
return vector;
|
|
928
|
+
return {
|
|
929
|
+
hash,
|
|
930
|
+
concepts: result.concepts.map((concept) => slugify(concept.concept)),
|
|
931
|
+
compiledAt
|
|
932
|
+
};
|
|
933
|
+
}
|
|
934
|
+
/**
 * Select the state entries that belong to a given list of source files.
 *
 * @param {Record<string, object>} allStates - State entries keyed by source file name.
 * @param {Iterable<string>} sourceFiles - File names to look up in `allStates`.
 * @returns {Record<string, object>} A new object with the truthy entries that were found.
 */
function pickStatesForSources(allStates, sourceFiles) {
  const picked = {};
  for (const file of sourceFiles) {
    // Own-property check: a source named "constructor" or "toString" must not
    // resolve to a member inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(allStates, file)) continue;
    const entry = allStates[file];
    if (entry) picked[file] = entry;
  }
  return picked;
}
|
|
577
942
|
|
|
578
943
|
// src/providers/openai.ts
|
|
579
944
|
import OpenAI from "openai";
|
|
@@ -704,101 +1069,6 @@ var MiniMaxProvider = class extends OpenAIProvider {
|
|
|
704
1069
|
}
|
|
705
1070
|
};
|
|
706
1071
|
|
|
707
|
-
// src/utils/claude-settings.ts
|
|
708
|
-
import { readFileSync } from "fs";
|
|
709
|
-
import { homedir } from "os";
|
|
710
|
-
import path7 from "path";
|
|
711
|
-
var CLAUDE_SETTINGS_PATH_ENV = "LLMWIKI_CLAUDE_SETTINGS_PATH";
|
|
712
|
-
/** Report whether `value` is a non-null value whose `typeof` is "object". */
function isRecord(value) {
  if (value === null) return false;
  return typeof value === "object";
}
|
|
715
|
-
/**
 * Trim a string setting.
 *
 * @param {unknown} value
 * @returns {string | undefined} The trimmed text, or `undefined` when `value`
 *   is not a string or is empty after trimming.
 */
function normalize(value) {
  if (typeof value === "string") {
    const text = value.trim();
    if (text.length > 0) return text;
  }
  return void 0;
}
|
|
720
|
-
/**
 * Locate the Claude settings file.
 *
 * The variable named by CLAUDE_SETTINGS_PATH_ENV wins whenever it is neither
 * `null` nor `undefined` (an empty string is returned as-is); otherwise the
 * path is `<home>/.claude/settings.json`.
 *
 * @param {Record<string, string | undefined>} env
 * @returns {string}
 */
function resolveClaudeSettingsPath(env) {
  const override = env[CLAUDE_SETTINGS_PATH_ENV];
  if (override !== null && override !== void 0) return override;
  return path7.join(homedir(), ".claude", "settings.json");
}
|
|
723
|
-
/**
 * Synchronously read the settings file as UTF-8 text.
 *
 * @param {string} settingsPath
 * @returns {string | undefined} File contents, or `undefined` when the read
 *   fails with code `ENOENT`.
 * @throws {Error} Any other read failure, wrapped with the offending path.
 */
function readClaudeSettingsFile(settingsPath) {
  try {
    return readFileSync(settingsPath, "utf8");
  } catch (err) {
    const isMissingFile = isRecord(err) && err.code === "ENOENT";
    if (isMissingFile) return void 0;
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to read Claude settings at "${settingsPath}": ${reason}`);
  }
}
|
|
734
|
-
/**
 * Read the Anthropic-related values from the `env` section of the Claude
 * settings file.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Used to
 *   resolve the settings file location.
 * @returns {{ ANTHROPIC_API_KEY?: string, ANTHROPIC_AUTH_TOKEN?: string,
 *   ANTHROPIC_BASE_URL?: string, ANTHROPIC_MODEL?: string } | undefined}
 *   The normalized values, or `undefined` when the file is absent or empty,
 *   has no object-valued `env` section, or none of the four keys has a value.
 * @throws {Error} When the file exists but is not valid JSON.
 */
function readClaudeSettingsEnv(env = process.env) {
  const settingsPath = resolveClaudeSettingsPath(env);
  const raw = readClaudeSettingsFile(settingsPath);
  if (!raw) return void 0;

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to parse Claude settings at "${settingsPath}": ${reason}`);
  }

  const hasEnvSection = isRecord(parsed) && isRecord(parsed.env);
  if (!hasEnvSection) return void 0;

  const values = {};
  const keys = ["ANTHROPIC_API_KEY", "ANTHROPIC_AUTH_TOKEN", "ANTHROPIC_BASE_URL", "ANTHROPIC_MODEL"];
  for (const key of keys) {
    values[key] = normalize(parsed.env[key]);
  }
  // Every key stays present (possibly undefined); only an all-empty result is dropped.
  return Object.values(values).some(Boolean) ? values : void 0;
}
|
|
759
|
-
/**
 * Best-effort variant of `readClaudeSettingsEnv`: any error it throws is
 * swallowed and reported as `undefined`.
 */
function tryReadClaudeSettingsEnv(env) {
  let settings;
  try {
    settings = readClaudeSettingsEnv(env);
  } catch {
    settings = void 0;
  }
  return settings;
}
|
|
766
|
-
/**
 * Check that a base URL parses and uses the http or https scheme.
 *
 * @param {string} value - Candidate URL; surrounding whitespace is ignored.
 * @returns {string} The trimmed URL text (not re-serialized).
 * @throws {Error} `Invalid ANTHROPIC_BASE_URL: ...` when parsing fails or the
 *   scheme is neither http nor https.
 */
function validateAnthropicBaseURL(value) {
  const candidate = value.trim();
  let problem;
  try {
    const { protocol } = new URL(candidate);
    const isHttp = protocol === "http:" || protocol === "https:";
    if (!isHttp) problem = "Must use http:// or https:// protocol.";
  } catch (err) {
    problem = err instanceof Error ? err.message : "Must be a valid http(s) URL.";
  }
  if (problem !== void 0) {
    throw new Error(`Invalid ANTHROPIC_BASE_URL: "${candidate}". ${problem}`);
  }
  return candidate;
}
|
|
779
|
-
/**
 * Pick the Anthropic credential to use.
 *
 * Precedence: `ANTHROPIC_API_KEY` from `env`, then `ANTHROPIC_AUTH_TOKEN`
 * from `env`, then the same two keys from the Claude settings file. The
 * settings file is only consulted when `env` supplies neither, and errors
 * from reading it propagate to the caller.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 * @returns {{ apiKey: string } | { authToken: string } | {}}
 */
function resolveAnthropicAuthFromEnv(env = process.env) {
  const apiKey = normalize(env.ANTHROPIC_API_KEY);
  if (apiKey) return { apiKey };

  const authToken = normalize(env.ANTHROPIC_AUTH_TOKEN);
  if (authToken) return { authToken };

  const saved = readClaudeSettingsEnv(env) ?? {};
  if (saved.ANTHROPIC_API_KEY) return { apiKey: saved.ANTHROPIC_API_KEY };
  if (saved.ANTHROPIC_AUTH_TOKEN) return { authToken: saved.ANTHROPIC_AUTH_TOKEN };
  return {};
}
|
|
789
|
-
/**
 * Resolve the model name: `LLMWIKI_MODEL` from `env` whenever it is defined
 * (returned untouched, even if empty), otherwise `ANTHROPIC_MODEL` from the
 * Claude settings file read in best-effort mode.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 * @returns {string | undefined}
 */
function resolveAnthropicModelFromEnv(env = process.env) {
  const { LLMWIKI_MODEL: explicitModel } = env;
  return explicitModel !== void 0
    ? explicitModel
    : tryReadClaudeSettingsEnv(env)?.ANTHROPIC_MODEL;
}
|
|
794
|
-
/**
 * Resolve and validate the Anthropic base URL.
 *
 * A non-blank `ANTHROPIC_BASE_URL` in `env` takes precedence; otherwise the
 * value from the Claude settings file (best-effort read) is used. Whichever
 * value is chosen is passed through `validateAnthropicBaseURL`.
 *
 * @param {Record<string, string | undefined>} [env=process.env]
 * @returns {string | undefined} The validated URL, or `undefined` when none is configured.
 */
function resolveAnthropicBaseURLFromEnv(env = process.env) {
  const candidate =
    normalize(env.ANTHROPIC_BASE_URL) || tryReadClaudeSettingsEnv(env)?.ANTHROPIC_BASE_URL;
  return candidate ? validateAnthropicBaseURL(candidate) : void 0;
}
|
|
801
|
-
|
|
802
1072
|
// src/utils/provider.ts
|
|
803
1073
|
var SUPPORTED_PROVIDERS = /* @__PURE__ */ new Set(["anthropic", "openai", "ollama", "minimax"]);
|
|
804
1074
|
function getProvider() {
|
|
@@ -891,8 +1161,8 @@ async function callClaude(options) {
|
|
|
891
1161
|
}
|
|
892
1162
|
|
|
893
1163
|
// src/utils/lock.ts
|
|
894
|
-
import { open, readFile as
|
|
895
|
-
import
|
|
1164
|
+
import { open, readFile as readFile9, unlink, mkdir as mkdir4 } from "fs/promises";
|
|
1165
|
+
import path11 from "path";
|
|
896
1166
|
var RECLAIM_SUFFIX = ".reclaim";
|
|
897
1167
|
var MAX_ACQUIRE_ATTEMPTS = 2;
|
|
898
1168
|
function isProcessAlive(pid) {
|
|
@@ -904,8 +1174,8 @@ function isProcessAlive(pid) {
|
|
|
904
1174
|
}
|
|
905
1175
|
}
|
|
906
1176
|
async function acquireLock(root) {
|
|
907
|
-
const lockPath =
|
|
908
|
-
await mkdir4(
|
|
1177
|
+
const lockPath = path11.join(root, LOCK_FILE);
|
|
1178
|
+
await mkdir4(path11.join(root, LLMWIKI_DIR), { recursive: true });
|
|
909
1179
|
for (let attempt = 0; attempt < MAX_ACQUIRE_ATTEMPTS; attempt++) {
|
|
910
1180
|
const created = await tryCreateLock(lockPath);
|
|
911
1181
|
if (created) return true;
|
|
@@ -968,7 +1238,7 @@ async function tryCreateLock(lockPath) {
|
|
|
968
1238
|
}
|
|
969
1239
|
async function isLockStale(lockPath) {
|
|
970
1240
|
try {
|
|
971
|
-
const content = await
|
|
1241
|
+
const content = await readFile9(lockPath, "utf-8");
|
|
972
1242
|
const pid = parseInt(content.trim(), 10);
|
|
973
1243
|
if (isNaN(pid)) return true;
|
|
974
1244
|
return !isProcessAlive(pid);
|
|
@@ -977,7 +1247,7 @@ async function isLockStale(lockPath) {
|
|
|
977
1247
|
}
|
|
978
1248
|
}
|
|
979
1249
|
async function releaseLock(root) {
|
|
980
|
-
const lockPath =
|
|
1250
|
+
const lockPath = path11.join(root, LOCK_FILE);
|
|
981
1251
|
try {
|
|
982
1252
|
await unlink(lockPath);
|
|
983
1253
|
} catch {
|
|
@@ -1220,8 +1490,8 @@ function buildDefaultSchema() {
|
|
|
1220
1490
|
|
|
1221
1491
|
// src/schema/loader.ts
|
|
1222
1492
|
import { existsSync as existsSync2 } from "fs";
|
|
1223
|
-
import { readFile as
|
|
1224
|
-
import
|
|
1493
|
+
import { readFile as readFile10 } from "fs/promises";
|
|
1494
|
+
import path12 from "path";
|
|
1225
1495
|
import yaml2 from "js-yaml";
|
|
1226
1496
|
var SCHEMA_CANDIDATE_PATHS = [
|
|
1227
1497
|
".llmwiki/schema.json",
|
|
@@ -1232,7 +1502,7 @@ var SCHEMA_CANDIDATE_PATHS = [
|
|
|
1232
1502
|
];
|
|
1233
1503
|
function findSchemaPath(root) {
|
|
1234
1504
|
for (const candidate of SCHEMA_CANDIDATE_PATHS) {
|
|
1235
|
-
const absolute =
|
|
1505
|
+
const absolute = path12.join(root, candidate);
|
|
1236
1506
|
if (existsSync2(absolute)) return absolute;
|
|
1237
1507
|
}
|
|
1238
1508
|
return null;
|
|
@@ -1285,12 +1555,12 @@ async function loadSchema(root) {
|
|
|
1285
1555
|
const defaults = buildDefaultSchema();
|
|
1286
1556
|
const schemaPath = findSchemaPath(root);
|
|
1287
1557
|
if (!schemaPath) return defaults;
|
|
1288
|
-
const raw = await
|
|
1558
|
+
const raw = await readFile10(schemaPath, "utf-8");
|
|
1289
1559
|
const parsed = parseSchemaFile(schemaPath, raw);
|
|
1290
1560
|
return applyOverrides(defaults, parsed, schemaPath);
|
|
1291
1561
|
}
|
|
1292
1562
|
/** Return the first entry of SCHEMA_CANDIDATE_PATHS joined onto `root`. */
function defaultSchemaInitPath(root) {
  const preferredRelativePath = SCHEMA_CANDIDATE_PATHS[0];
  return path12.join(root, preferredRelativePath);
}
|
|
1295
1565
|
|
|
1296
1566
|
// src/schema/helpers.ts
|
|
@@ -1462,7 +1732,7 @@ async function freezeFailedExtractions(root, results, frozenSlugs) {
|
|
|
1462
1732
|
}
|
|
1463
1733
|
|
|
1464
1734
|
// src/compiler/orphan.ts
|
|
1465
|
-
import
|
|
1735
|
+
import path13 from "path";
|
|
1466
1736
|
async function markOrphaned(root, sourceFile, state) {
|
|
1467
1737
|
const sourceEntry = state.sources[sourceFile];
|
|
1468
1738
|
if (!sourceEntry) return;
|
|
@@ -1488,7 +1758,7 @@ async function orphanUnownedFrozenPages(root, frozenSlugs) {
|
|
|
1488
1758
|
}
|
|
1489
1759
|
}
|
|
1490
1760
|
async function orphanPage(root, slug, reason) {
|
|
1491
|
-
const pagePath =
|
|
1761
|
+
const pagePath = path13.join(root, CONCEPTS_DIR, `${slug}.md`);
|
|
1492
1762
|
const content = await safeReadFile(pagePath);
|
|
1493
1763
|
if (!content) return;
|
|
1494
1764
|
const { meta } = parseFrontmatter(content);
|
|
@@ -1499,18 +1769,18 @@ async function orphanPage(root, slug, reason) {
|
|
|
1499
1769
|
}
|
|
1500
1770
|
|
|
1501
1771
|
// src/compiler/resolver.ts
|
|
1502
|
-
import { readdir as readdir2, readFile as
|
|
1503
|
-
import
|
|
1772
|
+
import { readdir as readdir2, readFile as readFile11 } from "fs/promises";
|
|
1773
|
+
import path14 from "path";
|
|
1504
1774
|
import { existsSync as existsSync3 } from "fs";
|
|
1505
1775
|
async function buildTitleIndex(root) {
|
|
1506
|
-
const conceptsDir =
|
|
1776
|
+
const conceptsDir = path14.join(root, CONCEPTS_DIR);
|
|
1507
1777
|
if (!existsSync3(conceptsDir)) return [];
|
|
1508
1778
|
const files = await readdir2(conceptsDir);
|
|
1509
1779
|
const pages = [];
|
|
1510
1780
|
for (const file of files) {
|
|
1511
1781
|
if (!file.endsWith(".md")) continue;
|
|
1512
|
-
const filePath =
|
|
1513
|
-
const content = await
|
|
1782
|
+
const filePath = path14.join(conceptsDir, file);
|
|
1783
|
+
const content = await readFile11(filePath, "utf-8");
|
|
1514
1784
|
const { meta } = parseFrontmatter(content);
|
|
1515
1785
|
if (meta.title && typeof meta.title === "string" && !meta.orphaned) {
|
|
1516
1786
|
pages.push({
|
|
@@ -1596,7 +1866,7 @@ async function resolveInboundLinks(titleIndex, newSlugs) {
|
|
|
1596
1866
|
let count = 0;
|
|
1597
1867
|
for (const page of titleIndex) {
|
|
1598
1868
|
if (newSlugs.includes(page.slug)) continue;
|
|
1599
|
-
const content = await
|
|
1869
|
+
const content = await readFile11(page.filePath, "utf-8");
|
|
1600
1870
|
const { body } = parseFrontmatter(content);
|
|
1601
1871
|
const linked = addWikilinks(body, newTitles, page.title);
|
|
1602
1872
|
if (linked !== body) {
|
|
@@ -1608,7 +1878,7 @@ async function resolveInboundLinks(titleIndex, newSlugs) {
|
|
|
1608
1878
|
return count;
|
|
1609
1879
|
}
|
|
1610
1880
|
async function linkPage(page, titleIndex) {
|
|
1611
|
-
const content = await
|
|
1881
|
+
const content = await readFile11(page.filePath, "utf-8");
|
|
1612
1882
|
const { body } = parseFrontmatter(content);
|
|
1613
1883
|
const linked = addWikilinks(body, titleIndex, page.title);
|
|
1614
1884
|
if (linked === body) return false;
|
|
@@ -1619,17 +1889,17 @@ async function linkPage(page, titleIndex) {
|
|
|
1619
1889
|
|
|
1620
1890
|
// src/compiler/indexgen.ts
|
|
1621
1891
|
import { readdir as readdir3 } from "fs/promises";
|
|
1622
|
-
import
|
|
1892
|
+
import path15 from "path";
|
|
1623
1893
|
async function generateIndex(root) {
|
|
1624
1894
|
status("*", info("Generating index..."));
|
|
1625
|
-
const conceptsPath =
|
|
1626
|
-
const queriesPath =
|
|
1895
|
+
const conceptsPath = path15.join(root, CONCEPTS_DIR);
|
|
1896
|
+
const queriesPath = path15.join(root, QUERIES_DIR);
|
|
1627
1897
|
const concepts = await collectPageSummaries(conceptsPath);
|
|
1628
1898
|
const queries = await collectPageSummaries(queriesPath);
|
|
1629
1899
|
concepts.sort((a, b) => a.title.localeCompare(b.title));
|
|
1630
1900
|
queries.sort((a, b) => a.title.localeCompare(b.title));
|
|
1631
1901
|
const indexContent = buildIndexContent(concepts, queries);
|
|
1632
|
-
const indexPath =
|
|
1902
|
+
const indexPath = path15.join(root, INDEX_FILE);
|
|
1633
1903
|
await atomicWrite(indexPath, indexContent);
|
|
1634
1904
|
const total = concepts.length + queries.length;
|
|
1635
1905
|
status("+", success(`Index updated with ${total} pages.`));
|
|
@@ -1643,7 +1913,7 @@ async function scanWikiPages(dirPath) {
|
|
|
1643
1913
|
}
|
|
1644
1914
|
const scanned = [];
|
|
1645
1915
|
for (const file of files.filter((f) => f.endsWith(".md"))) {
|
|
1646
|
-
const content = await safeReadFile(
|
|
1916
|
+
const content = await safeReadFile(path15.join(dirPath, file));
|
|
1647
1917
|
const { meta } = parseFrontmatter(content);
|
|
1648
1918
|
scanned.push({ slug: file.replace(/\.md$/, ""), meta });
|
|
1649
1919
|
}
|
|
@@ -1680,7 +1950,7 @@ function buildIndexContent(concepts, queries) {
|
|
|
1680
1950
|
|
|
1681
1951
|
// src/compiler/obsidian.ts
|
|
1682
1952
|
import { readdir as readdir4 } from "fs/promises";
|
|
1683
|
-
import
|
|
1953
|
+
import path16 from "path";
|
|
1684
1954
|
var ABBREVIATION_MIN_WORDS = 3;
|
|
1685
1955
|
var SWAP_CONJUNCTIONS = [" and ", " or "];
|
|
1686
1956
|
function addObsidianMeta(frontmatter, conceptTitle, tags) {
|
|
@@ -1722,11 +1992,11 @@ function generateAbbreviation(title) {
|
|
|
1722
1992
|
return abbreviation;
|
|
1723
1993
|
}
|
|
1724
1994
|
/**
 * Rebuild the MOC file under `root`: load the concept pages, group them by
 * tag, render the grouped content and write it atomically to MOC_FILE.
 *
 * @param {string} root
 * @returns {Promise<void>}
 */
async function generateMOC(root) {
  const pages = await loadConceptPages(path16.join(root, CONCEPTS_DIR));
  const mocMarkdown = buildMOCContent(groupPagesByTag(pages));
  const mocPath = path16.join(root, MOC_FILE);
  await atomicWrite(mocPath, mocMarkdown);
}
|
|
1731
2001
|
async function loadConceptPages(conceptsPath) {
|
|
1732
2002
|
let files;
|
|
@@ -1738,7 +2008,7 @@ async function loadConceptPages(conceptsPath) {
|
|
|
1738
2008
|
const pages = [];
|
|
1739
2009
|
for (const file of files) {
|
|
1740
2010
|
if (!file.endsWith(".md")) continue;
|
|
1741
|
-
const content = await safeReadFile(
|
|
2011
|
+
const content = await safeReadFile(path16.join(conceptsPath, file));
|
|
1742
2012
|
if (!content) continue;
|
|
1743
2013
|
const { meta } = parseFrontmatter(content);
|
|
1744
2014
|
if (meta.orphaned) continue;
|
|
@@ -1789,9 +2059,143 @@ function buildMOCContent(tagGroups) {
|
|
|
1789
2059
|
}
|
|
1790
2060
|
|
|
1791
2061
|
// src/utils/embeddings.ts
|
|
1792
|
-
import { readFile as
|
|
2062
|
+
import { readFile as readFile12, readdir as readdir5 } from "fs/promises";
|
|
1793
2063
|
import { existsSync as existsSync4 } from "fs";
|
|
1794
|
-
import
|
|
2064
|
+
import path17 from "path";
|
|
2065
|
+
|
|
2066
|
+
// src/utils/retrieval.ts
|
|
2067
|
+
import { createHash as createHash2 } from "crypto";
|
|
2068
|
+
/**
 * Fingerprint a chunk of text.
 *
 * @param {string} text
 * @returns {string} The first 16 hex characters of the SHA-256 digest of the UTF-8 text.
 */
function hashChunkText(text) {
  const hasher = createHash2("sha256");
  hasher.update(text, "utf8");
  const hexDigest = hasher.digest("hex");
  return hexDigest.slice(0, 16);
}
|
|
2071
|
+
/**
 * Split a page body into chunk strings.
 *
 * The body is broken into paragraphs, oversized paragraphs are split further,
 * the resulting pieces are packed into chunks by `appendParagraph`, and
 * `mergeTrailingFragment` gets a final pass over the result.
 *
 * @param {string} body
 * @returns {string[]} Chunk texts in document order; empty when the body has no paragraphs.
 */
function splitIntoChunks(body) {
  const paragraphs = extractParagraphs(body);
  if (paragraphs.length === 0) return [];

  const pieces = paragraphs.flatMap((paragraph) => splitOversizedParagraph(paragraph));
  const chunks = [];
  let pending = "";
  for (const piece of pieces) {
    pending = appendParagraph(pending, piece, chunks);
  }
  if (pending.length > 0) chunks.push(pending);
  return mergeTrailingFragment(chunks);
}
|
|
2084
|
+
/**
 * Add one paragraph to the chunk currently being assembled.
 *
 * @param {string} buffer - Text accumulated so far ("" when nothing is pending).
 * @param {string} paragraph - Paragraph to add.
 * @param {string[]} chunks - Finished chunks; completed text is pushed onto it.
 * @returns {string} The new pending buffer: the combined text when it fits in
 *   CHUNK_TARGET_CHARS, the paragraph alone after flushing a non-empty buffer,
 *   or "" when an over-target paragraph was emitted directly as its own chunk.
 */
function appendParagraph(buffer, paragraph, chunks) {
  const hasPending = buffer.length > 0;
  const combined = hasPending ? `${buffer}\n\n${paragraph}` : paragraph;
  if (combined.length <= CHUNK_TARGET_CHARS) return combined;

  if (!hasPending) {
    chunks.push(combined);
    return "";
  }
  chunks.push(buffer);
  return paragraph;
}
|
|
2096
|
+
/**
 * Fold a short final chunk into the chunk before it.
 *
 * The merge happens only when the last chunk is shorter than CHUNK_MIN_CHARS
 * and the joined text (including the blank-line separator) does not exceed
 * CHUNK_MAX_CHARS. Otherwise the input array is returned unchanged.
 *
 * @param {string[]} chunks
 * @returns {string[]} `chunks` itself, or a new array with the last two entries merged.
 */
function mergeTrailingFragment(chunks) {
  if (chunks.length < 2) return chunks;

  const tailIndex = chunks.length - 1;
  const tail = chunks[tailIndex];
  const beforeTail = chunks[tailIndex - 1];
  const tailIsFragment = tail.length < CHUNK_MIN_CHARS;
  const fitsWhenMerged = beforeTail.length + tail.length + 2 <= CHUNK_MAX_CHARS;
  if (!tailIsFragment || !fitsWhenMerged) return chunks;

  return [...chunks.slice(0, tailIndex - 1), `${beforeTail}\n\n${tail}`];
}
|
|
2108
|
+
/**
 * Split a body into trimmed, non-empty paragraphs.
 *
 * A paragraph break is a line ending followed by one or more blank lines. A
 * blank line may be terminated by LF or CRLF and may contain spaces or tabs,
 * so files saved with Windows line endings and "blank" lines that carry stray
 * whitespace are still split instead of collapsing into one paragraph.
 *
 * @param {string} body
 * @returns {string[]} Paragraphs in document order.
 */
function extractParagraphs(body) {
  return body
    .split(/\r?\n(?:[ \t]*\r?\n)+/)
    .map((p) => p.trim())
    .filter((p) => p.length > 0);
}
|
|
2111
|
+
/**
 * Break a paragraph that exceeds CHUNK_MAX_CHARS into smaller pieces.
 *
 * The paragraph is split after sentence-ending punctuation (. ! ?) followed
 * by whitespace, and consecutive sentences are packed together, joined by a
 * single space, while they fit in CHUNK_MAX_CHARS. Each packed piece is then
 * passed through `hardCut`.
 *
 * @param {string} paragraph
 * @returns {string[]} `[paragraph]` when it already fits, otherwise the pieces.
 */
function splitOversizedParagraph(paragraph) {
  if (paragraph.length <= CHUNK_MAX_CHARS) return [paragraph];

  const groups = [];
  let current = "";
  paragraph.split(/(?<=[.!?])\s+/).forEach((sentence) => {
    const wouldOverflow = current.length + 1 + sentence.length > CHUNK_MAX_CHARS;
    if (current.length > 0 && wouldOverflow) {
      groups.push(current.trim());
      current = sentence;
      return;
    }
    current = current.length > 0 ? `${current} ${sentence}` : sentence;
  });
  if (current.length > 0) groups.push(current.trim());

  return groups.flatMap((group) => hardCut(group));
}
|
|
2127
|
+
/**
 * Cut text that has no usable sentence boundaries into pieces of at most
 * CHUNK_MAX_CHARS UTF-16 code units.
 *
 * A cut never lands between the two halves of a surrogate pair: when it
 * would, the piece is ended one unit earlier so the whole character moves to
 * the next piece. Without this, characters outside the BMP (emoji, some CJK)
 * could be torn into lone surrogates that are not valid text.
 *
 * @param {string} text
 * @returns {string[]} `[text]` when it already fits, otherwise the pieces in order.
 */
function hardCut(text) {
  if (text.length <= CHUNK_MAX_CHARS) return [text];
  const pieces = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + CHUNK_MAX_CHARS, text.length);
    // Only back off when the piece keeps at least one unit, so the loop always advances.
    if (end < text.length && end - start > 1) {
      const before = text.charCodeAt(end - 1);
      const after = text.charCodeAt(end);
      const isHighSurrogate = before >= 0xd800 && before <= 0xdbff;
      const isLowSurrogate = after >= 0xdc00 && after <= 0xdfff;
      if (isHighSurrogate && isLowSurrogate) end -= 1;
    }
    pieces.push(text.slice(start, end));
    start = end;
  }
  return pieces;
}
|
|
2135
|
+
/**
 * Re-rank retrieval candidates using BM25 over their text.
 *
 * @param {string} query
 * @param {Array<{ text: string, baseScore: number }>} candidates
 * @returns {Array<{ candidate: object, score: number }>} Empty for no
 *   candidates. When the query yields no tokens, candidates are returned in
 *   input order, each scored with its own `baseScore`. Otherwise the result of
 *   `rankByBm25Score`.
 */
function rerankWithBm25(query, candidates) {
  if (candidates.length === 0) return [];

  const queryTerms = tokenize(query);
  if (queryTerms.length > 0) {
    const docs = candidates.map((c) => tokenize(c.text));
    return rankByBm25Score(candidates, docs, queryTerms, buildCorpusStats(docs));
  }
  return candidates.map((candidate) => ({ candidate, score: candidate.baseScore }));
}
|
|
2145
|
+
/**
 * Score every candidate and sort best-first.
 *
 * A candidate's score is its BM25 score against `queryTerms` plus its
 * `baseScore` scaled by BASE_SCORE_WEIGHT.
 *
 * @param {Array<{ baseScore: number }>} candidates
 * @param {string[][]} docs - Token lists; `docs[i]` belongs to `candidates[i]`.
 * @param {string[]} queryTerms
 * @param {object} stats - Corpus statistics forwarded to `bm25Score`.
 * @returns {Array<{ candidate: object, score: number }>} Sorted by descending score.
 */
function rankByBm25Score(candidates, docs, queryTerms, stats) {
  const ranked = [];
  for (let i = 0; i < candidates.length; i++) {
    const candidate = candidates[i];
    const lexical = bm25Score(queryTerms, docs[i], stats);
    ranked.push({ candidate, score: lexical + candidate.baseScore * BASE_SCORE_WEIGHT });
  }
  return ranked.sort((a, b) => b.score - a.score);
}
|
|
2153
|
+
/**
 * Lowercase text and split it into word tokens for lexical scoring.
 *
 * A token is a run of Unicode letters, combining marks and digits, so
 * accented and non-Latin words stay intact (an ASCII-only pattern would turn
 * "café" into "caf" and drop text in other scripts entirely). Input is
 * NFC-normalized first so composed and decomposed spellings of the same word
 * yield the same token. Pure-ASCII input tokenizes exactly as `[a-z0-9]+`
 * would.
 *
 * @param {string} text
 * @returns {string[]} Tokens in order of appearance; empty when there are none.
 */
function tokenize(text) {
  return text.normalize("NFC").toLowerCase().match(/[\p{L}\p{M}\p{N}]+/gu) ?? [];
}
|
|
2156
|
+
/**
 * Gather the corpus statistics used for BM25 scoring.
 *
 * @param {string[][]} docs - One token list per document.
 * @returns {{ docFreq: Map<string, number>, avgDocLen: number, totalDocs: number }}
 *   `docFreq` maps each term to the number of documents containing it,
 *   `avgDocLen` is the mean token count (0 for an empty corpus) and
 *   `totalDocs` is the document count.
 */
function buildCorpusStats(docs) {
  const docFreq = new Map();
  let tokenTotal = 0;
  docs.forEach((tokens) => {
    tokenTotal += tokens.length;
    // A term counts once per document regardless of how often it repeats.
    new Set(tokens).forEach((term) => {
      docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
    });
  });
  const totalDocs = docs.length;
  return {
    docFreq,
    avgDocLen: totalDocs === 0 ? 0 : tokenTotal / totalDocs,
    totalDocs,
  };
}
|
|
2168
|
+
var BM25_K1 = 1.5;
|
|
2169
|
+
var BM25_B = 0.75;
|
|
2170
|
+
var BASE_SCORE_WEIGHT = 0.5;
|
|
2171
|
+
/**
 * Compute the BM25 score of one document for a list of query terms.
 *
 * Each query term present in the document contributes
 * `idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len / avgLen))`, with `k1`
 * and `b` taken from BM25_K1 and BM25_B. An average length of 0 is treated
 * as 1.
 *
 * @param {string[]} queryTerms
 * @param {string[]} docTokens
 * @param {{ docFreq: Map<string, number>, avgDocLen: number, totalDocs: number }} stats
 * @returns {number} 0 for an empty document or an empty corpus.
 */
function bm25Score(queryTerms, docTokens, stats) {
  if (docTokens.length === 0 || stats.totalDocs === 0) return 0;

  const termFreq = countTerms(docTokens);
  const lengthRatio = docTokens.length / (stats.avgDocLen || 1);
  // The length normalisation does not depend on the term, so compute it once.
  const lengthNorm = BM25_K1 * (1 - BM25_B + BM25_B * lengthRatio);

  return queryTerms.reduce((total, term) => {
    const tf = termFreq.get(term) ?? 0;
    if (tf === 0) return total;
    const idf = idfWeight(stats.docFreq.get(term) ?? 0, stats.totalDocs);
    return total + idf * ((tf * (BM25_K1 + 1)) / (tf + lengthNorm));
  }, 0);
}
|
|
2186
|
+
/**
 * Inverse-document-frequency weight of a term:
 * `ln(1 + (N - df + 0.5) / (df + 0.5))`.
 *
 * @param {number} docFrequency - Number of documents containing the term (df).
 * @param {number} totalDocs - Number of documents in the corpus (N).
 * @returns {number}
 */
function idfWeight(docFrequency, totalDocs) {
  const smoothedMisses = totalDocs - docFrequency + 0.5;
  const smoothedHits = docFrequency + 0.5;
  const ratio = smoothedMisses / smoothedHits;
  return Math.log(1 + ratio);
}
|
|
2191
|
+
/**
 * Count how many times each token occurs.
 *
 * @param {Iterable<string>} tokens
 * @returns {Map<string, number>} Token to occurrence count, in first-seen order.
 */
function countTerms(tokens) {
  const tally = new Map();
  for (const token of tokens) {
    const seen = tally.has(token) ? tally.get(token) : 0;
    tally.set(token, seen + 1);
  }
  return tally;
}
|
|
2196
|
+
|
|
2197
|
+
// src/utils/embeddings.ts
|
|
2198
|
+
var STORE_VERSION = 2;
|
|
1795
2199
|
function cosineSimilarity(a, b) {
|
|
1796
2200
|
if (a.length !== b.length || a.length === 0) return 0;
|
|
1797
2201
|
let dot = 0;
|
|
@@ -1813,24 +2217,27 @@ function findTopK(queryVec, store, k) {
|
|
|
1813
2217
|
scored.sort((left, right) => right.score - left.score);
|
|
1814
2218
|
return scored.slice(0, k).map((item) => item.entry);
|
|
1815
2219
|
}
|
|
2220
|
+
/**
 * Rank chunks by cosine similarity to a query vector.
 *
 * @param {number[]} queryVec
 * @param {Array<{ vector: number[] }>} chunks
 * @param {number} k - Maximum number of results.
 * @returns {Array<{ chunk: object, score: number }>} The `k` best matches, highest score first.
 */
function findTopKChunks(queryVec, chunks, k) {
  const ranked = [];
  for (const chunk of chunks) {
    ranked.push({ chunk, score: cosineSimilarity(queryVec, chunk.vector) });
  }
  ranked.sort((left, right) => right.score - left.score);
  return ranked.slice(0, k);
}
|
|
1816
2228
|
/**
 * Load the embedding store from EMBEDDINGS_FILE under `root`.
 *
 * @param {string} root
 * @returns {Promise<object | null>} The parsed JSON store, or `null` when the file does not exist.
 */
async function readEmbeddingStore(root) {
  const storePath = path17.join(root, EMBEDDINGS_FILE);
  if (existsSync4(storePath)) {
    const serialized = await readFile12(storePath, "utf-8");
    return JSON.parse(serialized);
  }
  return null;
}
|
|
1822
2234
|
/**
 * Persist the embedding store as 2-space-indented JSON at EMBEDDINGS_FILE
 * under `root`, using `atomicWrite`.
 *
 * @param {string} root
 * @param {object} store
 * @returns {Promise<void>}
 */
async function writeEmbeddingStore(root, store) {
  const storePath = path17.join(root, EMBEDDINGS_FILE);
  const serialized = JSON.stringify(store, null, 2);
  await atomicWrite(storePath, serialized);
}
|
|
1826
2238
|
async function findRelevantPages(root, question) {
|
|
1827
|
-
const store = await
|
|
1828
|
-
if (!store
|
|
1829
|
-
const activeModel = resolveEmbeddingModel();
|
|
1830
|
-
if (store.model !== activeModel) {
|
|
1831
|
-
warnStaleEmbeddingStore(store.model, activeModel);
|
|
1832
|
-
return [];
|
|
1833
|
-
}
|
|
2239
|
+
const store = await loadActiveStore(root, (s) => s.entries.length > 0);
|
|
2240
|
+
if (!store) return [];
|
|
1834
2241
|
const queryVec = await getProvider().embed(question);
|
|
1835
2242
|
return findTopK(queryVec, store, EMBEDDING_TOP_K).map((entry) => ({
|
|
1836
2243
|
slug: entry.slug,
|
|
@@ -1838,10 +2245,26 @@ async function findRelevantPages(root, question) {
|
|
|
1838
2245
|
summary: entry.summary
|
|
1839
2246
|
}));
|
|
1840
2247
|
}
|
|
2248
|
+
/**
 * Retrieve the stored chunks most similar to a question.
 *
 * @param {string} root
 * @param {string} question - Embedded with the active provider.
 * @param {number} k - Maximum number of chunks to return.
 * @returns {Promise<Array<{ chunk: object, score: number }>>} Empty when
 *   `loadActiveStore` yields no store with at least one chunk.
 */
async function findRelevantChunks(root, question, k) {
  const hasChunks = (s) => Boolean(s.chunks && s.chunks.length > 0);
  const store = await loadActiveStore(root, hasChunks);
  if (!store) return [];

  const provider = getProvider();
  const queryVec = await provider.embed(question);
  return findTopKChunks(queryVec, store.chunks ?? [], k);
}
|
|
2254
|
+
/**
 * Load the embedding store if it can be used for the current query.
 *
 * @param {string} root
 * @param {(store: object) => boolean} hasContent - Decides whether the store
 *   holds the data the caller needs.
 * @returns {Promise<object | null>} The store, or `null` when it is missing,
 *   fails `hasContent`, or was built with a different embedding model than the
 *   active one (in which case a stale-store warning is issued).
 */
async function loadActiveStore(root, hasContent) {
  const store = await readEmbeddingStore(root);
  if (store && hasContent(store)) {
    const activeModel = resolveEmbeddingModel();
    if (store.model === activeModel) return store;
    warnStaleEmbeddingStore(store.model, activeModel);
  }
  return null;
}
|
|
1841
2264
|
async function collectPageRecords(root) {
|
|
1842
2265
|
const records = [];
|
|
1843
2266
|
for (const dir of [CONCEPTS_DIR, QUERIES_DIR]) {
|
|
1844
|
-
const absDir =
|
|
2267
|
+
const absDir = path17.join(root, dir);
|
|
1845
2268
|
let files;
|
|
1846
2269
|
try {
|
|
1847
2270
|
files = await readdir5(absDir);
|
|
@@ -1849,18 +2272,23 @@ async function collectPageRecords(root) {
|
|
|
1849
2272
|
continue;
|
|
1850
2273
|
}
|
|
1851
2274
|
for (const file of files.filter((f) => f.endsWith(".md"))) {
|
|
1852
|
-
const
|
|
1853
|
-
|
|
1854
|
-
if (meta.orphaned || typeof meta.title !== "string") continue;
|
|
1855
|
-
records.push({
|
|
1856
|
-
slug: file.replace(/\.md$/, ""),
|
|
1857
|
-
title: meta.title,
|
|
1858
|
-
summary: typeof meta.summary === "string" ? meta.summary : ""
|
|
1859
|
-
});
|
|
2275
|
+
const record = await readPageRecord(absDir, file);
|
|
2276
|
+
if (record) records.push(record);
|
|
1860
2277
|
}
|
|
1861
2278
|
}
|
|
1862
2279
|
return records;
|
|
1863
2280
|
}
|
|
2281
|
+
/**
 * Read one wiki page and turn it into a record for embedding.
 *
 * @param {string} absDir - Directory containing the page.
 * @param {string} file - File name; a trailing ".md" is stripped to form the slug.
 * @returns {Promise<{ slug: string, title: string, summary: string, body: string } | null>}
 *   `null` when the frontmatter marks the page as orphaned or has no string
 *   title. A non-string summary becomes "".
 */
async function readPageRecord(absDir, file) {
  const raw = await safeReadFile(path17.join(absDir, file));
  const page = parseFrontmatter(raw);
  const { meta } = page;
  const isUsable = !meta.orphaned && typeof meta.title === "string";
  if (!isUsable) return null;

  const slug = file.replace(/\.md$/, "");
  const summary = typeof meta.summary === "string" ? meta.summary : "";
  return { slug, title: meta.title, summary, body: page.body };
}
|
|
1864
2292
|
function buildEmbeddingText(record) {
|
|
1865
2293
|
return record.summary ? `${record.title}
|
|
1866
2294
|
|
|
@@ -1913,6 +2341,56 @@ function mergeEntries(existing, fresh, liveSlugs) {
|
|
|
1913
2341
|
}
|
|
1914
2342
|
return Array.from(bySlug.values());
|
|
1915
2343
|
}
|
|
2344
|
+
/**
 * Rebuild the chunk list for all live pages.
 *
 * Previously stored chunks whose page no longer exists are discarded before
 * indexing; each record is then processed in turn by `embedRecordChunks`,
 * and all chunks produced in one call share the same timestamp.
 *
 * @param {Array<{ slug: string }>} records - Live page records.
 * @param {Array<{ slug: string }>} existing - Chunks from the previous store.
 * @param {boolean} forceAll - Forwarded to `embedRecordChunks`.
 * @returns {Promise<object[]>} Chunks for every record, in record order.
 */
async function refreshChunkEmbeddings(records, existing, forceAll) {
  const liveSlugs = new Set(records.map((r) => r.slug));
  const survivors = existing.filter((c) => liveSlugs.has(c.slug));
  const existingByKey = indexChunksByKey(survivors);
  const now = new Date().toISOString();

  const refreshed = [];
  for (const record of records) {
    const pageChunks = await embedRecordChunks(record, existingByKey, forceAll, now);
    for (const chunk of pageChunks) refreshed.push(chunk);
  }
  return refreshed;
}
|
|
2355
|
+
/**
 * Produce the embedded chunks for one page.
 *
 * The body is split into chunks. A chunk that `pickReusableChunk` can match
 * to a stored chunk is reused with only its title refreshed; every other
 * chunk is embedded with the active provider, one at a time.
 *
 * @param {{ slug: string, title: string, body: string }} record
 * @param {Map<string, object>} existingByKey - Stored chunks by chunk key.
 * @param {boolean} forceAll - Forwarded to `pickReusableChunk`.
 * @param {string} now - Timestamp written to newly embedded chunks.
 * @returns {Promise<object[]>} Chunks in body order.
 */
async function embedRecordChunks(record, existingByKey, forceAll, now) {
  const provider = getProvider();
  const { slug, title } = record;
  const embedded = [];
  for (const [chunkIndex, text] of splitIntoChunks(record.body).entries()) {
    const contentHash = hashChunkText(text);
    const cached = pickReusableChunk(existingByKey, slug, chunkIndex, contentHash, forceAll);
    if (cached) {
      embedded.push({ ...cached, title });
      continue;
    }
    const vector = await provider.embed(text);
    embedded.push({ slug, title, chunkIndex, contentHash, text, vector, updatedAt: now });
  }
  return embedded;
}
|
|
2380
|
+
/**
 * Index chunks by their chunk key (see `chunkKey`).
 *
 * @param {Array<{ slug: string, chunkIndex: number }>} chunks
 * @returns {Map<string, object>} When two chunks share a key, the later one wins.
 */
function indexChunksByKey(chunks) {
  return new Map(chunks.map((chunk) => [chunkKey(chunk.slug, chunk.chunkIndex), chunk]));
}
|
|
2385
|
+
/** Build the lookup key for a chunk: the slug and chunk index joined by "#". */
function chunkKey(slug, chunkIndex) {
  return "".concat(slug, "#", chunkIndex);
}
|
|
2388
|
+
/**
 * Find a stored chunk that can be reused instead of re-embedding.
 *
 * @param {Map<string, { contentHash: string }>} byKey - Stored chunks by chunk key.
 * @param {string} slug
 * @param {number} chunkIndex
 * @param {string} contentHash - Hash of the current chunk text.
 * @param {boolean} forceAll - When truthy, nothing is ever reused.
 * @returns {object | null} The stored chunk at the same slug and index whose
 *   hash equals `contentHash`, else `null`.
 */
function pickReusableChunk(byKey, slug, chunkIndex, contentHash, forceAll) {
  if (forceAll) return null;
  const cached = byKey.get(chunkKey(slug, chunkIndex));
  const isUnchanged = Boolean(cached) && cached.contentHash === contentHash;
  return isUnchanged ? cached : null;
}
|
|
1916
2394
|
async function updateEmbeddings(root, changedSlugs) {
|
|
1917
2395
|
const records = await collectPageRecords(root);
|
|
1918
2396
|
const liveSlugs = new Set(records.map((r) => r.slug));
|
|
@@ -1921,29 +2399,51 @@ async function updateEmbeddings(root, changedSlugs) {
|
|
|
1921
2399
|
const modelChanged = Boolean(existingStore && existingStore.model !== embeddingModel);
|
|
1922
2400
|
const toEmbed = new Set(changedSlugs.filter((slug) => liveSlugs.has(slug)));
|
|
1923
2401
|
const previousEntries = modelChanged ? [] : existingStore?.entries ?? [];
|
|
1924
|
-
|
|
2402
|
+
const previousChunks = modelChanged ? [] : existingStore?.chunks ?? [];
|
|
2403
|
+
const isEmptyStore = isStoreEmpty(existingStore);
|
|
2404
|
+
if (!existingStore || modelChanged || isEmptyStore && liveSlugs.size > 0) {
|
|
1925
2405
|
for (const record of records) toEmbed.add(record.slug);
|
|
1926
2406
|
}
|
|
1927
|
-
if (!modelChanged
|
|
2407
|
+
if (!shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs)) {
|
|
1928
2408
|
return;
|
|
1929
2409
|
}
|
|
1930
2410
|
const freshEntries = await embedPages(records, toEmbed);
|
|
1931
2411
|
const mergedEntries = mergeEntries(previousEntries, freshEntries, liveSlugs);
|
|
1932
|
-
const
|
|
2412
|
+
const mergedChunks = await refreshChunkEmbeddings(records, previousChunks, modelChanged);
|
|
2413
|
+
await persistRefreshedStore(root, embeddingModel, mergedEntries, mergedChunks);
|
|
2414
|
+
}
|
|
2415
|
+
/**
 * Assemble the embedding store object, write it via `writeEmbeddingStore`,
 * and report the page/chunk counts through `status`.
 *
 * `dimensions` is taken from the first page entry's vector, falling back to the
 * first chunk's vector, and finally to 0 when neither is available.
 *
 * @param {string} root - Project root passed through to `writeEmbeddingStore`.
 * @param {string} embeddingModel - Model identifier recorded in the store.
 * @param {Array<{vector: number[]}>} entries - Page-level embedding entries.
 * @param {Array<{vector: number[]}>} chunks - Chunk-level embedding entries.
 * @returns {Promise<void>}
 */
async function persistRefreshedStore(root, embeddingModel, entries, chunks) {
  // Optional chaining on `vector` too: a record without a vector should fall
  // through to the next source rather than throw.
  const dimensions = entries[0]?.vector?.length ?? chunks[0]?.vector?.length ?? 0;
  const store = {
    version: STORE_VERSION,
    model: embeddingModel,
    dimensions,
    entries,
    chunks
  };
  await writeEmbeddingStore(root, store);
  status(
    "*",
    dim(`Embeddings updated (${entries.length} pages, ${chunks.length} chunks).`)
  );
}
|
|
2430
|
+
/**
 * Report whether an existing embedding store holds no page entries and no chunks.
 *
 * A missing store (`null`/`undefined`) is NOT considered empty: it yields `false`.
 * A store lacking an `entries` or `chunks` array is treated as having zero of
 * them instead of throwing.
 *
 * @param {{entries?: unknown[], chunks?: unknown[]}|null|undefined} store
 * @returns {boolean} `true` only for a present store with nothing in it.
 */
function isStoreEmpty(store) {
  if (!store) return false;
  const entryCount = store.entries?.length ?? 0;
  const chunkCount = store.chunks?.length ?? 0;
  return entryCount === 0 && chunkCount === 0;
}
|
|
2434
|
+
/**
 * Decide whether the embedding refresh has any work to do.
 *
 * Returns `true` when any of the following holds:
 *  - the embedding model changed;
 *  - at least one slug is queued in `toEmbed`;
 *  - a stored page entry or chunk refers to a slug that is no longer live;
 *  - page entries exist but no chunks do, while live pages exist.
 *
 * @param {boolean} modelChanged
 * @param {Set<string>} toEmbed - Slugs queued for re-embedding.
 * @param {Array<{slug: string}>} previousEntries - Stored page entries.
 * @param {Array<{slug: string}>} previousChunks - Stored chunk entries.
 * @param {Set<string>} liveSlugs - Slugs of pages that currently exist.
 * @returns {boolean}
 */
function shouldRunEmbedding(modelChanged, toEmbed, previousEntries, previousChunks, liveSlugs) {
  if (modelChanged || toEmbed.size > 0) return true;
  const isStale = (record) => !liveSlugs.has(record.slug);
  if (previousEntries.some(isStale) || previousChunks.some(isStale)) return true;
  return previousEntries.length > 0 && previousChunks.length === 0 && liveSlugs.size > 0;
}
|
|
1942
2442
|
|
|
1943
2443
|
// src/compiler/candidates.ts
|
|
1944
2444
|
import { readdir as readdir6, rename as rename3, unlink as unlink2, writeFile as writeFile4, mkdir as mkdir5 } from "fs/promises";
|
|
1945
2445
|
import { existsSync as existsSync5 } from "fs";
|
|
1946
|
-
import
|
|
2446
|
+
import path18 from "path";
|
|
1947
2447
|
import { randomBytes } from "crypto";
|
|
1948
2448
|
var ID_SUFFIX_BYTES = 4;
|
|
1949
2449
|
var CANDIDATE_EXT = ".json";
|
|
@@ -1952,10 +2452,10 @@ function buildCandidateId(slug) {
|
|
|
1952
2452
|
return `${slug}-${suffix}`;
|
|
1953
2453
|
}
|
|
1954
2454
|
/**
 * Resolve the on-disk location of a pending review candidate.
 *
 * @param {string} root - Project root directory.
 * @param {string} id - Candidate identifier (file name without extension).
 * @returns {string} `<root>/<CANDIDATES_DIR>/<id><CANDIDATE_EXT>`.
 */
function candidatePath(root, id) {
  const fileName = `${id}${CANDIDATE_EXT}`;
  return path18.join(root, CANDIDATES_DIR, fileName);
}
|
|
1957
2457
|
/**
 * Resolve the on-disk location a candidate is moved to when archived.
 *
 * @param {string} root - Project root directory.
 * @param {string} id - Candidate identifier (file name without extension).
 * @returns {string} `<root>/<CANDIDATES_ARCHIVE_DIR>/<id><CANDIDATE_EXT>`.
 */
function archivePath(root, id) {
  const fileName = `${id}${CANDIDATE_EXT}`;
  return path18.join(root, CANDIDATES_ARCHIVE_DIR, fileName);
}
|
|
1960
2460
|
async function writeCandidate(root, draft) {
|
|
1961
2461
|
const candidate = {
|
|
@@ -2006,7 +2506,7 @@ function isValidCandidate(value) {
|
|
|
2006
2506
|
return typeof candidate.id === "string" && typeof candidate.title === "string" && typeof candidate.slug === "string" && typeof candidate.body === "string" && Array.isArray(candidate.sources);
|
|
2007
2507
|
}
|
|
2008
2508
|
async function listCandidates(root) {
|
|
2009
|
-
const dir =
|
|
2509
|
+
const dir = path18.join(root, CANDIDATES_DIR);
|
|
2010
2510
|
if (!existsSync5(dir)) return [];
|
|
2011
2511
|
const entries = await readdir6(dir, { withFileTypes: true });
|
|
2012
2512
|
const candidates = [];
|
|
@@ -2033,7 +2533,7 @@ async function archiveCandidate(root, id) {
|
|
|
2033
2533
|
const sourcePath = candidatePath(root, id);
|
|
2034
2534
|
if (!existsSync5(sourcePath)) return false;
|
|
2035
2535
|
const target = archivePath(root, id);
|
|
2036
|
-
await mkdir5(
|
|
2536
|
+
await mkdir5(path18.dirname(target), { recursive: true });
|
|
2037
2537
|
try {
|
|
2038
2538
|
await rename3(sourcePath, target);
|
|
2039
2539
|
} catch {
|
|
@@ -2045,9 +2545,9 @@ async function archiveCandidate(root, id) {
|
|
|
2045
2545
|
}
|
|
2046
2546
|
|
|
2047
2547
|
// src/linter/rules.ts
|
|
2048
|
-
import { readdir as readdir7, readFile as
|
|
2548
|
+
import { readdir as readdir7, readFile as readFile13 } from "fs/promises";
|
|
2049
2549
|
import { existsSync as existsSync6 } from "fs";
|
|
2050
|
-
import
|
|
2550
|
+
import path19 from "path";
|
|
2051
2551
|
var MIN_BODY_LENGTH = 50;
|
|
2052
2552
|
var WIKILINK_PATTERN2 = /\[\[([^\]]+)\]\]/g;
|
|
2053
2553
|
var CITATION_PATTERN = /\^\[([^\]]+)\]/g;
|
|
@@ -2068,22 +2568,22 @@ async function readMarkdownFiles(dirPath) {
|
|
|
2068
2568
|
const mdFiles = entries.filter((f) => f.endsWith(".md"));
|
|
2069
2569
|
const results = await Promise.all(
|
|
2070
2570
|
mdFiles.map(async (fileName) => {
|
|
2071
|
-
const filePath =
|
|
2072
|
-
const content = await
|
|
2571
|
+
const filePath = path19.join(dirPath, fileName);
|
|
2572
|
+
const content = await readFile13(filePath, "utf-8");
|
|
2073
2573
|
return { filePath, content };
|
|
2074
2574
|
})
|
|
2075
2575
|
);
|
|
2076
2576
|
return results;
|
|
2077
2577
|
}
|
|
2078
2578
|
/**
 * Load every markdown page from the concepts and queries directories.
 *
 * The two directories are independent, so they are read concurrently; the
 * result still lists concept pages first, followed by query pages.
 *
 * @param {string} root - Project root directory.
 * @returns {Promise<Array<object>>} Whatever `readMarkdownFiles` yields for
 *   each directory, concatenated (concepts, then queries).
 */
async function collectAllPages(root) {
  const [conceptPages, queryPages] = await Promise.all([
    readMarkdownFiles(path19.join(root, CONCEPTS_DIR)),
    readMarkdownFiles(path19.join(root, QUERIES_DIR))
  ]);
  return [...conceptPages, ...queryPages];
}
|
|
2083
2583
|
function buildPageSlugSet(pages) {
|
|
2084
2584
|
const slugs = /* @__PURE__ */ new Set();
|
|
2085
2585
|
for (const page of pages) {
|
|
2086
|
-
const baseName =
|
|
2586
|
+
const baseName = path19.basename(page.filePath, ".md");
|
|
2087
2587
|
slugs.add(baseName.toLowerCase());
|
|
2088
2588
|
}
|
|
2089
2589
|
return slugs;
|
|
@@ -2318,7 +2818,7 @@ function countLines(content) {
|
|
|
2318
2818
|
}
|
|
2319
2819
|
async function checkBrokenCitations(root) {
|
|
2320
2820
|
const pages = await collectAllPages(root);
|
|
2321
|
-
const sourcesDir =
|
|
2821
|
+
const sourcesDir = path19.join(root, SOURCES_DIR);
|
|
2322
2822
|
const results = [];
|
|
2323
2823
|
const lineCountCache = /* @__PURE__ */ new Map();
|
|
2324
2824
|
for (const page of pages) {
|
|
@@ -2333,7 +2833,7 @@ async function collectBrokenForMarker(captured, line, pageFile, sourcesDir, line
|
|
|
2333
2833
|
const trimmed = part.trim();
|
|
2334
2834
|
if (trimmed.length === 0) continue;
|
|
2335
2835
|
const filename = stripSpanSuffix(trimmed);
|
|
2336
|
-
const citedPath =
|
|
2836
|
+
const citedPath = path19.join(sourcesDir, filename);
|
|
2337
2837
|
if (!existsSync6(citedPath)) {
|
|
2338
2838
|
out.push({
|
|
2339
2839
|
rule: "broken-citation",
|
|
@@ -2387,7 +2887,7 @@ async function checkMalformedClaimCitations(root) {
|
|
|
2387
2887
|
|
|
2388
2888
|
// src/compiler/page-renderer.ts
|
|
2389
2889
|
import { readdir as readdir8 } from "fs/promises";
|
|
2390
|
-
import
|
|
2890
|
+
import path20 from "path";
|
|
2391
2891
|
|
|
2392
2892
|
// src/compiler/provenance.ts
|
|
2393
2893
|
function addProvenanceMeta(fields, concept) {
|
|
@@ -2417,7 +2917,7 @@ function reportContradictionWarnings(conceptTitle, concept) {
|
|
|
2417
2917
|
// src/compiler/page-renderer.ts
|
|
2418
2918
|
var RELATED_PAGE_CONTEXT_LIMIT = 5;
|
|
2419
2919
|
async function renderMergedPageContent(root, entry, schema) {
|
|
2420
|
-
const pagePath =
|
|
2920
|
+
const pagePath = path20.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
|
|
2421
2921
|
const existingPage = await safeReadFile(pagePath);
|
|
2422
2922
|
const relatedPages = await loadRelatedPages(root, entry.slug);
|
|
2423
2923
|
const system = buildPagePrompt(
|
|
@@ -2456,7 +2956,7 @@ function buildMergedFrontmatter(entry, existingPage, schema) {
|
|
|
2456
2956
|
return buildFrontmatter(frontmatterFields);
|
|
2457
2957
|
}
|
|
2458
2958
|
async function loadRelatedPages(root, excludeSlug) {
|
|
2459
|
-
const conceptsPath =
|
|
2959
|
+
const conceptsPath = path20.join(root, CONCEPTS_DIR);
|
|
2460
2960
|
let files;
|
|
2461
2961
|
try {
|
|
2462
2962
|
files = await readdir8(conceptsPath);
|
|
@@ -2466,7 +2966,7 @@ async function loadRelatedPages(root, excludeSlug) {
|
|
|
2466
2966
|
const related = files.filter((f) => f.endsWith(".md") && f !== `${excludeSlug}.md`).slice(0, RELATED_PAGE_CONTEXT_LIMIT);
|
|
2467
2967
|
const contents = [];
|
|
2468
2968
|
for (const f of related) {
|
|
2469
|
-
const content = await safeReadFile(
|
|
2969
|
+
const content = await safeReadFile(path20.join(conceptsPath, f));
|
|
2470
2970
|
if (!content) continue;
|
|
2471
2971
|
const { meta } = parseFrontmatter(content);
|
|
2472
2972
|
if (meta.orphaned) continue;
|
|
@@ -2667,9 +3167,9 @@ function printChangesSummary(changes) {
|
|
|
2667
3167
|
}
|
|
2668
3168
|
async function extractForSource(root, sourceFile) {
|
|
2669
3169
|
status("*", info(`Extracting: ${sourceFile}`));
|
|
2670
|
-
const sourcePath =
|
|
2671
|
-
const sourceContent = await
|
|
2672
|
-
const existingIndex = await safeReadFile(
|
|
3170
|
+
const sourcePath = path21.join(root, SOURCES_DIR, sourceFile);
|
|
3171
|
+
const sourceContent = await readFile14(sourcePath, "utf-8");
|
|
3172
|
+
const existingIndex = await safeReadFile(path21.join(root, INDEX_FILE));
|
|
2673
3173
|
const concepts = await extractConcepts(sourceContent, existingIndex);
|
|
2674
3174
|
if (concepts.length > 0) {
|
|
2675
3175
|
const names = concepts.map((c) => c.concept).join(", ");
|
|
@@ -2732,7 +3232,7 @@ async function generateMergedPage(root, entry, schema, options, sourceStates) {
|
|
|
2732
3232
|
if (options.review) {
|
|
2733
3233
|
return await persistReviewCandidate(root, entry, fullPage, sourceStates, schema);
|
|
2734
3234
|
}
|
|
2735
|
-
const pagePath =
|
|
3235
|
+
const pagePath = path21.join(root, CONCEPTS_DIR, `${entry.slug}.md`);
|
|
2736
3236
|
const error2 = await writePageIfValid(pagePath, fullPage, entry.concept.concept);
|
|
2737
3237
|
return { error: error2 ?? void 0 };
|
|
2738
3238
|
}
|
|
@@ -2760,7 +3260,7 @@ async function generateSeedPages(root, schema, generation) {
|
|
|
2760
3260
|
}
|
|
2761
3261
|
async function generateSingleSeedPage(root, schema, seed) {
|
|
2762
3262
|
const slug = slugify(seed.title);
|
|
2763
|
-
const pagePath =
|
|
3263
|
+
const pagePath = path21.join(root, CONCEPTS_DIR, `${slug}.md`);
|
|
2764
3264
|
const relatedContent = await loadSeedRelatedPages(root, seed.relatedSlugs ?? []);
|
|
2765
3265
|
const rule = schema.kinds[seed.kind];
|
|
2766
3266
|
const system = buildSeedPagePrompt(seed, rule, relatedContent);
|
|
@@ -2792,7 +3292,7 @@ async function loadSeedRelatedPages(root, slugs) {
|
|
|
2792
3292
|
if (slugs.length === 0) return "";
|
|
2793
3293
|
const contents = [];
|
|
2794
3294
|
for (const slug of slugs) {
|
|
2795
|
-
const pagePath =
|
|
3295
|
+
const pagePath = path21.join(root, CONCEPTS_DIR, `${slug}.md`);
|
|
2796
3296
|
const content = await safeReadFile(pagePath);
|
|
2797
3297
|
if (content) contents.push(content);
|
|
2798
3298
|
}
|
|
@@ -2847,7 +3347,7 @@ async function compileCommand(options = {}) {
|
|
|
2847
3347
|
|
|
2848
3348
|
// src/commands/query.ts
|
|
2849
3349
|
import { existsSync as existsSync8 } from "fs";
|
|
2850
|
-
import
|
|
3350
|
+
import path22 from "path";
|
|
2851
3351
|
var PAGE_DIRS = [CONCEPTS_DIR, QUERIES_DIR];
|
|
2852
3352
|
var PAGE_SELECTION_TOOL = {
|
|
2853
3353
|
name: "select_pages",
|
|
@@ -2895,16 +3395,92 @@ ${indexContent}`;
|
|
|
2895
3395
|
/**
 * Render candidate pages as a markdown bullet list for the page-selection prompt.
 *
 * @param {Array<{slug: string, title: string, summary: string}>} candidates
 * @returns {string} One `- **slug**: title — summary` line per candidate,
 *   newline-separated; an empty string for no candidates.
 */
function buildFilteredIndex(candidates) {
  const lines = candidates.map(
    ({ slug, title, summary }) => `- **${slug}**: ${title} \u2014 ${summary}`
  );
  return lines.join("\n");
}
|
|
2898
|
-
/**
 * Choose which wiki pages to load for a question, trying three strategies in order:
 *
 *  1. chunk-level retrieval (`trySelectViaChunks`) — used as-is when it yields a result;
 *  2. page-level candidates (`tryFindRelevantPages`) narrowed by `selectPages`;
 *  3. `selectPages` over the full wiki index file, with the returned names slugified.
 *
 * @param {string} root - Project root directory.
 * @param {string} question - The user's question.
 * @param {boolean} debug - Forwarded to the chunk-level strategy.
 * @returns {Promise<{pages: string[], rawPages: string[], reasoning: string, chunks: object[], debug?: object}>}
 *   `chunks` is empty for strategies 2 and 3.
 */
async function selectRelevantPages(root, question, debug) {
  const chunkSelection = await trySelectViaChunks(root, question, debug);
  if (chunkSelection) return chunkSelection;

  const candidates = await tryFindRelevantPages(root, question);
  if (candidates.length > 0) {
    const filteredIndex = buildFilteredIndex(candidates);
    const narrowed = await selectPages(question, filteredIndex);
    // The filtered index is keyed by slug, so the selection is used without slugifying.
    return {
      pages: narrowed.pages,
      rawPages: narrowed.pages,
      reasoning: narrowed.reasoning,
      chunks: []
    };
  }

  const indexContent = await safeReadFile(path22.join(root, INDEX_FILE));
  const { pages: rawPages, reasoning } = await selectPages(question, indexContent);
  return { pages: rawPages.map((p) => slugify(p)), rawPages, reasoning, chunks: [] };
}
|
|
3411
|
+
/**
 * Chunk-level page selection: retrieve chunks, rerank them with
 * `rerankWithBm25`, keep the top `CHUNK_RERANK_KEEP`, and collapse the kept
 * chunks to at most `QUERY_PAGE_LIMIT` distinct page slugs.
 *
 * @param {string} root - Project root directory.
 * @param {string} question - The user's question.
 * @param {boolean} debug - When truthy, a debug snapshot is attached.
 * @returns {Promise<object|null>} `null` when no chunks were retrieved;
 *   otherwise `{pages, rawPages, reasoning, chunks, debug}` where `debug` is
 *   `undefined` unless requested.
 */
async function trySelectViaChunks(root, question, debug) {
  const ranked = await tryFindRelevantChunks(root, question);
  if (ranked.length === 0) return null;

  const rerankInput = ranked.map(({ chunk, score }) => ({
    text: chunk.text,
    baseScore: score,
    chunk
  }));
  const kept = rerankWithBm25(question, rerankInput).slice(0, CHUNK_RERANK_KEEP);
  const keptChunks = kept.map((item) => item.candidate.chunk);
  const reorderingHappened = wasReordered(ranked, keptChunks);
  const chunkCitations = toChunkCitations(kept);
  const pageSlugs = collapseToPages(chunkCitations, QUERY_PAGE_LIMIT);
  const reasoning = buildChunkReasoning(chunkCitations, pageSlugs);

  return {
    pages: pageSlugs,
    rawPages: pageSlugs,
    reasoning,
    chunks: chunkCitations,
    debug: debug ? buildDebug(chunkCitations, pageSlugs, reorderingHappened) : void 0
  };
}
|
|
3431
|
+
/**
 * Detect whether reranking changed the order of the leading chunks.
 *
 * Only the overlapping prefix (the shorter of the two lengths) is compared, and
 * chunks are compared by object identity.
 *
 * @param {Array<{chunk: object}>} before - Ranked results prior to reranking.
 * @param {object[]} after - Chunk objects in their reranked order.
 * @returns {boolean} `true` if any position in the shared prefix holds a different chunk.
 */
function wasReordered(before, after) {
  return after
    .slice(0, before.length)
    .some((chunk, position) => before[position].chunk !== chunk);
}
|
|
3438
|
+
/**
 * Flatten reranked results into plain citation records.
 *
 * @param {Array<{candidate: {chunk: {slug: string, title: string, chunkIndex: number, text: string}}, score: number}>} ranked
 * @returns {Array<{slug: string, title: string, chunkIndex: number, score: number, text: string}>}
 *   One record per input, in the same order, carrying the reranked score.
 */
function toChunkCitations(ranked) {
  return ranked.map(({ candidate, score }) => {
    const { slug, title, chunkIndex, text } = candidate.chunk;
    return { slug, title, chunkIndex, score, text };
  });
}
|
|
3447
|
+
/**
 * Reduce an ordered list of chunks to the distinct page slugs they belong to,
 * preserving first-appearance order and stopping once `limit` slugs are collected.
 *
 * @param {Array<{slug: string}>} chunks - Chunks in descending relevance order.
 * @param {number} limit - Maximum number of slugs to return.
 * @returns {string[]} Up to `limit` unique slugs; empty when `limit` is zero or negative.
 */
function collapseToPages(chunks, limit) {
  const slugs = [];
  // Guard first: the in-loop check only runs after a push, so without this a
  // limit of 0 would still let one slug through.
  if (limit <= 0) return slugs;
  const seen = /* @__PURE__ */ new Set();
  for (const { slug } of chunks) {
    if (seen.has(slug)) continue;
    seen.add(slug);
    slugs.push(slug);
    if (slugs.length >= limit) break;
  }
  return slugs;
}
|
|
3458
|
+
/**
 * Produce the human-readable explanation shown for a chunk-based selection.
 *
 * @param {Array<{slug: string, chunkIndex: number, score: number}>} chunks - Reranked chunk citations.
 * @param {string[]} pages - Page slugs that were selected.
 * @returns {string} A sentence naming the page count, chunk count, and the
 *   leading chunks with their scores to three decimals.
 */
function buildChunkReasoning(chunks, pages) {
  // NOTE(review): the summary lists the first `pages.length` chunks, which may
  // include several chunks of one page rather than one chunk per selected
  // page — confirm this is the intended presentation.
  const summary = chunks
    .slice(0, pages.length)
    .map(({ slug, chunkIndex, score }) => `${slug}#${chunkIndex} (${score.toFixed(3)})`)
    .join(", ");
  return `Selected ${pages.length} page(s) from ${chunks.length} reranked chunks: ${summary}`;
}
|
|
3463
|
+
/**
 * Build the retrieval debug snapshot for a chunk-based selection.
 *
 * @param {Array<{slug: string, score: number}>} chunks - Reranked chunk citations.
 * @param {string[]} pageSlugs - Selected page slugs.
 * @param {boolean} reranked - Whether reranking changed the chunk order.
 * @returns {{pages: Array<{slug: string, score: number}>, chunks: object[], usedChunks: true, reranked: boolean}}
 *   Each page is paired with the highest score among its chunks (0 when the
 *   page has no chunk in `chunks`).
 */
function buildDebug(chunks, pageSlugs, reranked) {
  const bestPerPage = /* @__PURE__ */ new Map();
  for (const { slug, score } of chunks) {
    const best = bestPerPage.get(slug);
    if (best === void 0 || score > best) bestPerPage.set(slug, score);
  }
  const pages = pageSlugs.map((slug) => ({ slug, score: bestPerPage.get(slug) ?? 0 }));
  return { pages, chunks, usedChunks: true, reranked };
}
|
|
3476
|
+
/**
 * Best-effort wrapper around `findRelevantChunks`.
 *
 * Any failure is reported through `status` and converted into an empty result,
 * so callers can fall back to another selection strategy.
 *
 * @param {string} root - Project root directory.
 * @param {string} question - The user's question.
 * @returns {Promise<Array<object>>} Up to `CHUNK_TOP_K` ranked chunks, or `[]` on error.
 */
async function tryFindRelevantChunks(root, question) {
  try {
    return await findRelevantChunks(root, question, CHUNK_TOP_K);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    status("!", dim(`Chunk pre-filter unavailable (${message}); falling back.`));
    return [];
  }
}
|
|
2909
3485
|
async function tryFindRelevantPages(root, question) {
|
|
2910
3486
|
try {
|
|
@@ -2920,7 +3496,7 @@ async function loadSelectedPages(root, slugs) {
|
|
|
2920
3496
|
for (const slug of slugs) {
|
|
2921
3497
|
let content = "";
|
|
2922
3498
|
for (const dir of PAGE_DIRS) {
|
|
2923
|
-
const candidate = await safeReadFile(
|
|
3499
|
+
const candidate = await safeReadFile(path22.join(root, dir, `${slug}.md`));
|
|
2924
3500
|
if (!candidate) continue;
|
|
2925
3501
|
const { meta } = parseFrontmatter(candidate);
|
|
2926
3502
|
if (meta.orphaned) continue;
|
|
@@ -2937,11 +3513,12 @@ ${content}`);
|
|
|
2937
3513
|
return sections.join("\n\n");
|
|
2938
3514
|
}
|
|
2939
3515
|
var ANSWER_SYSTEM_PROMPT = "You are a knowledge assistant. Answer the question using ONLY the wiki content provided. Cite specific pages using [[Page Title]] wikilinks. If the wiki doesn't contain enough information, say so.";
|
|
2940
|
-
async function callAnswerLLM(question, pagesContent, onToken) {
|
|
3516
|
+
async function callAnswerLLM(question, pagesContent, chunks, onToken) {
|
|
3517
|
+
const provenance = chunks.length > 0 ? buildChunkProvenance(chunks) : "";
|
|
2941
3518
|
const userMessage = `Question: ${question}
|
|
2942
3519
|
|
|
2943
3520
|
Relevant wiki pages:
|
|
2944
|
-
${pagesContent}`;
|
|
3521
|
+
${pagesContent}${provenance}`;
|
|
2945
3522
|
return callClaude({
|
|
2946
3523
|
system: ANSWER_SYSTEM_PROMPT,
|
|
2947
3524
|
messages: [{ role: "user", content: userMessage }],
|
|
@@ -2949,6 +3526,16 @@ ${pagesContent}`;
|
|
|
2949
3526
|
onToken
|
|
2950
3527
|
});
|
|
2951
3528
|
}
|
|
3529
|
+
/**
 * Render the selected chunks as an "excerpts" appendix for the answer prompt.
 *
 * @param {Array<{slug: string, chunkIndex: number, text: string}>} chunks
 * @returns {string} Two leading newlines, a heading line, then one
 *   `--- slug (chunk N) ---` section per chunk separated by blank lines.
 */
function buildChunkProvenance(chunks) {
  const sections = chunks.map(
    ({ slug, chunkIndex, text }) => `--- ${slug} (chunk ${chunkIndex}) ---\n${text}`
  );
  const heading = "Most relevant excerpts (from chunk-level retrieval):";
  return `\n\n${heading}\n${sections.join("\n\n")}`;
}
|
|
2952
3539
|
function summarizeAnswer(answer) {
|
|
2953
3540
|
const firstLine = answer.trim().split(/\n/)[0] ?? "";
|
|
2954
3541
|
const firstSentence = firstLine.split(/(?<=[.!?])\s/)[0] ?? firstLine;
|
|
@@ -2956,7 +3543,7 @@ function summarizeAnswer(answer) {
|
|
|
2956
3543
|
}
|
|
2957
3544
|
async function saveQueryPage(root, question, answer) {
|
|
2958
3545
|
const slug = slugify(question);
|
|
2959
|
-
const filePath =
|
|
3546
|
+
const filePath = path22.join(root, QUERIES_DIR, `${slug}.md`);
|
|
2960
3547
|
const frontmatter = buildFrontmatter({
|
|
2961
3548
|
title: question,
|
|
2962
3549
|
summary: summarizeAnswer(answer),
|
|
@@ -2982,30 +3569,42 @@ ${answer}
|
|
|
2982
3569
|
return slug;
|
|
2983
3570
|
}
|
|
2984
3571
|
/**
 * Answer a question from the wiki: select pages, load them, call the answer
 * LLM, and optionally persist the answer as a query page.
 *
 * @param {string} root - Project root directory.
 * @param {string} question - The user's question.
 * @param {object} [options]
 * @param {boolean} [options.save] - Persist the answer via `saveQueryPage`.
 * @param {boolean} [options.debug] - Request retrieval debug info.
 * @param {(text: string) => void} [options.onToken] - Forwarded to the answer LLM call.
 * @param {(pages: string[], reasoning: string) => void} [options.onPageSelection]
 *   Invoked once pages are chosen, before they are loaded.
 * @returns {Promise<{answer: string, selectedPages: string[], reasoning: string, saved?: string, debug?: object}>}
 *   When none of the selected pages yields content, the result has an empty
 *   `answer` and no `saved` key.
 * @throws {Error} If the wiki index file does not exist under `root`.
 */
async function generateAnswer(root, question, options = {}) {
  if (!existsSync8(path22.join(root, INDEX_FILE))) {
    throw new Error("Wiki index not found. Run `llmwiki compile` first.");
  }

  const selection = await selectRelevantPages(root, question, Boolean(options.debug));
  options.onPageSelection?.(selection.pages, selection.reasoning);

  const pagesContent = await loadSelectedPages(root, selection.pages);
  if (!pagesContent) {
    return buildEmptyResult(selection);
  }

  const answer = await callAnswerLLM(question, pagesContent, selection.chunks, options.onToken);
  const saved = options.save ? await saveQueryPage(root, question, answer) : void 0;
  return {
    answer,
    selectedPages: selection.pages,
    reasoning: selection.reasoning,
    saved,
    debug: selection.debug
  };
}
|
|
3591
|
+
/**
 * Shape the result returned when page selection produced no loadable content.
 *
 * @param {{pages: string[], reasoning: string, debug?: object}} selection
 * @returns {{answer: string, selectedPages: string[], reasoning: string, debug: object|undefined}}
 *   An empty `answer` plus the selection details.
 */
function buildEmptyResult(selection) {
  const { pages: selectedPages, reasoning, debug } = selection;
  return { answer: "", selectedPages, reasoning, debug };
}
|
|
3001
3599
|
async function queryCommand(root, question, options) {
|
|
3002
|
-
if (!existsSync8(
|
|
3600
|
+
if (!existsSync8(path22.join(root, INDEX_FILE))) {
|
|
3003
3601
|
status("!", error("Wiki index not found. Run `llmwiki compile` first."));
|
|
3004
3602
|
return;
|
|
3005
3603
|
}
|
|
3006
3604
|
header("Selecting relevant pages");
|
|
3007
3605
|
const result = await generateAnswer(root, question, {
|
|
3008
3606
|
save: options.save,
|
|
3607
|
+
debug: options.debug,
|
|
3009
3608
|
onToken: (text) => process.stdout.write(text),
|
|
3010
3609
|
onPageSelection: (pages, reasoning) => {
|
|
3011
3610
|
status("i", dim(`Reasoning: ${reasoning}`));
|
|
@@ -3014,6 +3613,7 @@ async function queryCommand(root, question, options) {
|
|
|
3014
3613
|
}
|
|
3015
3614
|
});
|
|
3016
3615
|
process.stdout.write("\n");
|
|
3616
|
+
if (result.debug) printDebugSnapshot(result.debug);
|
|
3017
3617
|
if (!result.answer) {
|
|
3018
3618
|
status("!", error("No matching pages found. Try refining your question."));
|
|
3019
3619
|
return;
|
|
@@ -3024,14 +3624,34 @@ async function queryCommand(root, question, options) {
|
|
|
3024
3624
|
status("\u2192", dim("Tip: use --save to add this answer to your wiki"));
|
|
3025
3625
|
}
|
|
3026
3626
|
}
|
|
3627
|
+
/**
 * Print the retrieval debug snapshot: a header, a source/rerank summary line,
 * one line per selected page, and one preview line per chunk.
 *
 * Chunk previews are the first `DEBUG_CHUNK_PREVIEW_CHARS` characters of the
 * chunk text with whitespace runs collapsed; an ellipsis is appended only when
 * the text was actually cut.
 *
 * @param {{usedChunks: boolean, reranked: boolean,
 *          pages: Array<{slug: string, score: number}>,
 *          chunks: Array<{slug: string, chunkIndex: number, score: number, text: string}>}} debug
 * @returns {void}
 */
function printDebugSnapshot(debug) {
  header("Retrieval debug");
  const sourceLabel = debug.usedChunks ? "chunk-level" : "page-level";
  const rerankLabel = debug.reranked ? "yes" : "no";
  status("i", dim(`Source: ${sourceLabel}; reranked: ${rerankLabel}`));

  for (const page of debug.pages) {
    status("\u2022", `${page.slug} (best chunk score ${page.score.toFixed(3)})`);
  }

  for (const chunk of debug.chunks) {
    const preview = chunk.text.slice(0, DEBUG_CHUNK_PREVIEW_CHARS).replace(/\s+/g, " ").trim();
    // Only signal truncation when text beyond the preview window was dropped.
    const ellipsis = chunk.text.length > DEBUG_CHUNK_PREVIEW_CHARS ? "\u2026" : "";
    status(
      "\xB7",
      dim(`${chunk.slug}#${chunk.chunkIndex} score=${chunk.score.toFixed(3)} :: ${preview}${ellipsis}`)
    );
  }
}
|
|
3646
|
+
var DEBUG_CHUNK_PREVIEW_CHARS = 120;
|
|
3027
3647
|
|
|
3028
3648
|
// src/commands/watch.ts
|
|
3029
3649
|
import { watch as chokidarWatch } from "chokidar";
|
|
3030
3650
|
import { existsSync as existsSync9 } from "fs";
|
|
3031
|
-
import
|
|
3651
|
+
import path23 from "path";
|
|
3032
3652
|
var DEBOUNCE_MS = 500;
|
|
3033
3653
|
async function watchCommand() {
|
|
3034
|
-
const sourcesPath =
|
|
3654
|
+
const sourcesPath = path23.resolve(SOURCES_DIR);
|
|
3035
3655
|
if (!existsSync9(sourcesPath)) {
|
|
3036
3656
|
status(
|
|
3037
3657
|
"!",
|
|
@@ -3066,7 +3686,7 @@ async function watchCommand() {
|
|
|
3066
3686
|
const scheduleCompile = (eventPath, event) => {
|
|
3067
3687
|
status(
|
|
3068
3688
|
"~",
|
|
3069
|
-
dim(`${event}: ${
|
|
3689
|
+
dim(`${event}: ${path23.basename(eventPath)}`)
|
|
3070
3690
|
);
|
|
3071
3691
|
if (debounceTimer) clearTimeout(debounceTimer);
|
|
3072
3692
|
debounceTimer = setTimeout(triggerCompile, DEBOUNCE_MS);
|
|
@@ -3153,7 +3773,7 @@ async function lintCommand() {
|
|
|
3153
3773
|
// src/commands/schema.ts
|
|
3154
3774
|
import { existsSync as existsSync10 } from "fs";
|
|
3155
3775
|
import { mkdir as mkdir6, writeFile as writeFile5 } from "fs/promises";
|
|
3156
|
-
import
|
|
3776
|
+
import path24 from "path";
|
|
3157
3777
|
async function schemaInitCommand() {
|
|
3158
3778
|
const root = process.cwd();
|
|
3159
3779
|
const defaults = buildDefaultSchema();
|
|
@@ -3162,7 +3782,7 @@ async function schemaInitCommand() {
|
|
|
3162
3782
|
status("!", warn(`Schema file already exists at ${targetPath}`));
|
|
3163
3783
|
return;
|
|
3164
3784
|
}
|
|
3165
|
-
await mkdir6(
|
|
3785
|
+
await mkdir6(path24.dirname(targetPath), { recursive: true });
|
|
3166
3786
|
const serializable = {
|
|
3167
3787
|
version: defaults.version,
|
|
3168
3788
|
defaultKind: defaults.defaultKind,
|
|
@@ -3221,7 +3841,7 @@ async function reviewShowCommand(id) {
|
|
|
3221
3841
|
}
|
|
3222
3842
|
|
|
3223
3843
|
// src/commands/review-approve.ts
|
|
3224
|
-
import
|
|
3844
|
+
import path25 from "path";
|
|
3225
3845
|
|
|
3226
3846
|
// src/commands/review-helpers.ts
|
|
3227
3847
|
async function runReviewUnderLock(id, underLock) {
|
|
@@ -3253,7 +3873,7 @@ async function approveUnderLock(root, id) {
|
|
|
3253
3873
|
process.exitCode = 1;
|
|
3254
3874
|
return;
|
|
3255
3875
|
}
|
|
3256
|
-
const pagePath =
|
|
3876
|
+
const pagePath = path25.join(root, CONCEPTS_DIR, `${candidate.slug}.md`);
|
|
3257
3877
|
await atomicWrite(pagePath, candidate.body);
|
|
3258
3878
|
status("+", success(`Approved \u2192 ${source(pagePath)}`));
|
|
3259
3879
|
await persistCandidateSourceStates(root, candidate);
|
|
@@ -3313,7 +3933,7 @@ import { McpServer as McpServer2 } from "@modelcontextprotocol/sdk/server/mcp.js
|
|
|
3313
3933
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
3314
3934
|
|
|
3315
3935
|
// src/mcp/tools.ts
|
|
3316
|
-
import
|
|
3936
|
+
import path26 from "path";
|
|
3317
3937
|
import { z } from "zod";
|
|
3318
3938
|
|
|
3319
3939
|
// src/mcp/provider-check.ts
|
|
@@ -3406,15 +4026,16 @@ function registerQueryTool(server, root) {
|
|
|
3406
4026
|
"query_wiki",
|
|
3407
4027
|
{
|
|
3408
4028
|
title: "Query Wiki",
|
|
3409
|
-
description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Requires an LLM provider.",
|
|
4029
|
+
description: "Ask a natural-language question. Selects relevant pages with the LLM, loads them, and returns a grounded answer with citations. Set save=true to persist the answer as a wiki page. Set debug=true to include the selected chunks and their scores. Requires an LLM provider.",
|
|
3410
4030
|
inputSchema: {
|
|
3411
4031
|
question: z.string().describe("The natural-language question to answer."),
|
|
3412
|
-
save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true.")
|
|
4032
|
+
save: z.boolean().optional().describe("Persist the answer as a wiki/queries/ page when true."),
|
|
4033
|
+
debug: z.boolean().optional().describe("Include retrieval debug info (selected chunks/pages + scores).")
|
|
3413
4034
|
}
|
|
3414
4035
|
},
|
|
3415
|
-
async ({ question, save }) => {
|
|
4036
|
+
async ({ question, save, debug }) => {
|
|
3416
4037
|
ensureProviderAvailable();
|
|
3417
|
-
const result = await generateAnswer(root, question, { save });
|
|
4038
|
+
const result = await generateAnswer(root, question, { save, debug });
|
|
3418
4039
|
return jsonResult(result);
|
|
3419
4040
|
}
|
|
3420
4041
|
);
|
|
@@ -3438,15 +4059,30 @@ function registerSearchTool(server, root) {
|
|
|
3438
4059
|
);
|
|
3439
4060
|
}
|
|
3440
4061
|
/**
 * Pick page slugs for a search question, trying three strategies in order and
 * returning the first non-empty result:
 *
 *  1. chunk-level retrieval, collapsed to unique slugs in rank order;
 *  2. page-level retrieval via `findRelevantPages`;
 *  3. `selectPages` over the full wiki index file.
 *
 * @param {string} root - Project root directory.
 * @param {string} question - The search question.
 * @returns {Promise<string[]>} Page identifiers from whichever strategy succeeded.
 */
async function pickSearchSlugs(root, question) {
  try {
    const chunks = await findRelevantChunks(root, question, CHUNK_TOP_K);
    if (chunks.length > 0) return dedupePreservingOrder(chunks.map((c) => c.chunk.slug));
  } catch {
    // Deliberately ignored: chunk retrieval is best-effort; fall through to
    // page-level retrieval.
  }

  try {
    const candidates = await findRelevantPages(root, question);
    if (candidates.length > 0) return candidates.map((c) => c.slug);
  } catch {
    // Deliberately ignored: page retrieval is best-effort; fall through to
    // index-based selection.
  }

  const indexContent = await safeReadFile(path26.join(root, INDEX_FILE));
  const { pages } = await selectPages(question, indexContent);
  return pages;
}
|
|
4076
|
+
/**
 * Remove duplicate slugs while keeping each slug's first-occurrence position.
 *
 * @param {Iterable<string>} slugs
 * @returns {string[]} A new array of the unique slugs in first-seen order.
 */
function dedupePreservingOrder(slugs) {
  // A Set iterates in insertion order, so spreading it yields first-seen order.
  return [...new Set(slugs)];
}
|
|
3450
4086
|
function registerReadTool(server, root) {
|
|
3451
4087
|
server.registerTool(
|
|
3452
4088
|
"read_page",
|
|
@@ -3492,8 +4128,8 @@ function registerStatusTool(server, root) {
|
|
|
3492
4128
|
);
|
|
3493
4129
|
}
|
|
3494
4130
|
async function collectStatus(root) {
|
|
3495
|
-
const concepts = await collectPageSummaries(
|
|
3496
|
-
const queries = await collectPageSummaries(
|
|
4131
|
+
const concepts = await collectPageSummaries(path26.join(root, CONCEPTS_DIR));
|
|
4132
|
+
const queries = await collectPageSummaries(path26.join(root, QUERIES_DIR));
|
|
3497
4133
|
const state = await readState(root);
|
|
3498
4134
|
const changes = await detectChanges(root, state);
|
|
3499
4135
|
const orphans = await findOrphanedSlugs(root);
|
|
@@ -3510,7 +4146,7 @@ async function collectStatus(root) {
|
|
|
3510
4146
|
};
|
|
3511
4147
|
}
|
|
3512
4148
|
/**
 * List the slugs of concept pages whose frontmatter marks them as orphaned.
 *
 * @param {string} root - Project root directory.
 * @returns {Promise<string[]>} Slugs of scanned pages with a truthy `meta.orphaned`.
 */
async function findOrphanedSlugs(root) {
  const scanned = await scanWikiPages(path26.join(root, CONCEPTS_DIR));
  const orphaned = [];
  for (const { slug, meta } of scanned) {
    if (meta.orphaned) orphaned.push(slug);
  }
  return orphaned;
}
|
|
3516
4152
|
async function loadPageRecords(root, slugs) {
|
|
@@ -3523,7 +4159,7 @@ async function loadPageRecords(root, slugs) {
|
|
|
3523
4159
|
}
|
|
3524
4160
|
async function readPage(root, slug) {
|
|
3525
4161
|
for (const dir of PAGE_DIRS2) {
|
|
3526
|
-
const content = await safeReadFile(
|
|
4162
|
+
const content = await safeReadFile(path26.join(root, dir, `${slug}.md`));
|
|
3527
4163
|
if (!content) continue;
|
|
3528
4164
|
const { meta, body } = parseFrontmatter(content);
|
|
3529
4165
|
if (meta.orphaned) continue;
|
|
@@ -3538,7 +4174,7 @@ async function readPage(root, slug) {
|
|
|
3538
4174
|
}
|
|
3539
4175
|
|
|
3540
4176
|
// src/mcp/resources.ts
|
|
3541
|
-
import
|
|
4177
|
+
import path27 from "path";
|
|
3542
4178
|
import { readdir as readdir9 } from "fs/promises";
|
|
3543
4179
|
import { ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
3544
4180
|
function jsonContent(uri, payload) {
|
|
@@ -3572,7 +4208,7 @@ function registerIndexResource(server, root) {
|
|
|
3572
4208
|
mimeType: "text/markdown"
|
|
3573
4209
|
},
|
|
3574
4210
|
async (uri) => {
|
|
3575
|
-
const content = await safeReadFile(
|
|
4211
|
+
const content = await safeReadFile(path27.join(root, INDEX_FILE));
|
|
3576
4212
|
return { contents: [markdownContent(uri, content)] };
|
|
3577
4213
|
}
|
|
3578
4214
|
);
|
|
@@ -3639,7 +4275,7 @@ function registerQueryResource(server, root) {
|
|
|
3639
4275
|
);
|
|
3640
4276
|
}
|
|
3641
4277
|
async function listSources(root) {
|
|
3642
|
-
const sourcesPath =
|
|
4278
|
+
const sourcesPath = path27.join(root, SOURCES_DIR);
|
|
3643
4279
|
let files;
|
|
3644
4280
|
try {
|
|
3645
4281
|
files = await readdir9(sourcesPath);
|
|
@@ -3648,14 +4284,14 @@ async function listSources(root) {
|
|
|
3648
4284
|
}
|
|
3649
4285
|
const records = [];
|
|
3650
4286
|
for (const file of files.filter((f) => f.endsWith(".md"))) {
|
|
3651
|
-
const content = await safeReadFile(
|
|
4287
|
+
const content = await safeReadFile(path27.join(sourcesPath, file));
|
|
3652
4288
|
const { meta } = parseFrontmatter(content);
|
|
3653
4289
|
records.push({ filename: file, ...meta });
|
|
3654
4290
|
}
|
|
3655
4291
|
return records;
|
|
3656
4292
|
}
|
|
3657
4293
|
async function loadPageWithMeta(root, dir, slug) {
|
|
3658
|
-
const filePath =
|
|
4294
|
+
const filePath = path27.join(root, dir, `${slug}.md`);
|
|
3659
4295
|
const content = await safeReadFile(filePath);
|
|
3660
4296
|
if (!content) {
|
|
3661
4297
|
throw new Error(`Page not found: ${dir}/${slug}.md`);
|
|
@@ -3664,7 +4300,7 @@ async function loadPageWithMeta(root, dir, slug) {
|
|
|
3664
4300
|
return { slug, meta, body: body.trim() };
|
|
3665
4301
|
}
|
|
3666
4302
|
async function listPagesUnder(root, dir, scheme) {
|
|
3667
|
-
const pagesPath =
|
|
4303
|
+
const pagesPath = path27.join(root, dir);
|
|
3668
4304
|
let files;
|
|
3669
4305
|
try {
|
|
3670
4306
|
files = await readdir9(pagesPath);
|
|
@@ -3748,7 +4384,7 @@ reviewCommand.command("reject <id>").description("Reject a candidate and archive
|
|
|
3748
4384
|
process.exit(1);
|
|
3749
4385
|
}
|
|
3750
4386
|
});
|
|
3751
|
-
program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").action(async (question, options) => {
|
|
4387
|
+
program.command("query <question>").description("Ask a question against the wiki").option("--save", "Save the answer as a wiki page").option("--debug", "Print which pages and chunks were selected and their scores").action(async (question, options) => {
|
|
3752
4388
|
try {
|
|
3753
4389
|
requireProvider();
|
|
3754
4390
|
await queryCommand(process.cwd(), question, options);
|