@bndynet/ragbox 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +765 -0
- package/README.zh-CN.md +774 -0
- package/dist/src/advanced.d.ts +13 -0
- package/dist/src/advanced.js +29 -0
- package/dist/src/cli.d.ts +2 -0
- package/dist/src/cli.js +1013 -0
- package/dist/src/config-file.d.ts +69 -0
- package/dist/src/config-file.js +246 -0
- package/dist/src/folder-index/config.d.ts +2 -0
- package/dist/src/folder-index/config.js +56 -0
- package/dist/src/folder-index/hash.d.ts +1 -0
- package/dist/src/folder-index/hash.js +14 -0
- package/dist/src/folder-index/indexer.d.ts +2 -0
- package/dist/src/folder-index/indexer.js +154 -0
- package/dist/src/folder-index/llm-client.d.ts +3 -0
- package/dist/src/folder-index/llm-client.js +45 -0
- package/dist/src/folder-index/manifest.d.ts +17 -0
- package/dist/src/folder-index/manifest.js +158 -0
- package/dist/src/folder-index/multi-query.d.ts +45 -0
- package/dist/src/folder-index/multi-query.js +109 -0
- package/dist/src/folder-index/pageindex-runner.d.ts +3 -0
- package/dist/src/folder-index/pageindex-runner.js +218 -0
- package/dist/src/folder-index/path-utils.d.ts +5 -0
- package/dist/src/folder-index/path-utils.js +33 -0
- package/dist/src/folder-index/query.d.ts +19 -0
- package/dist/src/folder-index/query.js +597 -0
- package/dist/src/folder-index/queue.d.ts +1 -0
- package/dist/src/folder-index/queue.js +18 -0
- package/dist/src/folder-index/root-tree.d.ts +3 -0
- package/dist/src/folder-index/root-tree.js +82 -0
- package/dist/src/folder-index/scan.d.ts +14 -0
- package/dist/src/folder-index/scan.js +152 -0
- package/dist/src/folder-index/types.d.ts +368 -0
- package/dist/src/folder-index/types.js +2 -0
- package/dist/src/folder-index/watch.d.ts +17 -0
- package/dist/src/folder-index/watch.js +550 -0
- package/dist/src/index.d.ts +6 -0
- package/dist/src/index.js +45 -0
- package/dist/src/sdk.d.ts +101 -0
- package/dist/src/sdk.js +352 -0
- package/dist/src/serve.d.ts +64 -0
- package/dist/src/serve.js +466 -0
- package/dist/src/setup-pageindex.d.ts +30 -0
- package/dist/src/setup-pageindex.js +184 -0
- package/package.json +43 -0
|
@@ -0,0 +1,597 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.QueryStageError = void 0;
|
|
7
|
+
exports.stripText = stripText;
|
|
8
|
+
exports.buildNodeMap = buildNodeMap;
|
|
9
|
+
exports.extractNodeTextFromMarkdown = extractNodeTextFromMarkdown;
|
|
10
|
+
exports.resolveQueryIndexLocation = resolveQueryIndexLocation;
|
|
11
|
+
exports.queryFolder = queryFolder;
|
|
12
|
+
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
13
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
14
|
+
const config_1 = require("./config");
|
|
15
|
+
const llm_client_1 = require("./llm-client");
|
|
16
|
+
const manifest_1 = require("./manifest");
|
|
17
|
+
/**
 * Error raised when one stage of the query pipeline fails.
 *
 * The message reads "Query failed during <stage>: <reason>" and the
 * underlying failure is kept on `cause`.
 */
class QueryStageError extends Error {
    stage;
    cause;
    /**
     * @param stage Label of the stage that failed; interpolated into the message.
     * @param error The underlying failure. Non-Error values are stringified for the message.
     */
    constructor(stage, error) {
        let reason;
        if (error instanceof Error) {
            reason = error.message;
        }
        else {
            reason = String(error);
        }
        super(`Query failed during ${stage}: ${reason}`);
        Object.assign(this, { name: "QueryStageError", stage, cause: error });
    }
}
|
|
28
|
+
exports.QueryStageError = QueryStageError;
|
|
29
|
+
/** Returns true only for non-null, non-array values whose typeof is "object". */
function isObject(value) {
    if (value === null || Array.isArray(value)) {
        return false;
    }
    return typeof value === "object";
}
|
|
32
|
+
/** Verbose logging is on when RAGBOX_VERBOSE or RAGBOX_E2E_VERBOSE is exactly "1". */
function isVerbose() {
    const { RAGBOX_VERBOSE, RAGBOX_E2E_VERBOSE } = process.env;
    return [RAGBOX_VERBOSE, RAGBOX_E2E_VERBOSE].includes("1");
}
|
|
35
|
+
/** Writes a "[ragbox] "-prefixed diagnostic line to stderr when verbose mode is enabled. */
function logVerbose(message) {
    if (!isVerbose()) {
        return;
    }
    console.error(`[ragbox] ${message}`);
}
|
|
40
|
+
/** Returns the number of bytes `value` occupies when encoded as UTF-8. */
function byteLength(value) {
    const encodedSize = Buffer.byteLength(value, "utf8");
    return encodedSize;
}
|
|
43
|
+
/**
 * Rough token estimate: one token per four characters of the trimmed text,
 * rounded up. Blank input counts as zero tokens.
 */
function estimateTokenCount(value) {
    const { length } = value.trim();
    if (length === 0) {
        return 0;
    }
    return Math.ceil(length / 4);
}
|
|
47
|
+
/**
 * Runs `task` and returns its result. Any failure is rethrown as a
 * QueryStageError tagged with `stage`; an error that already is a
 * QueryStageError is rethrown untouched so the innermost stage label wins.
 */
async function runQueryStage(stage, task) {
    try {
        const result = await task();
        return result;
    }
    catch (error) {
        throw error instanceof QueryStageError ? error : new QueryStageError(stage, error);
    }
}
|
|
58
|
+
/** Appends `failure` to `trace.failures`; does nothing when no trace is being collected. */
function addTraceFailure(trace, failure) {
    if (trace === null || trace === undefined) {
        return;
    }
    trace.failures.push(failure);
}
|
|
61
|
+
/**
 * Returns a deep copy of `value` with every property named "text" removed.
 *
 * Arrays are mapped element by element, plain objects are rebuilt key by key,
 * and any other value (primitives, null) is returned as is.
 *
 * The copy is built with Object.fromEntries so that every key, including a
 * JSON-supplied "__proto__", becomes an ordinary own property. Assigning
 * `copy["__proto__"] = x` would silently replace the copy's prototype.
 */
function stripText(value) {
    if (Array.isArray(value)) {
        return value.map((item) => stripText(item));
    }
    if (typeof value !== "object" || value === null) {
        return value;
    }
    const keptEntries = Object.entries(value)
        .filter(([key]) => key !== "text")
        .map(([key, nestedValue]) => [key, stripText(nestedValue)]);
    return Object.fromEntries(keptEntries);
}
|
|
77
|
+
/**
 * Returns the node identifier stored on `value`, checking the keys
 * "node_id", "nodeId" and "id" in that order. Only non-empty strings count;
 * returns undefined when none of the keys holds one.
 */
function getNodeId(value) {
    const matchingKey = ["node_id", "nodeId", "id"].find((key) => {
        const candidate = value[key];
        return typeof candidate === "string" && candidate.length > 0;
    });
    return matchingKey === undefined ? undefined : value[matchingKey];
}
|
|
86
|
+
/**
 * Walks `tree` depth-first (through arrays and nested objects) and returns a
 * Map from node id to the object carrying that id. When an id occurs more
 * than once, the object visited last wins. Already-visited containers are
 * skipped, so shared or cyclic references are walked only once.
 */
function buildNodeMap(tree) {
    const nodesById = new Map();
    const visited = new Set();
    const walk = (current) => {
        if (!current || visited.has(current)) {
            return;
        }
        const isList = Array.isArray(current);
        if (!isList && !isObject(current)) {
            return;
        }
        visited.add(current);
        if (isList) {
            current.forEach((entry) => walk(entry));
            return;
        }
        const id = getNodeId(current);
        if (id) {
            nodesById.set(id, current);
        }
        Object.values(current)
            .filter((child) => typeof child === "object" && child !== null)
            .forEach((child) => walk(child));
    };
    walk(tree);
    return nodesById;
}
|
|
117
|
+
/**
 * Parses an LLM reply that is expected to contain a single JSON object.
 *
 * The whole reply is tried first. If that does not yield a plain object, the
 * span from the first "{" to the last "}" is tried, which covers replies
 * wrapped in prose, code fences or an enclosing array.
 *
 * @param raw The raw reply text.
 * @returns The parsed plain object (never null, never an array).
 * @throws {Error} When no JSON object can be recovered. Non-object JSON such
 *   as `null` is rejected here so callers can safely read properties off the
 *   result, and a malformed embedded span reports the offending reply instead
 *   of a bare SyntaxError.
 */
function parseJsonObject(raw) {
    const isPlainObject = (candidate) => typeof candidate === "object" && candidate !== null && !Array.isArray(candidate);
    const tryParse = (text) => {
        try {
            return JSON.parse(text);
        }
        catch {
            // Not valid JSON on its own; the caller decides what to try next.
            return undefined;
        }
    };
    const direct = tryParse(raw);
    if (isPlainObject(direct)) {
        return direct;
    }
    const start = raw.indexOf("{");
    const end = raw.lastIndexOf("}");
    if (start >= 0 && end > start) {
        const embedded = tryParse(raw.slice(start, end + 1));
        if (isPlainObject(embedded)) {
            return embedded;
        }
    }
    throw new Error(`Expected JSON object from LLM, got: ${raw}`);
}
|
|
130
|
+
/**
 * Produces a trimmed copy of a root-tree node for use in prompts: node_id,
 * type and title for every node, plus summary and path for document nodes.
 * Children are copied recursively; a node with no children gets no
 * `children` key.
 */
function lightweightRootTree(node) {
    const { node_id, type, title, children } = node;
    const slim = { node_id, type, title };
    if (type === "document") {
        Object.assign(slim, { summary: node.summary, path: node.path });
    }
    if (children?.length) {
        slim.children = children.map((child) => lightweightRootTree(child));
    }
    return slim;
}
|
|
145
|
+
/**
 * Collects every node of type "document" in the root tree into a Map keyed
 * by node_id. Nodes are visited in pre-order (parent before children,
 * children left to right).
 * Assumes `children`, when present, is an array — TODO confirm against the tree writer.
 */
function findDocumentNodes(rootTree) {
    const found = new Map();
    const pending = [rootTree];
    while (pending.length > 0) {
        const current = pending.pop();
        if (current.type === "document") {
            found.set(current.node_id, current);
        }
        const children = current.children ?? [];
        // Push in reverse so the leftmost child is popped (visited) first.
        for (let index = children.length - 1; index >= 0; index -= 1) {
            pending.push(children[index]);
        }
    }
    return found;
}
|
|
158
|
+
/** Returns the node's `text` trimmed, or undefined when it is missing, not a string, or blank. */
function extractNodeText(node) {
    const { text } = node;
    if (typeof text !== "string") {
        return undefined;
    }
    const trimmed = text.trim();
    return trimmed.length > 0 ? trimmed : undefined;
}
|
|
162
|
+
/**
 * Extracts distinct, lower-cased search terms from the question: runs of at
 * least six characters drawn from letters, digits and `_ . / : -`.
 *
 * Trailing "." and ":" are stripped before a term is kept, because they are
 * almost always sentence punctuation ("see config.json."). Leaving them on
 * would make the term fail a substring match against text that contains the
 * bare identifier. A term shorter than six characters after stripping is
 * dropped.
 *
 * @param question The user question.
 * @returns Unique terms in order of first appearance.
 */
function extractLexicalQueryTerms(question) {
    const MIN_TERM_LENGTH = 6;
    const candidates = question.match(/[A-Za-z0-9_./:-]{6,}/g) ?? [];
    const terms = new Set();
    for (const candidate of candidates) {
        const term = candidate.replace(/[.:]+$/, "").toLowerCase();
        if (term.length >= MIN_TERM_LENGTH) {
            terms.add(term);
        }
    }
    return [...terms];
}
|
|
170
|
+
/**
 * Returns the ids of all nodes in `tree` whose own text contains, compared
 * case-insensitively, at least one lexical term extracted from `question`.
 * The result is an empty Set when the question yields no terms.
 */
function findLexicalNodeMatches(tree, question) {
    const matchedIds = new Set();
    const terms = extractLexicalQueryTerms(question);
    if (terms.length === 0) {
        return matchedIds;
    }
    const visited = new Set();
    const walk = (current) => {
        if (!current || visited.has(current)) {
            return;
        }
        const isList = Array.isArray(current);
        if (!isList && !isObject(current)) {
            return;
        }
        visited.add(current);
        if (isList) {
            current.forEach((entry) => walk(entry));
            return;
        }
        const id = getNodeId(current);
        const haystack = extractNodeText(current)?.toLowerCase();
        if (id && haystack && terms.some((term) => haystack.includes(term))) {
            matchedIds.add(id);
        }
        Object.values(current)
            .filter((child) => typeof child === "object" && child !== null)
            .forEach((child) => walk(child));
    };
    walk(tree);
    return matchedIds;
}
|
|
206
|
+
/**
 * Reads a 1-based line number from `value`, checking the keys "line_num",
 * "lineNum" and "line" in that order.
 *
 * Numbers are floored and numeric strings are parsed base-10. A candidate is
 * accepted only if the resulting integer is finite and at least 1. The check
 * runs after flooring, so a value such as 0.5 is rejected and the remaining
 * keys are still consulted, rather than returning the invalid line 0.
 *
 * @returns The positive integer line number, or undefined when none is found.
 */
function getLineNumber(value) {
    for (const key of ["line_num", "lineNum", "line"]) {
        const raw = value[key];
        let candidate;
        if (typeof raw === "number") {
            candidate = Math.floor(raw);
        }
        else if (typeof raw === "string") {
            candidate = Number.parseInt(raw, 10);
        }
        else {
            continue;
        }
        if (Number.isFinite(candidate) && candidate > 0) {
            return candidate;
        }
    }
    return undefined;
}
|
|
221
|
+
/**
 * Gathers the line number of every object in `tree` that carries one and
 * returns the distinct values sorted ascending.
 */
function collectLineNumbers(tree) {
    const distinctLines = new Set();
    const visited = new Set();
    const walk = (current) => {
        if (!current || visited.has(current)) {
            return;
        }
        const isList = Array.isArray(current);
        if (!isList && !isObject(current)) {
            return;
        }
        visited.add(current);
        if (isList) {
            current.forEach((entry) => walk(entry));
            return;
        }
        const line = getLineNumber(current);
        if (line) {
            distinctLines.add(line);
        }
        Object.values(current)
            .filter((child) => typeof child === "object" && child !== null)
            .forEach((child) => walk(child));
    };
    walk(tree);
    const ordered = [...distinctLines];
    ordered.sort((left, right) => left - right);
    return ordered;
}
|
|
252
|
+
/**
 * Recovers a node's text from the source markdown using line numbers.
 *
 * The slice starts at the node's own line and stops just before the next
 * larger line number found anywhere in `tree`, or at the end of the file
 * when there is none. At least the node's own line is always included.
 *
 * @returns The trimmed slice, or undefined when the node has no line number
 *   or the slice is blank.
 */
function extractNodeTextFromMarkdown(node, tree, markdown) {
    const startLine = getLineNumber(node);
    if (!startLine) {
        return undefined;
    }
    const lines = markdown.split(/\r?\n/);
    const laterLines = collectLineNumbers(tree).filter((lineNumber) => lineNumber > startLine);
    const nextLine = laterLines.length > 0 ? laterLines[0] : lines.length + 1;
    const endExclusive = Math.max(startLine, nextLine - 1);
    const section = lines.slice(startLine - 1, endExclusive).join("\n").trim();
    return section === "" ? undefined : section;
}
|
|
262
|
+
/**
 * Reads the source file for a manifest record as UTF-8 text. Tries
 * `rootDir/record.path` first, then `record.absolutePath`; returns undefined
 * when neither can be read.
 */
async function readSourceMarkdown(rootDir, record) {
    const candidates = [node_path_1.default.join(rootDir, record.path), record.absolutePath];
    let index = 0;
    while (index < candidates.length) {
        try {
            return await promises_1.default.readFile(candidates[index], "utf8");
        }
        catch {
            // Try the next path. The manifest may have been moved with the index.
            index += 1;
        }
    }
    return undefined;
}
|
|
274
|
+
/** Reads `filePath` as UTF-8 and returns the parsed JSON value. Read and parse errors propagate. */
async function readJson(filePath) {
    const contents = await promises_1.default.readFile(filePath, "utf8");
    return JSON.parse(contents);
}
|
|
277
|
+
/** Resolves to true when `filePath` can be accessed, false when the access check fails for any reason. */
async function pathExists(filePath) {
    let accessible = true;
    try {
        await promises_1.default.access(filePath);
    }
    catch {
        accessible = false;
    }
    return accessible;
}
|
|
286
|
+
/**
 * Reports whether `outputDir` contains both the manifest file and the
 * root-tree file. The root-tree file is checked only if the manifest exists.
 */
async function hasQueryIndexFiles(outputDir) {
    for (const fileName of [manifest_1.MANIFEST_FILE, manifest_1.ROOT_TREE_FILE]) {
        const present = await pathExists(node_path_1.default.join(outputDir, fileName));
        if (!present) {
            return false;
        }
    }
    return true;
}
|
|
289
|
+
/**
 * Builds the index location descriptor for `outputDir`.
 *
 * The manifest is read to pick up its recorded `rootDir`, which is resolved
 * to an absolute path. When the manifest records none, the caller-supplied
 * `rootDir` is used instead.
 */
async function readQueryIndexLocation(rootDir, outputDir) {
    const [manifestPath, rootTreePath] = [manifest_1.MANIFEST_FILE, manifest_1.ROOT_TREE_FILE]
        .map((fileName) => node_path_1.default.join(outputDir, fileName));
    const { rootDir: recordedRootDir } = await readJson(manifestPath);
    const effectiveRootDir = recordedRootDir ? node_path_1.default.resolve(recordedRootDir) : rootDir;
    return { rootDir: effectiveRootDir, outputDir, manifestPath, rootTreePath };
}
|
|
300
|
+
/**
 * Locates the query index for `target`.
 *
 * `target` may be an output directory that directly contains the index
 * files, or a docs folder whose PAGEINDEX_DIR subdirectory contains them.
 * The first candidate holding both files wins.
 *
 * @throws {Error} When neither candidate contains the index files.
 */
async function resolveQueryIndexLocation(target) {
    const resolvedTarget = node_path_1.default.resolve(target);
    const candidateOutputDirs = [
        resolvedTarget,
        node_path_1.default.join(resolvedTarget, manifest_1.PAGEINDEX_DIR)
    ];
    for (const outputDir of candidateOutputDirs) {
        if (await hasQueryIndexFiles(outputDir)) {
            return await readQueryIndexLocation(resolvedTarget, outputDir);
        }
    }
    throw new Error(`Expected a docs folder with ${manifest_1.PAGEINDEX_DIR}/${manifest_1.MANIFEST_FILE} and ${manifest_1.PAGEINDEX_DIR}/${manifest_1.ROOT_TREE_FILE}, or a ragbox output directory with ${manifest_1.MANIFEST_FILE} and ${manifest_1.ROOT_TREE_FILE}: ${target}`);
}
|
|
311
|
+
/**
 * Asks the LLM which documents in the root tree are most likely to answer
 * the question.
 *
 * @returns `ids` (the string entries of the reply's `documents` array, or an
 *   empty list when that is not an array), the raw reply, and the UTF-8 byte
 *   sizes of the prompt and the reply.
 */
async function selectDocuments(question, rootTree, options) {
    const promptLines = [
        "You are given a user question and a root documentation tree.",
        "Each document node has:",
        "- node_id",
        "- title",
        "- summary",
        "- path",
        "Select the documents most likely to contain the answer.",
        "Return only valid JSON:",
        "{",
        '"documents": ["node_id_1", "node_id_2"]',
        "}",
        "User question:",
        `${question}`,
        "Root tree:",
        `${JSON.stringify(lightweightRootTree(rootTree), null, 2)}`
    ];
    const prompt = promptLines.join("\n");
    const response = await (0, llm_client_1.chatCompletion)([{ role: "user", content: prompt }], options);
    const { documents } = parseJsonObject(response);
    const ids = Array.isArray(documents) ? documents.filter((id) => typeof id === "string") : [];
    return {
        ids,
        promptBytes: byteLength(prompt),
        rawResponse: response,
        responseBytes: byteLength(response)
    };
}
|
|
336
|
+
/**
 * Asks the LLM which nodes of a single document tree (passed without node
 * text) are most likely to answer the question.
 *
 * @returns `ids` (the string entries of the reply's `nodes` array, or an
 *   empty list when that is not an array), the raw reply, and the UTF-8 byte
 *   sizes of the prompt and the reply.
 */
async function selectPageIndexNodes(question, treeWithoutText, options) {
    const promptLines = [
        "You are given a user question and a document tree.",
        "Each node has:",
        "- node_id",
        "- title",
        "- summary",
        "- child nodes",
        "Select the nodes most likely to contain the answer.",
        "Return only valid JSON:",
        "{",
        '"nodes": ["node_id_1", "node_id_2"]',
        "}",
        "User question:",
        `${question}`,
        "Document tree:",
        `${JSON.stringify(treeWithoutText, null, 2)}`
    ];
    const prompt = promptLines.join("\n");
    const response = await (0, llm_client_1.chatCompletion)([{ role: "user", content: prompt }], options);
    const { nodes } = parseJsonObject(response);
    const ids = Array.isArray(nodes) ? nodes.filter((id) => typeof id === "string") : [];
    return {
        ids,
        promptBytes: byteLength(prompt),
        rawResponse: response,
        responseBytes: byteLength(response)
    };
}
|
|
361
|
+
/** Milliseconds elapsed between `startedAt` (a Date.now() timestamp) and now. */
function elapsedSince(startedAt) {
    const now = Date.now();
    return now - startedAt;
}
|
|
364
|
+
/**
 * Explains why a selected document cannot be queried, or returns undefined
 * when it can. Checks run in order and the first failing one decides:
 * missing root-tree node, missing manifest record, missing index path,
 * manifest status other than "ready".
 */
function documentSkipReason(documentNode, manifestRecord, indexPath) {
    // Predicates are thunks so `manifestRecord.status` is only read once the
    // earlier checks have established that the record exists.
    const checks = [
        [() => !documentNode, "missing_root_tree_document"],
        [() => !manifestRecord, "missing_manifest_record"],
        [() => !indexPath, "missing_index_path"],
        [() => manifestRecord.status !== "ready", "document_not_ready"]
    ];
    const firstFailure = checks.find(([fails]) => fails());
    return firstFailure?.[1];
}
|
|
379
|
+
/**
 * Answers `question` from the index found at `target`.
 *
 * Pipeline, each stage wrapped so failures surface as QueryStageError:
 *   1. resolve the index location, then read the manifest and root tree;
 *   2. ask the LLM which documents to consult;
 *   3. per available document: read its index, ask the LLM for relevant
 *      nodes, add nodes whose text lexically matches the question, and
 *      collect each node's text (falling back to slicing the source markdown);
 *   4. ask the LLM for the final answer using only the collected context.
 *
 * Unavailable documents, unknown nodes and nodes without text are recorded
 * as warnings (and as trace failures when `options.trace` is set) and skipped.
 *
 * Document ids returned by the planner are de-duplicated before processing,
 * as node ids already are. A repeated id would otherwise cause the same
 * document to be read and queried twice and its sources to be duplicated.
 *
 * @param target Docs folder or ragbox output directory.
 * @param question The user question.
 * @param options Forwarded to config loading and the LLM client; `trace`
 *   enables collection of per-stage diagnostics.
 * @returns The answer together with selections, sources, warnings, timings
 *   and, when requested, the trace.
 * @throws {QueryStageError} When a pipeline stage fails.
 */
async function queryFolder(target, question, options = {}) {
    const totalStartedAt = Date.now();
    const timings = {
        resolve: 0,
        selectDocuments: 0,
        selectNodes: 0,
        answer: 0,
        total: 0
    };
    const warnings = [];
    const selectedDocuments = [];
    const selectedNodes = [];
    const sources = [];
    const config = (0, config_1.loadPageIndexConfig)(options);
    const resolvedTarget = node_path_1.default.resolve(target);
    const trace = options.trace
        ? {
            version: 1,
            nodeSelections: [],
            context: {
                sourceCount: 0,
                bytes: 0,
                tokens: 0
            },
            failures: []
        }
        : undefined;
    logVerbose(`query resolve target=${resolvedTarget}`);
    const resolveStartedAt = Date.now();
    const location = await runQueryStage("resolve", async () => await resolveQueryIndexLocation(target));
    logVerbose(`query index root=${location.rootDir} output=${location.outputDir ?? "(default)"}`);
    const { manifest, rootTree } = await runQueryStage("read-index", async () => ({
        manifest: await readJson(location.manifestPath),
        rootTree: await readJson(location.rootTreePath)
    }));
    // The "resolve" timing covers both locating the index and reading it.
    timings.resolve = elapsedSince(resolveStartedAt);
    const documentNodes = findDocumentNodes(rootTree);
    const manifestByDocId = new Map(manifest.documents.map((record) => [record.docId, record]));
    logVerbose(`query select documents total=${manifest.documents.length}`);
    const selectDocumentsStartedAt = Date.now();
    const documentSelection = await runQueryStage("select-documents", async () => await selectDocuments(question, rootTree, options));
    // De-duplicate while keeping the planner's order; the raw reply stays in the trace.
    const selectedDocumentIds = [...new Set(documentSelection.ids)];
    if (trace) {
        trace.documentSelection = {
            promptBytes: documentSelection.promptBytes,
            responseBytes: documentSelection.responseBytes,
            rawResponse: documentSelection.rawResponse,
            selectedDocumentIds
        };
    }
    timings.selectDocuments = elapsedSince(selectDocumentsStartedAt);
    logVerbose(`query selected documents count=${selectedDocumentIds.length} ids=${selectedDocumentIds.join(",")}`);
    for (const docId of selectedDocumentIds) {
        const documentNode = documentNodes.get(docId);
        const manifestRecord = manifestByDocId.get(docId);
        const indexPath = documentNode?.index_path ?? manifestRecord?.indexPath;
        // A document is available exactly when there is no reason to skip it.
        const skipReason = documentSkipReason(documentNode, manifestRecord, indexPath);
        const available = skipReason === undefined;
        const selectedDocument = {
            docId,
            available,
            path: manifestRecord?.path ?? documentNode?.path,
            title: manifestRecord?.title ?? documentNode?.title,
            status: manifestRecord?.status,
            indexPath,
            selectionReason: "selected_by_document_planner"
        };
        if (skipReason) {
            selectedDocument.skipReason = skipReason;
        }
        selectedDocuments.push(selectedDocument);
        if (!available) {
            logVerbose(`query skip unavailable document id=${docId}`);
            warnings.push(`Selected document is unavailable: ${docId}`);
            addTraceFailure(trace, {
                stage: "read-document-index",
                code: skipReason ?? "document_unavailable",
                message: `Selected document is unavailable: ${docId}`,
                docId,
                path: manifestRecord?.path ?? documentNode?.path
            });
            continue;
        }
        logVerbose(`query read pageindex path=${manifestRecord.path}`);
        const pageIndexJson = await runQueryStage("read-document-index", async () => await readJson((0, manifest_1.resolveDocumentIndexPath)(location.rootDir, indexPath, location.outputDir)));
        logVerbose(`query select nodes path=${manifestRecord.path}`);
        const selectNodesStartedAt = Date.now();
        // The planner sees the tree without node text; text is only used for lexical matching.
        const treeWithoutText = stripText(pageIndexJson);
        const nodeSelection = await runQueryStage("select-nodes", async () => await selectPageIndexNodes(question, treeWithoutText, options));
        const lexicalNodeIds = findLexicalNodeMatches(pageIndexJson, question);
        const selectedNodeIds = [...new Set([...nodeSelection.ids, ...lexicalNodeIds])];
        trace?.nodeSelections.push({
            docId,
            path: manifestRecord.path,
            promptBytes: nodeSelection.promptBytes,
            responseBytes: nodeSelection.responseBytes,
            rawResponse: nodeSelection.rawResponse,
            selectedNodeIds
        });
        timings.selectNodes += elapsedSince(selectNodesStartedAt);
        logVerbose(`query selected nodes path=${manifestRecord.path} count=${selectedNodeIds.length} ids=${selectedNodeIds.join(",")}`);
        const nodeMap = buildNodeMap(pageIndexJson);
        // Loaded lazily, at most once per document, and only if some node lacks inline text.
        let sourceMarkdown;
        for (const nodeId of selectedNodeIds) {
            const node = nodeMap.get(nodeId);
            const reference = `${manifestRecord.path}#${nodeId}`;
            const selectedNode = {
                docId,
                path: manifestRecord.path,
                nodeId,
                found: Boolean(node),
                hasText: false,
                reference,
                selectionReason: nodeSelection.ids.includes(nodeId) ? "selected_by_node_planner" : "matched_query_text"
            };
            if (!node) {
                selectedNode.skipReason = "node_not_found";
                selectedNodes.push(selectedNode);
                warnings.push(`Selected node was not found: ${reference}`);
                addTraceFailure(trace, {
                    stage: "extract-context",
                    code: "node_not_found",
                    message: `Selected node was not found: ${reference}`,
                    docId,
                    nodeId,
                    path: manifestRecord.path,
                    reference
                });
                continue;
            }
            let text = extractNodeText(node);
            if (!text) {
                sourceMarkdown ??= await readSourceMarkdown(location.rootDir, manifestRecord);
                text = sourceMarkdown ? extractNodeTextFromMarkdown(node, pageIndexJson, sourceMarkdown) : undefined;
                if (text) {
                    logVerbose(`query fallback markdown text path=${manifestRecord.path} node=${nodeId}`);
                }
            }
            if (!text) {
                selectedNode.skipReason = "missing_text";
                selectedNodes.push(selectedNode);
                warnings.push(`Selected node has no extractable text: ${reference}`);
                addTraceFailure(trace, {
                    stage: "extract-context",
                    code: "missing_text",
                    message: `Selected node has no extractable text: ${reference}`,
                    docId,
                    nodeId,
                    path: manifestRecord.path,
                    reference
                });
                continue;
            }
            selectedNode.hasText = true;
            selectedNode.textBytes = byteLength(text);
            selectedNodes.push(selectedNode);
            sources.push({
                path: manifestRecord.path,
                nodeId,
                reference,
                text
            });
        }
    }
    if (sources.length === 0) {
        warnings.push("No relevant context was extracted from the selected index nodes.");
        addTraceFailure(trace, {
            stage: "extract-context",
            code: "empty_context",
            message: "No relevant context was extracted from the selected index nodes."
        });
    }
    const context = sources.length > 0 ? sources.map((source) => `Source: ${source.reference}\n${source.text}`).join("\n\n---\n\n") : "(no relevant context found)";
    const contextBytes = byteLength(context);
    const contextTokens = estimateTokenCount(context);
    if (trace) {
        trace.context = {
            sourceCount: sources.length,
            bytes: contextBytes,
            tokens: contextTokens
        };
    }
    logVerbose(`query final answer contextParts=${sources.length}`);
    const finalPrompt = `Answer the user question using only the provided context.
If the context is insufficient, say that you could not find enough information in the available documentation.
Do not expose implementation details about how the documentation was found or prepared.
Include source references using the file path and node_id when possible.
User question:
${question}
Context:
${context}`;
    const answerStartedAt = Date.now();
    const answer = await runQueryStage("answer", async () => await (0, llm_client_1.chatCompletion)([{ role: "user", content: finalPrompt }], options));
    if (trace) {
        trace.answer = {
            promptBytes: byteLength(finalPrompt),
            responseBytes: byteLength(answer)
        };
    }
    timings.answer = elapsedSince(answerStartedAt);
    timings.total = elapsedSince(totalStartedAt);
    return {
        version: 1,
        target: resolvedTarget,
        rootDir: location.rootDir,
        outputDir: location.outputDir ?? node_path_1.default.join(location.rootDir, manifest_1.PAGEINDEX_DIR),
        question,
        model: config.model,
        answer,
        contextBytes,
        contextTokens,
        selectedDocuments,
        selectedNodes,
        sources,
        warnings,
        timingsMs: timings,
        ...(trace ? { trace } : {})
    };
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Runs `worker` over every item with a bounded number of calls in flight.
 * Resolves with the results in the same order as `items`; rejects if any
 * worker call rejects.
 */
export declare function runWithConcurrency<T, R>(items: T[], concurrency: number, worker: (item: T, index: number) => Promise<R>): Promise<R[]>;
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.runWithConcurrency = runWithConcurrency;
|
|
4
|
+
/**
 * Runs `worker` over every item with at most `concurrency` calls in flight.
 *
 * @param items Inputs; each is passed to `worker` together with its index.
 * @param concurrency Desired parallelism. It is floored and clamped to at
 *   least 1. A value that is not a number (NaN, undefined) also falls back
 *   to 1: `Math.max(1, NaN)` is NaN, which would create zero workers and
 *   silently skip all the work.
 * @param worker Async function invoked as `worker(item, index)`.
 * @returns Results in the same order as `items`.
 * @throws Rejects with the first worker error. Once a worker has failed, no
 *   further items are started; calls already in flight run to completion.
 */
async function runWithConcurrency(items, concurrency, worker) {
    const requested = Math.floor(concurrency);
    const limit = Number.isNaN(requested) ? 1 : Math.max(1, requested);
    const results = new Array(items.length);
    let nextIndex = 0;
    let failed = false;
    async function runWorker() {
        while (!failed && nextIndex < items.length) {
            // Claim the index synchronously, before the first await, so no two runners share an item.
            const currentIndex = nextIndex;
            nextIndex += 1;
            try {
                results[currentIndex] = await worker(items[currentIndex], currentIndex);
            }
            catch (error) {
                // Tell the sibling runners to stop picking up new items, then propagate.
                failed = true;
                throw error;
            }
        }
    }
    const workers = Array.from({ length: Math.min(limit, items.length) }, () => runWorker());
    await Promise.all(workers);
    return results;
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.generateRootTree = generateRootTree;
|
|
7
|
+
exports.writeRootTree = writeRootTree;
|
|
8
|
+
const node_crypto_1 = require("node:crypto");
|
|
9
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
10
|
+
const manifest_1 = require("./manifest");
|
|
11
|
+
/**
 * Derives a directory node id of the form "dir:<sha1 hex>" from the
 * directory's relative path. An empty path is hashed as ".".
 */
function createDirectoryNodeId(relativePath) {
    const hashInput = relativePath ? relativePath : ".";
    const hasher = (0, node_crypto_1.createHash)("sha1");
    hasher.update(hashInput);
    const digest = hasher.digest("hex");
    return `dir:${digest}`;
}
|
|
15
|
+
/**
 * Sorts a tree in place, recursively: within each `children` list,
 * directories come before everything else, then entries are ordered by
 * title and, on equal titles, by path (a missing path sorts as "").
 */
function sortTree(node) {
    const { children } = node;
    if (!children) {
        return;
    }
    const directoryRank = (entry) => (entry.type === "directory" ? 0 : 1);
    children.sort((left, right) => {
        const rankDelta = directoryRank(left) - directoryRank(right);
        if (rankDelta !== 0) {
            return rankDelta;
        }
        const byTitle = left.title.localeCompare(right.title);
        if (byTitle !== 0) {
            return byTitle;
        }
        return (left.path ?? "").localeCompare(right.path ?? "");
    });
    children.forEach((child) => sortTree(child));
}
|
|
34
|
+
function generateRootTree(manifest) {
|
|
35
|
+
const rootTitle = node_path_1.default.basename(manifest.rootDir) || manifest.rootDir;
|
|
36
|
+
const root = {
|
|
37
|
+
node_id: "root",
|
|
38
|
+
type: "root",
|
|
39
|
+
title: rootTitle,
|
|
40
|
+
children: []
|
|
41
|
+
};
|
|
42
|
+
const directories = new Map([["", root]]);
|
|
43
|
+
for (const record of manifest.documents) {
|
|
44
|
+
if (record.status !== "ready") {
|
|
45
|
+
continue;
|
|
46
|
+
}
|
|
47
|
+
const parts = record.path.split("/");
|
|
48
|
+
let parent = root;
|
|
49
|
+
let currentRelativeDir = "";
|
|
50
|
+
for (const part of parts.slice(0, -1)) {
|
|
51
|
+
currentRelativeDir = currentRelativeDir ? `${currentRelativeDir}/${part}` : part;
|
|
52
|
+
let directoryNode = directories.get(currentRelativeDir);
|
|
53
|
+
if (!directoryNode) {
|
|
54
|
+
directoryNode = {
|
|
55
|
+
node_id: createDirectoryNodeId(currentRelativeDir),
|
|
56
|
+
type: "directory",
|
|
57
|
+
title: part,
|
|
58
|
+
path: currentRelativeDir,
|
|
59
|
+
children: []
|
|
60
|
+
};
|
|
61
|
+
directories.set(currentRelativeDir, directoryNode);
|
|
62
|
+
parent.children ??= [];
|
|
63
|
+
parent.children.push(directoryNode);
|
|
64
|
+
}
|
|
65
|
+
parent = directoryNode;
|
|
66
|
+
}
|
|
67
|
+
parent.children ??= [];
|
|
68
|
+
parent.children.push({
|
|
69
|
+
node_id: record.docId,
|
|
70
|
+
type: "document",
|
|
71
|
+
title: record.title,
|
|
72
|
+
summary: record.summary,
|
|
73
|
+
path: record.path,
|
|
74
|
+
index_path: record.indexPath
|
|
75
|
+
});
|
|
76
|
+
}
|
|
77
|
+
sortTree(root);
|
|
78
|
+
return root;
|
|
79
|
+
}
|
|
80
|
+
/** Writes `rootTree` as JSON to the root-tree file location derived from `rootDir` and `outputDir`. */
async function writeRootTree(rootDir, rootTree, outputDir) {
    const destination = (0, manifest_1.getPageIndexPath)(rootDir, manifest_1.ROOT_TREE_FILE, outputDir);
    await (0, manifest_1.atomicWriteJson)(destination, rootTree);
}
|