@bndynet/ragbox 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +765 -0
- package/README.zh-CN.md +774 -0
- package/dist/src/advanced.d.ts +13 -0
- package/dist/src/advanced.js +29 -0
- package/dist/src/cli.d.ts +2 -0
- package/dist/src/cli.js +1013 -0
- package/dist/src/config-file.d.ts +69 -0
- package/dist/src/config-file.js +246 -0
- package/dist/src/folder-index/config.d.ts +2 -0
- package/dist/src/folder-index/config.js +56 -0
- package/dist/src/folder-index/hash.d.ts +1 -0
- package/dist/src/folder-index/hash.js +14 -0
- package/dist/src/folder-index/indexer.d.ts +2 -0
- package/dist/src/folder-index/indexer.js +154 -0
- package/dist/src/folder-index/llm-client.d.ts +3 -0
- package/dist/src/folder-index/llm-client.js +45 -0
- package/dist/src/folder-index/manifest.d.ts +17 -0
- package/dist/src/folder-index/manifest.js +158 -0
- package/dist/src/folder-index/multi-query.d.ts +45 -0
- package/dist/src/folder-index/multi-query.js +109 -0
- package/dist/src/folder-index/pageindex-runner.d.ts +3 -0
- package/dist/src/folder-index/pageindex-runner.js +218 -0
- package/dist/src/folder-index/path-utils.d.ts +5 -0
- package/dist/src/folder-index/path-utils.js +33 -0
- package/dist/src/folder-index/query.d.ts +19 -0
- package/dist/src/folder-index/query.js +597 -0
- package/dist/src/folder-index/queue.d.ts +1 -0
- package/dist/src/folder-index/queue.js +18 -0
- package/dist/src/folder-index/root-tree.d.ts +3 -0
- package/dist/src/folder-index/root-tree.js +82 -0
- package/dist/src/folder-index/scan.d.ts +14 -0
- package/dist/src/folder-index/scan.js +152 -0
- package/dist/src/folder-index/types.d.ts +368 -0
- package/dist/src/folder-index/types.js +2 -0
- package/dist/src/folder-index/watch.d.ts +17 -0
- package/dist/src/folder-index/watch.js +550 -0
- package/dist/src/index.d.ts +6 -0
- package/dist/src/index.js +45 -0
- package/dist/src/sdk.d.ts +101 -0
- package/dist/src/sdk.js +352 -0
- package/dist/src/serve.d.ts +64 -0
- package/dist/src/serve.js +466 -0
- package/dist/src/setup-pageindex.d.ts +30 -0
- package/dist/src/setup-pageindex.js +184 -0
- package/package.json +43 -0
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript-emitted interop helper: reuses an existing `this.__importDefault` when present,
// otherwise returns the module unchanged if it is flagged `__esModule`, or wraps it as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.FILE_STATE_FILE = exports.ROOT_TREE_FILE = exports.MANIFEST_FILE = exports.INDEXES_DIR = exports.PAGEINDEX_DIR = void 0;
|
|
7
|
+
exports.createEmptyManifest = createEmptyManifest;
|
|
8
|
+
exports.resolvePageIndexDir = resolvePageIndexDir;
|
|
9
|
+
exports.getPageIndexPath = getPageIndexPath;
|
|
10
|
+
exports.resolveDocumentIndexPath = resolveDocumentIndexPath;
|
|
11
|
+
exports.readManifest = readManifest;
|
|
12
|
+
exports.diffManifest = diffManifest;
|
|
13
|
+
exports.recordFromScannedFile = recordFromScannedFile;
|
|
14
|
+
exports.atomicWriteJson = atomicWriteJson;
|
|
15
|
+
exports.writeManifest = writeManifest;
|
|
16
|
+
exports.writeFileState = writeFileState;
|
|
17
|
+
exports.removeDeletedIndexFiles = removeDeletedIndexFiles;
|
|
18
|
+
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
19
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
20
|
+
const path_utils_1 = require("./path-utils");
|
|
21
|
+
// Directory name (relative to the indexed root) used by resolvePageIndexDir when no outputDir is given.
exports.PAGEINDEX_DIR = ".pageindex";
// NOTE(review): not referenced in this module; presumably the sub-directory holding per-document indexes — confirm with callers.
exports.INDEXES_DIR = "indexes";
// File name of the manifest inside the page-index directory (see readManifest / writeManifest).
exports.MANIFEST_FILE = "manifest.json";
// NOTE(review): not referenced in this module; presumably the root tree file name — confirm with callers.
exports.ROOT_TREE_FILE = "root-tree.json";
// Location of the file-state snapshot, relative to the page-index directory (see writeFileState).
exports.FILE_STATE_FILE = node_path_1.default.join("state", "file-state.json");
|
|
26
|
+
function createEmptyManifest(rootDir) {
|
|
27
|
+
return {
|
|
28
|
+
version: 1,
|
|
29
|
+
rootDir: (0, path_utils_1.normalizeAbsolutePath)(rootDir),
|
|
30
|
+
generatedAt: new Date().toISOString(),
|
|
31
|
+
documents: []
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
/**
 * Resolves the absolute directory that holds the index output.
 *
 * @param rootDir Root directory of the indexed folder.
 * @param outputDir Optional explicit output directory; when null/undefined the
 *   default `<rootDir>/.pageindex` location is used.
 * @returns The absolute (resolved) directory path.
 */
function resolvePageIndexDir(rootDir, outputDir) {
    const chosenDir = outputDir ?? node_path_1.default.join(rootDir, exports.PAGEINDEX_DIR);
    return node_path_1.default.resolve(chosenDir);
}
|
|
37
|
+
function getPageIndexPath(rootDir, relativePath, outputDir) {
|
|
38
|
+
return node_path_1.default.join(resolvePageIndexDir(rootDir, outputDir), relativePath);
|
|
39
|
+
}
|
|
40
|
+
/**
 * Turns a document's stored `indexPath` into a filesystem path.
 *
 * - Absolute paths are returned untouched.
 * - Relative paths that are, or start with, the `.pageindex` directory are
 *   joined onto `rootDir`.
 * - Every other relative path is joined onto the resolved page-index directory.
 *
 * @param rootDir Root directory of the indexed folder.
 * @param indexPath Index path as recorded for the document.
 * @param outputDir Optional explicit page-index directory.
 * @returns The resolved index file path.
 */
function resolveDocumentIndexPath(rootDir, indexPath, outputDir) {
    if (node_path_1.default.isAbsolute(indexPath)) {
        return indexPath;
    }
    const relativeIndexPath = (0, path_utils_1.normalizeRelativePath)(indexPath);
    const pageIndexPrefix = `${exports.PAGEINDEX_DIR}/`;
    const alreadyUnderPageIndex = relativeIndexPath === exports.PAGEINDEX_DIR || relativeIndexPath.startsWith(pageIndexPrefix);
    // Paths that already carry the ".pageindex" prefix are anchored at the root itself.
    const baseDir = alreadyUnderPageIndex ? rootDir : resolvePageIndexDir(rootDir, outputDir);
    return node_path_1.default.join(baseDir, relativeIndexPath);
}
|
|
50
|
+
/**
 * Loads the manifest JSON from the page-index directory.
 *
 * @param rootDir Root directory of the indexed folder.
 * @param outputDir Optional explicit page-index directory.
 * @returns The parsed manifest with `documents` forced to an array, or a fresh
 *   empty manifest when the file does not exist (ENOENT).
 * @throws Any other read or parse error is rethrown unchanged.
 */
async function readManifest(rootDir, outputDir) {
    const manifestPath = getPageIndexPath(rootDir, exports.MANIFEST_FILE, outputDir);
    try {
        const parsed = JSON.parse(await promises_1.default.readFile(manifestPath, "utf8"));
        // A missing or malformed `documents` field degrades to an empty list.
        const documents = Array.isArray(parsed.documents) ? parsed.documents : [];
        return { ...parsed, documents };
    }
    catch (error) {
        if (error.code !== "ENOENT") {
            throw error;
        }
        return createEmptyManifest(rootDir);
    }
}
|
|
67
|
+
function diffManifest(previous, scannedFiles) {
|
|
68
|
+
const previousByPath = new Map(previous.documents.map((record) => [record.path, record]));
|
|
69
|
+
const scannedByPath = new Map(scannedFiles.map((file) => [file.path, file]));
|
|
70
|
+
const added = [];
|
|
71
|
+
const modified = [];
|
|
72
|
+
const retryFailed = [];
|
|
73
|
+
const unchanged = [];
|
|
74
|
+
const deleted = [];
|
|
75
|
+
for (const scannedFile of scannedFiles) {
|
|
76
|
+
const previousRecord = previousByPath.get(scannedFile.path);
|
|
77
|
+
if (!previousRecord) {
|
|
78
|
+
added.push(scannedFile);
|
|
79
|
+
continue;
|
|
80
|
+
}
|
|
81
|
+
if (previousRecord.status === "failed") {
|
|
82
|
+
retryFailed.push(scannedFile);
|
|
83
|
+
continue;
|
|
84
|
+
}
|
|
85
|
+
if (previousRecord.contentHash !== scannedFile.contentHash) {
|
|
86
|
+
modified.push(scannedFile);
|
|
87
|
+
continue;
|
|
88
|
+
}
|
|
89
|
+
unchanged.push(scannedFile);
|
|
90
|
+
}
|
|
91
|
+
for (const record of previous.documents) {
|
|
92
|
+
if (!scannedByPath.has(record.path)) {
|
|
93
|
+
deleted.push(record);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
return {
|
|
97
|
+
added,
|
|
98
|
+
modified,
|
|
99
|
+
retryFailed,
|
|
100
|
+
unchanged,
|
|
101
|
+
deleted,
|
|
102
|
+
toIndex: [...added, ...modified, ...retryFailed]
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
function recordFromScannedFile(scannedFile, fields = {}) {
|
|
106
|
+
return {
|
|
107
|
+
docId: scannedFile.docId,
|
|
108
|
+
path: scannedFile.path,
|
|
109
|
+
absolutePath: scannedFile.absolutePath,
|
|
110
|
+
contentHash: scannedFile.contentHash,
|
|
111
|
+
size: scannedFile.size,
|
|
112
|
+
mtimeMs: scannedFile.mtimeMs,
|
|
113
|
+
title: scannedFile.title,
|
|
114
|
+
indexPath: scannedFile.indexPath,
|
|
115
|
+
status: fields.status ?? "ready",
|
|
116
|
+
summary: fields.summary,
|
|
117
|
+
error: fields.error
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline) by
 * first writing a sibling temp file and then renaming it over `filePath`.
 *
 * The parent directory is created when missing. The temp file name carries the
 * pid, a timestamp and a random suffix so concurrent writers targeting the same
 * file never share a temp file. If writing or renaming fails, the temp file is
 * removed (best effort) before the original error is rethrown.
 *
 * @param filePath Destination file path.
 * @param value Value to serialise with `JSON.stringify`.
 * @throws Whatever `mkdir`, `writeFile` or `rename` reject with.
 */
async function atomicWriteJson(filePath, value) {
    await promises_1.default.mkdir(node_path_1.default.dirname(filePath), { recursive: true });
    const randomSuffix = Math.random().toString(36).slice(2);
    const tempPath = `${filePath}.${process.pid}.${Date.now()}.${randomSuffix}.tmp`;
    try {
        await promises_1.default.writeFile(tempPath, `${JSON.stringify(value, null, 2)}\n`, "utf8");
        await promises_1.default.rename(tempPath, filePath);
    }
    catch (error) {
        // Do not leave a stray temp file behind; cleanup problems must not mask the real failure.
        await promises_1.default.rm(tempPath, { force: true }).catch(() => undefined);
        throw error;
    }
}
|
|
126
|
+
/**
 * Persists the manifest to `manifest.json` inside the page-index directory.
 *
 * @param rootDir Root directory of the indexed folder.
 * @param manifest Manifest object to serialise.
 * @param outputDir Optional explicit page-index directory.
 */
async function writeManifest(rootDir, manifest, outputDir) {
    const manifestPath = getPageIndexPath(rootDir, exports.MANIFEST_FILE, outputDir);
    await atomicWriteJson(manifestPath, manifest);
}
|
|
129
|
+
/**
 * Writes the file-state snapshot derived from the manifest.
 *
 * The snapshot keeps the manifest's `generatedAt` and, per document, only the
 * tracking fields (paths, id, hash, size, mtime, index path, status, error).
 *
 * @param rootDir Root directory of the indexed folder.
 * @param manifest Manifest whose `documents` are projected into the snapshot.
 * @param outputDir Optional explicit page-index directory.
 */
async function writeFileState(rootDir, manifest, outputDir) {
    const toFileEntry = (record) => {
        const { path, absolutePath, docId, contentHash, size, mtimeMs, indexPath, status, error } = record;
        return { path, absolutePath, docId, contentHash, size, mtimeMs, indexPath, status, error };
    };
    const state = {
        version: 1,
        generatedAt: manifest.generatedAt,
        files: manifest.documents.map((record) => toFileEntry(record))
    };
    const statePath = getPageIndexPath(rootDir, exports.FILE_STATE_FILE, outputDir);
    await atomicWriteJson(statePath, state);
}
|
|
147
|
+
async function removeDeletedIndexFiles(rootDir, deletedRecords, outputDir) {
|
|
148
|
+
await Promise.all(deletedRecords.map(async (record) => {
|
|
149
|
+
try {
|
|
150
|
+
await promises_1.default.rm(resolveDocumentIndexPath(rootDir, record.indexPath, outputDir), { force: true });
|
|
151
|
+
}
|
|
152
|
+
catch (error) {
|
|
153
|
+
if (error.code !== "ENOENT") {
|
|
154
|
+
throw error;
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
}));
|
|
158
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { PageIndexOptions, QueryResult, QuerySource } from "./types";
|
|
2
|
+
/** One index to consult in a multi-source query. */
export type MultiQueryTarget = {
    /** Label for this source; it is copied onto each result as `source` and prefixes source references. */
    name: string;
    /** Value handed to `queryFolder` as its target argument. */
    target: string;
    /** Per-target options for `queryFolder`; the shared answer options are used when omitted. */
    options?: PageIndexOptions;
};
|
|
7
|
+
/** A source passage from one target, re-labelled for the combined result. */
export type MultiQuerySource = QuerySource & {
    /** Name of the target that produced this passage. */
    source: string;
    /** The reference as returned by the single-target query, before it was prefixed with the source name. */
    originalReference: string;
};
|
|
11
|
+
/** The complete single-target query result, tagged with the name of the target it came from. */
export type MultiQuerySourceResult = QueryResult & {
    /** Name of the queried target. */
    source: string;
};
|
|
14
|
+
/** Combined outcome of querying several indexes and synthesising one answer. */
export type MultiQueryResult = {
    version: 1;
    target: "multiple";
    /** Names of all targets, in the order they were passed in. */
    sourcesQueried: string[];
    question: string;
    /** Model name taken from the configuration loaded for the answer options. */
    model: string;
    /** Final answer returned by the chat completion over the combined prompt. */
    answer: string;
    /** UTF-8 byte length of the combined multi-source context text. */
    contextBytes: number;
    /** Rough token estimate for the context (trimmed character count divided by 4, rounded up). */
    contextTokens: number;
    /** Per-target results, in target order. */
    results: MultiQuerySourceResult[];
    /** All source passages across targets, with references prefixed by the source name. */
    sources: MultiQuerySource[];
    /** Warnings from every target, each prefixed with `[source]`. */
    warnings: string[];
    /** Wall-clock durations in milliseconds. */
    timingsMs: {
        /** Time spent running the per-target queries. */
        query: number;
        /** Time spent on the final chat completion. */
        answer: number;
        /** Time from the start of the call until the result object is built. */
        total: number;
    };
    /** Present only when the answer options have `trace` set. */
    trace?: {
        version: 1;
        context: {
            sourceCount: number;
            bytes: number;
            tokens: number;
        };
        answer: {
            /** UTF-8 byte length of the prompt sent for the final answer. */
            promptBytes: number;
            /** UTF-8 byte length of the returned answer. */
            responseBytes: number;
        };
        /** Always an empty array in the current implementation. */
        failures: [];
    };
};
|
|
45
|
+
/**
 * Queries every target with the same question, then asks the model for one answer synthesised across all sources.
 *
 * @param targets Indexes to query; must contain at least one entry.
 * @param question The user question.
 * @param answerOptions Options for the final answer; also the fallback for targets without their own options.
 * @throws Error when `targets` is empty.
 */
export declare function queryMultipleIndexes(targets: MultiQueryTarget[], question: string, answerOptions?: PageIndexOptions): Promise<MultiQueryResult>;
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.queryMultipleIndexes = queryMultipleIndexes;
|
|
4
|
+
const config_1 = require("./config");
|
|
5
|
+
const llm_client_1 = require("./llm-client");
|
|
6
|
+
const query_1 = require("./query");
|
|
7
|
+
/**
 * Measures how many bytes `value` occupies when encoded as UTF-8.
 *
 * @param value Text to measure.
 * @returns The encoded size in bytes.
 */
function byteLength(value) {
    const encodedSize = Buffer.byteLength(value, "utf8");
    return encodedSize;
}
|
|
10
|
+
/**
 * Gives a rough token estimate: one token per four characters of the trimmed text, rounded up.
 *
 * @param value Text to estimate.
 * @returns 0 for empty or whitespace-only text, otherwise the rounded-up estimate.
 */
function estimateTokenCount(value) {
    const charCount = value.trim().length;
    if (charCount === 0) {
        return 0;
    }
    return Math.ceil(charCount / 4);
}
|
|
14
|
+
/**
 * Computes the milliseconds that have passed since `startedAt`.
 *
 * @param startedAt Start time as a `Date.now()` value.
 * @returns Current time minus `startedAt`.
 */
function elapsedSince(startedAt) {
    const now = Date.now();
    return now - startedAt;
}
|
|
17
|
+
function sourceReference(source, reference) {
|
|
18
|
+
return `${source}:${reference}`;
|
|
19
|
+
}
|
|
20
|
+
function answerContext(results) {
|
|
21
|
+
const parts = results.flatMap((result) => result.sources.map((source) => `Source: ${sourceReference(result.source, source.reference)}\n${source.text}`));
|
|
22
|
+
return parts.length > 0 ? parts.join("\n\n---\n\n") : "(no relevant context found)";
|
|
23
|
+
}
|
|
24
|
+
/**
 * Renders every per-source draft answer (and its warnings, if any) as text.
 *
 * @param results Per-target results, each with `source`, `answer` and `warnings`.
 * @returns One section per result, separated by a "---" rule; empty string for no results.
 */
function sourceAnswerSummary(results) {
    const sections = results.map(({ source, answer, warnings }) => {
        let section = `Source: ${source}\nAnswer:\n${answer}`;
        if (warnings.length > 0) {
            const bullets = warnings.map((warning) => `- ${warning}`).join("\n");
            section += `\nWarnings:\n${bullets}`;
        }
        return section;
    });
    return sections.join("\n\n---\n\n");
}
|
|
32
|
+
async function queryMultipleIndexes(targets, question, answerOptions = {}) {
|
|
33
|
+
if (targets.length === 0) {
|
|
34
|
+
throw new Error("At least one query source is required.");
|
|
35
|
+
}
|
|
36
|
+
const totalStartedAt = Date.now();
|
|
37
|
+
const queryStartedAt = Date.now();
|
|
38
|
+
const results = [];
|
|
39
|
+
for (const target of targets) {
|
|
40
|
+
results.push({
|
|
41
|
+
...(await (0, query_1.queryFolder)(target.target, question, target.options ?? answerOptions)),
|
|
42
|
+
source: target.name
|
|
43
|
+
});
|
|
44
|
+
}
|
|
45
|
+
const queryMs = elapsedSince(queryStartedAt);
|
|
46
|
+
const warnings = results.flatMap((result) => result.warnings.map((warning) => `[${result.source}] ${warning}`));
|
|
47
|
+
const sources = results.flatMap((result) => result.sources.map((source) => ({
|
|
48
|
+
...source,
|
|
49
|
+
source: result.source,
|
|
50
|
+
originalReference: source.reference,
|
|
51
|
+
reference: sourceReference(result.source, source.reference)
|
|
52
|
+
})));
|
|
53
|
+
const prompt = `Answer the user question using only the provided multi-source context.
|
|
54
|
+
Synthesize across sources when they complement each other.
|
|
55
|
+
If sources conflict, call that out briefly.
|
|
56
|
+
If the context is insufficient, say that you could not find enough information in the available documentation.
|
|
57
|
+
Do not expose implementation details about how the documentation was found or prepared.
|
|
58
|
+
Use source references in the form source:path#node_id when possible.
|
|
59
|
+
|
|
60
|
+
User question:
|
|
61
|
+
${question}
|
|
62
|
+
|
|
63
|
+
Per-source draft answers:
|
|
64
|
+
${sourceAnswerSummary(results)}
|
|
65
|
+
|
|
66
|
+
Multi-source context:
|
|
67
|
+
${answerContext(results)}`;
|
|
68
|
+
const context = answerContext(results);
|
|
69
|
+
const contextBytes = byteLength(context);
|
|
70
|
+
const contextTokens = estimateTokenCount(context);
|
|
71
|
+
const answerStartedAt = Date.now();
|
|
72
|
+
const answer = await (0, llm_client_1.chatCompletion)([{ role: "user", content: prompt }], answerOptions);
|
|
73
|
+
const answerMs = elapsedSince(answerStartedAt);
|
|
74
|
+
return {
|
|
75
|
+
version: 1,
|
|
76
|
+
target: "multiple",
|
|
77
|
+
sourcesQueried: targets.map((target) => target.name),
|
|
78
|
+
question,
|
|
79
|
+
model: (0, config_1.loadPageIndexConfig)(answerOptions).model,
|
|
80
|
+
answer,
|
|
81
|
+
contextBytes,
|
|
82
|
+
contextTokens,
|
|
83
|
+
results,
|
|
84
|
+
sources,
|
|
85
|
+
warnings,
|
|
86
|
+
timingsMs: {
|
|
87
|
+
query: queryMs,
|
|
88
|
+
answer: answerMs,
|
|
89
|
+
total: elapsedSince(totalStartedAt)
|
|
90
|
+
},
|
|
91
|
+
...(answerOptions.trace
|
|
92
|
+
? {
|
|
93
|
+
trace: {
|
|
94
|
+
version: 1,
|
|
95
|
+
context: {
|
|
96
|
+
sourceCount: sources.length,
|
|
97
|
+
bytes: contextBytes,
|
|
98
|
+
tokens: contextTokens
|
|
99
|
+
},
|
|
100
|
+
answer: {
|
|
101
|
+
promptBytes: byteLength(prompt),
|
|
102
|
+
responseBytes: byteLength(answer)
|
|
103
|
+
},
|
|
104
|
+
failures: []
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
: {})
|
|
108
|
+
};
|
|
109
|
+
}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript-emitted interop helper: reuses an existing `this.__importDefault` when present,
// otherwise returns the module unchanged if it is flagged `__esModule`, or wraps it as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.runPageIndex = runPageIndex;
|
|
7
|
+
exports.readPageIndexSummary = readPageIndexSummary;
|
|
8
|
+
const node_child_process_1 = require("node:child_process");
|
|
9
|
+
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
10
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
11
|
+
const node_os_1 = __importDefault(require("node:os"));
|
|
12
|
+
const config_1 = require("./config");
|
|
13
|
+
const path_utils_1 = require("./path-utils");
|
|
14
|
+
// Upper bound, in string length, on the stdout/stderr text kept per stream (see appendCapturedOutput).
const MAX_CAPTURED_OUTPUT = 64 * 1024;
// CLI flags that runPageIndex always passes after the model argument.
const DEFAULT_MARKDOWN_ARGS = ["--if-add-node-text", "yes", "--if-add-node-id", "yes"];
// Module-level memory of (python, cli, output flag) combinations whose output flag the CLI rejected,
// so runPageIndex skips the flag for the same combination on later calls.
const unsupportedOutputArgs = new Set();
|
|
17
|
+
class PageIndexRunError extends Error {
|
|
18
|
+
stdout;
|
|
19
|
+
stderr;
|
|
20
|
+
constructor(message, stdout, stderr) {
|
|
21
|
+
super(message);
|
|
22
|
+
this.stdout = stdout;
|
|
23
|
+
this.stderr = stderr;
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
/**
 * Appends a chunk of process output to the text captured so far, keeping only
 * the most recent `MAX_CAPTURED_OUTPUT` characters.
 *
 * @param current Text captured so far.
 * @param chunk New output chunk; decoded with `toString("utf8")`.
 * @returns The combined text, truncated from the front when over the limit.
 */
function appendCapturedOutput(current, chunk) {
    const combined = current + chunk.toString("utf8");
    const overflow = combined.length - MAX_CAPTURED_OUTPUT;
    if (overflow > 0) {
        // Drop the oldest characters so the tail of the output survives.
        return combined.slice(overflow);
    }
    return combined;
}
|
|
30
|
+
function pageIndexRunError(code, stdout, stderr) {
|
|
31
|
+
const trimmedStdout = stdout.trim();
|
|
32
|
+
const trimmedStderr = stderr.trim();
|
|
33
|
+
return new PageIndexRunError(`PageIndex failed with exit code ${code ?? "unknown"}\nSTDOUT:\n${trimmedStdout}\nSTDERR:\n${trimmedStderr}`, trimmedStdout, trimmedStderr);
|
|
34
|
+
}
|
|
35
|
+
function unsupportedOutputArgKey(pythonPath, cliPath, outputArg) {
|
|
36
|
+
return `${pythonPath}\0${cliPath}\0${outputArg}`;
|
|
37
|
+
}
|
|
38
|
+
function outputArgWasRejected(error, outputArg) {
|
|
39
|
+
if (!(error instanceof PageIndexRunError)) {
|
|
40
|
+
return false;
|
|
41
|
+
}
|
|
42
|
+
const output = `${error.stdout}\n${error.stderr}`;
|
|
43
|
+
return output.includes("unrecognized arguments") && output.includes(outputArg);
|
|
44
|
+
}
|
|
45
|
+
/**
 * Checks whether a file was modified at or after the given start time,
 * allowing one second of slack.
 *
 * @param filePath File to inspect.
 * @param startedAtMs Start time in epoch milliseconds.
 * @returns True when the file's mtime is no older than `startedAtMs - 1000`;
 *   false when it is older or the file cannot be stat'ed.
 */
async function fileUpdatedAfter(filePath, startedAtMs) {
    let stats;
    try {
        stats = await promises_1.default.stat(filePath);
    }
    catch {
        return false;
    }
    const earliestAccepted = startedAtMs - 1000;
    return stats.mtimeMs >= earliestAccepted;
}
|
|
54
|
+
/**
 * Recursively lists every `.json` file below a directory.
 *
 * Directories that cannot be read (including a missing root) contribute nothing.
 *
 * @param rootDir Directory to search.
 * @returns Paths of the JSON files in traversal order.
 */
async function findJsonFiles(rootDir) {
    const collect = async (directory) => {
        let entries;
        try {
            entries = await promises_1.default.readdir(directory, { withFileTypes: true });
        }
        catch {
            return [];
        }
        const found = [];
        for (const entry of entries) {
            const entryPath = node_path_1.default.join(directory, entry.name);
            if (entry.isDirectory()) {
                // Keep the files of a sub-directory at the position where it was listed.
                for (const nestedPath of await collect(entryPath)) {
                    found.push(nestedPath);
                }
                continue;
            }
            if (entry.isFile() && entry.name.endsWith(".json")) {
                found.push(entryPath);
            }
        }
        return found;
    };
    return collect(rootDir);
}
|
|
77
|
+
async function locatePageIndexResult(searchRoots, startedAtMs) {
|
|
78
|
+
const uniqueRoots = [...new Set(searchRoots)];
|
|
79
|
+
const candidates = [];
|
|
80
|
+
for (const root of uniqueRoots) {
|
|
81
|
+
const files = await findJsonFiles(root);
|
|
82
|
+
for (const filePath of files) {
|
|
83
|
+
const stat = await promises_1.default.stat(filePath);
|
|
84
|
+
if (stat.mtimeMs >= startedAtMs - 1000) {
|
|
85
|
+
candidates.push({ filePath, mtimeMs: stat.mtimeMs });
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
candidates.sort((left, right) => right.mtimeMs - left.mtimeMs);
|
|
90
|
+
return candidates[0]?.filePath;
|
|
91
|
+
}
|
|
92
|
+
/**
 * Runs the PageIndex CLI on one markdown file and makes sure its JSON result
 * ends up at `outputPath`.
 *
 * The CLI is started through the configured Python interpreter inside a fresh
 * temp directory. If the configured output flag is rejected as an unrecognized
 * argument, the run is repeated once without it and the combination is
 * remembered so later calls skip the flag. When the run did not refresh
 * `outputPath` itself, the newest JSON written since the run started is looked
 * up in a list of candidate `results` directories and copied to `outputPath`.
 *
 * @param inputPath Markdown file to index.
 * @param outputPath Where the resulting JSON must be stored.
 * @param options Options forwarded to `loadPageIndexConfig`.
 * @throws Error when no CLI path is configured, when the process cannot be
 *   spawned or exits non-zero, or when no generated JSON can be found.
 */
async function runPageIndex(inputPath, outputPath, options = {}) {
    const config = (0, config_1.loadPageIndexConfig)(options);
    if (!config.cliPath) {
        throw new Error("PAGEINDEX_CLI is required to run PageIndex");
    }
    const cliPath = node_path_1.default.resolve(config.cliPath);
    const absoluteInputPath = node_path_1.default.resolve(inputPath);
    const absoluteOutputPath = node_path_1.default.resolve(outputPath);
    // Prepare the output directory before the temp directory exists, so a failure here cannot leak a temp directory.
    await promises_1.default.mkdir(node_path_1.default.dirname(absoluteOutputPath), { recursive: true });
    const tempDir = await promises_1.default.mkdtemp(node_path_1.default.join(node_os_1.default.tmpdir(), "ragbox-"));
    try {
        // Executes the CLI once and resolves with the timestamp taken just before the spawn.
        const runOnce = async (outputArg) => {
            const startedAtMs = Date.now();
            const args = [cliPath, "--md_path", absoluteInputPath, "--model", config.model, ...DEFAULT_MARKDOWN_ARGS];
            if (outputArg) {
                args.push(outputArg, absoluteOutputPath);
            }
            if (config.extraArgs?.length) {
                args.push(...config.extraArgs);
            }
            await new Promise((resolve, reject) => {
                let stdout = "";
                let stderr = "";
                const child = (0, node_child_process_1.spawn)(config.pythonPath, args, {
                    cwd: tempDir,
                    env: {
                        ...process.env,
                        ...config.env,
                        OPENAI_BASE_URL: config.baseUrl,
                        OPENAI_API_KEY: config.apiKey ?? process.env.OPENAI_API_KEY ?? ""
                    }
                });
                child.stdout.on("data", (chunk) => {
                    stdout = appendCapturedOutput(stdout, chunk);
                });
                child.stderr.on("data", (chunk) => {
                    stderr = appendCapturedOutput(stderr, chunk);
                });
                child.on("error", reject);
                child.on("close", (code) => {
                    if (code === 0) {
                        resolve();
                        return;
                    }
                    reject(pageIndexRunError(code, stdout, stderr));
                });
            });
            return startedAtMs;
        };
        let outputArg = config.outputArg;
        const outputArgKey = outputArg ? unsupportedOutputArgKey(config.pythonPath, cliPath, outputArg) : undefined;
        // Skip a flag this interpreter/CLI pair is already known to reject.
        if (outputArgKey && unsupportedOutputArgs.has(outputArgKey)) {
            outputArg = undefined;
        }
        let startedAtMs;
        try {
            startedAtMs = await runOnce(outputArg);
        }
        catch (error) {
            // Only a rejected output flag justifies a second attempt.
            if (!outputArg || !outputArgKey || !outputArgWasRejected(error, outputArg)) {
                throw error;
            }
            unsupportedOutputArgs.add(outputArgKey);
            startedAtMs = await runOnce(undefined);
        }
        const cliDir = node_path_1.default.dirname(cliPath);
        const inputDir = node_path_1.default.dirname(absoluteInputPath);
        // The CLI wrote the requested output file itself: nothing left to do.
        if (await fileUpdatedAfter(absoluteOutputPath, startedAtMs)) {
            return;
        }
        const searchRoots = [
            node_path_1.default.join(tempDir, "results"),
            tempDir,
            node_path_1.default.join(cliDir, "results"),
            node_path_1.default.join(inputDir, "results"),
            node_path_1.default.join(process.cwd(), "results")
        ];
        const resultPath = await locatePageIndexResult(searchRoots, startedAtMs);
        if (!resultPath) {
            throw new Error("PageIndex completed but no generated JSON result was found");
        }
        if (node_path_1.default.resolve(resultPath) !== absoluteOutputPath) {
            await promises_1.default.copyFile(resultPath, absoluteOutputPath);
        }
    }
    finally {
        // Only delete the temp directory when it really lies under the OS temp dir.
        if ((0, path_utils_1.isSubPath)(node_os_1.default.tmpdir(), tempDir)) {
            await promises_1.default.rm(tempDir, { recursive: true, force: true });
        }
    }
}
|
|
183
|
+
/**
 * Searches a parsed index structure for the first non-blank `summary` string.
 *
 * Lookup order: the value's own `summary`, then its `root`, `tree` and
 * `document` properties, then each element of its `children` and `nodes`
 * arrays — every candidate is searched recursively before the next one.
 *
 * @param value Parsed JSON value.
 * @returns The trimmed summary, or undefined when none exists.
 */
function findSummary(value) {
    if (!value || typeof value !== "object") {
        return undefined;
    }
    const node = value;
    const ownSummary = typeof node.summary === "string" ? node.summary.trim() : "";
    if (ownSummary) {
        return ownSummary;
    }
    const wrappers = ["root", "tree", "document"].map((key) => node[key]);
    const listItems = ["children", "nodes"].flatMap((key) => (Array.isArray(node[key]) ? node[key] : []));
    for (const candidate of [...wrappers, ...listItems]) {
        const nestedSummary = findSummary(candidate);
        if (nestedSummary) {
            return nestedSummary;
        }
    }
    return undefined;
}
|
|
210
|
+
/**
 * Reads an index JSON file and extracts its summary.
 *
 * This is best effort: a missing file, unreadable file or invalid JSON all
 * result in undefined rather than an error.
 *
 * @param indexPath Path of the index JSON file.
 * @returns The summary found by `findSummary`, or undefined.
 */
async function readPageIndexSummary(indexPath) {
    try {
        const parsed = JSON.parse(await promises_1.default.readFile(indexPath, "utf8"));
        return findSummary(parsed);
    }
    catch {
        return undefined;
    }
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
/** Replaces every backslash in the path with a forward slash. */
export declare function toPosixPath(inputPath: string): string;
/**
 * Produces a forward-slash relative path without a leading "./", empty segments or "." segments.
 * When `rootDir` is given, `inputPath` is first made relative to it.
 */
export declare function normalizeRelativePath(inputPath: string, rootDir?: string): string;
/** Resolves the path to an absolute one and converts it to forward slashes. */
export declare function normalizeAbsolutePath(inputPath: string): string;
/** Reports whether `candidatePath` is `parentDir` itself or lies below it, judged by the relative path between them. */
export declare function isSubPath(parentDir: string, candidatePath: string): boolean;
/** Like `isSubPath`, but returns false when both paths are the same location. */
export declare function isStrictSubPath(parentDir: string, candidatePath: string): boolean;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// TypeScript-emitted interop helper: reuses an existing `this.__importDefault` when present,
// otherwise returns the module unchanged if it is flagged `__esModule`, or wraps it as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.toPosixPath = toPosixPath;
|
|
7
|
+
exports.normalizeRelativePath = normalizeRelativePath;
|
|
8
|
+
exports.normalizeAbsolutePath = normalizeAbsolutePath;
|
|
9
|
+
exports.isSubPath = isSubPath;
|
|
10
|
+
exports.isStrictSubPath = isStrictSubPath;
|
|
11
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
12
|
+
/**
 * Converts Windows-style separators to forward slashes.
 *
 * @param inputPath Path that may contain backslashes.
 * @returns The path with every backslash replaced by "/".
 */
function toPosixPath(inputPath) {
    return inputPath.split("\\").join("/");
}
|
|
15
|
+
function normalizeRelativePath(inputPath, rootDir) {
|
|
16
|
+
const relativePath = rootDir ? node_path_1.default.relative(rootDir, inputPath) : inputPath;
|
|
17
|
+
return toPosixPath(relativePath)
|
|
18
|
+
.replace(/^\.\//, "")
|
|
19
|
+
.split("/")
|
|
20
|
+
.filter((part) => part.length > 0 && part !== ".")
|
|
21
|
+
.join("/");
|
|
22
|
+
}
|
|
23
|
+
function normalizeAbsolutePath(inputPath) {
|
|
24
|
+
return toPosixPath(node_path_1.default.resolve(inputPath));
|
|
25
|
+
}
|
|
26
|
+
/**
 * Reports whether `candidatePath` is `parentDir` itself or lies below it.
 *
 * The decision is based on the relative path from parent to candidate: it
 * must not climb out of the parent (be ".." or start with a ".." segment)
 * and must not be absolute. Only a real ".." segment counts as climbing out,
 * so a child whose name merely begins with two dots (e.g. "..cache") is
 * still recognised as being inside the parent.
 *
 * @param parentDir Directory that should contain the candidate.
 * @param candidatePath Path to test.
 * @returns True when the candidate equals the parent or is inside it.
 */
function isSubPath(parentDir, candidatePath) {
    const relativePath = node_path_1.default.relative(parentDir, candidatePath);
    if (relativePath === "") {
        return true;
    }
    const climbsOut = relativePath === ".." || relativePath.startsWith(`..${node_path_1.default.sep}`);
    return !climbsOut && !node_path_1.default.isAbsolute(relativePath);
}
|
|
30
|
+
/**
 * Reports whether `candidatePath` lies strictly below `parentDir`.
 *
 * Identical locations yield false. Otherwise the relative path from parent to
 * candidate must not climb out of the parent (be ".." or start with a ".."
 * segment) and must not be absolute. Only a real ".." segment counts as
 * climbing out, so a child whose name merely begins with two dots
 * (e.g. "..cache") is still recognised as being inside the parent.
 *
 * @param parentDir Directory that should contain the candidate.
 * @param candidatePath Path to test.
 * @returns True when the candidate is inside the parent and not the parent itself.
 */
function isStrictSubPath(parentDir, candidatePath) {
    const relativePath = node_path_1.default.relative(parentDir, candidatePath);
    if (relativePath === "") {
        return false;
    }
    const climbsOut = relativePath === ".." || relativePath.startsWith(`..${node_path_1.default.sep}`);
    return !climbsOut && !node_path_1.default.isAbsolute(relativePath);
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { PageIndexOptions, QueryFailureStage, QueryResult } from "./types";
|
|
2
|
+
/** A JSON-like object whose property values are not yet narrowed. */
type JsonObject = Record<string, unknown>;
|
|
3
|
+
/**
 * Location of an index, as produced by `resolveQueryIndexLocation`.
 * NOTE(review): field meanings are inferred from their names; the implementation is not visible here — confirm.
 */
export type QueryIndexLocation = {
    /** Presumably the root directory of the indexed folder. */
    rootDir: string;
    /** Presumably an explicit index output directory, when one is in use. */
    outputDir?: string;
    /** Presumably the path of the manifest file. */
    manifestPath: string;
    /** Presumably the path of the root tree file. */
    rootTreePath: string;
};
|
|
9
|
+
/**
 * Error that records the query stage it is associated with, together with the underlying cause.
 * NOTE(review): how the message is derived from `error` is not visible here — confirm in the implementation.
 */
export declare class QueryStageError extends Error {
    /** Stage of the query associated with this error. */
    readonly stage: QueryFailureStage;
    /** NOTE(review): presumably the `error` passed to the constructor — confirm. */
    readonly cause: unknown;
    constructor(stage: QueryFailureStage, error: unknown);
}
|
|
14
|
+
/** NOTE(review): implementation not visible here; presumably returns a copy of `value` without node text fields — confirm. */
export declare function stripText<T>(value: T): T;
/** NOTE(review): implementation not visible here; presumably maps node ids to the nodes found in `tree` — confirm. */
export declare function buildNodeMap(tree: unknown): Map<string, JsonObject>;
/** NOTE(review): implementation not visible here; presumably extracts the text belonging to `node` from `markdown`, or undefined when it cannot — confirm. */
export declare function extractNodeTextFromMarkdown(node: JsonObject, tree: unknown, markdown: string): string | undefined;
/** NOTE(review): implementation not visible here; presumably resolves where the index for `target` is stored — confirm. */
export declare function resolveQueryIndexLocation(target: string): Promise<QueryIndexLocation>;
/**
 * Answers `question` against the index identified by `target`.
 * The multi-index query calls this once per target and reads `answer`, `sources` and `warnings` from the result.
 */
export declare function queryFolder(target: string, question: string, options?: PageIndexOptions): Promise<QueryResult>;
|
|
19
|
+
export {};
|