@bndynet/ragbox 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -21
- package/README.zh-CN.md +74 -21
- package/dist/src/cli.js +382 -14
- package/dist/src/config-file.d.ts +17 -1
- package/dist/src/config-file.js +49 -4
- package/dist/src/folder-index/config.js +7 -0
- package/dist/src/folder-index/indexer.js +48 -2
- package/dist/src/folder-index/pageindex-runner.d.ts +15 -0
- package/dist/src/folder-index/pageindex-runner.js +436 -0
- package/dist/src/folder-index/types.d.ts +2 -0
- package/dist/src/index.d.ts +1 -0
- package/dist/src/sdk.d.ts +2 -1
- package/dist/src/sdk.js +1 -0
- package/dist/src/serve.js +24 -15
- package/package.json +1 -1
|
@@ -25,11 +25,18 @@ function parseExtraArgs(value) {
|
|
|
25
25
|
const trimmed = value?.trim();
|
|
26
26
|
return trimmed ? trimmed.split(/\s+/) : undefined;
|
|
27
27
|
}
|
|
28
|
+
/**
 * Parses a PageIndex runner mode from an untrusted value such as the
 * PAGEINDEX_RUNNER environment variable.
 *
 * Surrounding whitespace and letter case are ignored, so a value like
 * " Batch " resolves to "batch" instead of being silently discarded.
 *
 * @param value Raw value; may be undefined or a non-string.
 * @returns "auto", "single" or "batch", or undefined when the value is not a
 *   recognised mode (the caller applies its own default).
 */
function parsePageIndexRunner(value) {
    if (typeof value !== "string") {
        return undefined;
    }
    const normalized = value.trim().toLowerCase();
    switch (normalized) {
        case "auto":
        case "single":
        case "batch":
            return normalized;
        default:
            return undefined;
    }
}
|
|
28
34
|
function loadPageIndexConfig(overrides = {}) {
|
|
29
35
|
const env = overrides.env ?? process.env;
|
|
30
36
|
return {
|
|
31
37
|
pythonPath: overrides.pythonPath ?? env.PAGEINDEX_PYTHON ?? "python3",
|
|
32
38
|
cliPath: overrides.cliPath ?? env.PAGEINDEX_CLI,
|
|
39
|
+
pageIndexRunner: overrides.pageIndexRunner ?? parsePageIndexRunner(env.PAGEINDEX_RUNNER) ?? "auto",
|
|
33
40
|
model: overrides.model ?? env.PAGEINDEX_MODEL ?? env.LLM_MODEL ?? "gpt-4o-mini",
|
|
34
41
|
baseUrl: overrides.baseUrl ?? env.OPENAI_BASE_URL ?? "https://api.openai.com/v1",
|
|
35
42
|
apiKey: overrides.apiKey ?? env.OPENAI_API_KEY,
|
|
@@ -73,7 +73,7 @@ async function indexFolder(folder, options = {}) {
|
|
|
73
73
|
});
|
|
74
74
|
await promises_1.default.mkdir(node_path_1.default.join(outputDir, manifest_1.INDEXES_DIR), { recursive: true });
|
|
75
75
|
await (0, manifest_1.removeDeletedIndexFiles)(rootDir, diff.deleted, config.outputDir);
|
|
76
|
-
|
|
76
|
+
async function indexOne(scannedFile, index) {
|
|
77
77
|
const absoluteOutputPath = (0, manifest_1.resolveDocumentIndexPath)(rootDir, scannedFile.indexPath, config.outputDir);
|
|
78
78
|
const progressIndex = index + 1;
|
|
79
79
|
const progressTotal = toIndex.length;
|
|
@@ -105,7 +105,53 @@ async function indexFolder(folder, options = {}) {
|
|
|
105
105
|
error: errorMessage(error)
|
|
106
106
|
});
|
|
107
107
|
}
|
|
108
|
-
}
|
|
108
|
+
}
|
|
109
|
+
/**
 * Indexes every file in `toIndex` through the PageIndex batch worker pool and
 * converts the per-job results into manifest records.
 *
 * An "index-start" progress event is emitted when the pool begins a job;
 * "index-done" / "index-failed" events are emitted after the pool has
 * finished, in the order of `toIndex`.
 *
 * A job that the pool reports as successful but whose index file cannot be
 * summarised is recorded as failed rather than aborting the whole run, which
 * mirrors how a failure is recorded for a single file.
 *
 * @returns One manifest record per entry of `toIndex`, in the same order.
 */
async function indexBatch() {
    const total = toIndex.length;
    const outputPaths = toIndex.map((scannedFile) => (0, manifest_1.resolveDocumentIndexPath)(rootDir, scannedFile.indexPath, config.outputDir));
    const jobs = toIndex.map((scannedFile, index) => ({
        inputPath: scannedFile.absolutePath,
        outputPath: outputPaths[index]
    }));
    const results = await (0, pageindex_runner_1.runPageIndexBatchPool)(jobs, config, {
        onJobStart: (_job, index) => {
            reportProgress(config, { type: "index-start", path: toIndex[index].path, index: index + 1, total });
        }
    });
    // Reports a failure and builds the "failed" record, keeping the summary
    // from the previous manifest entry (if any) so it is not lost.
    const failedRecord = (scannedFile, index, error) => {
        const previous = previousByPath.get(scannedFile.path);
        reportProgress(config, {
            type: "index-failed",
            path: scannedFile.path,
            index: index + 1,
            total,
            error
        });
        return (0, manifest_1.recordFromScannedFile)(scannedFile, {
            status: "failed",
            summary: previous?.summary,
            error
        });
    };
    const records = [];
    for (let index = 0; index < total; index += 1) {
        const scannedFile = toIndex[index];
        const result = results[index];
        if (!result) {
            // Defensive: the pool is expected to return one entry per job.
            records.push(failedRecord(scannedFile, index, "PageIndex batch runner returned no result"));
            continue;
        }
        if (!result.ok) {
            records.push(failedRecord(scannedFile, index, result.error));
            continue;
        }
        let summary;
        try {
            summary = await (0, pageindex_runner_1.readPageIndexSummary)(outputPaths[index]);
        }
        catch (error) {
            // An unreadable index file only fails this document.
            records.push(failedRecord(scannedFile, index, errorMessage(error)));
            continue;
        }
        reportProgress(config, {
            type: "index-done",
            path: scannedFile.path,
            index: index + 1,
            total,
            summary
        });
        records.push((0, manifest_1.recordFromScannedFile)(scannedFile, { status: "ready", summary }));
    }
    return records;
}
|
|
152
|
+
const indexedRecords = config.pageIndexRunner === "single" || toIndex.length < 2
|
|
153
|
+
? await (0, queue_1.runWithConcurrency)(toIndex, config.concurrency, indexOne)
|
|
154
|
+
: await indexBatch();
|
|
109
155
|
const indexedByPath = new Map(indexedRecords.map((record) => [record.path, record]));
|
|
110
156
|
const documents = [];
|
|
111
157
|
for (const scannedFile of scannedFiles) {
|
|
@@ -1,3 +1,18 @@
|
|
|
1
1
|
import { PageIndexOptions } from "./types";
|
|
2
|
+
/** A single unit of work for the PageIndex batch runner. */
export type PageIndexBatchJob = {
    /** Path of the document to index. */
    inputPath: string;
    /** Path where the resulting index JSON is written. */
    outputPath: string;
};
|
|
6
|
+
/**
 * Outcome of one batch job: the job's paths plus a discriminated `ok` flag.
 * When `ok` is false, `error` carries the failure message.
 */
export type PageIndexBatchResult = PageIndexBatchJob & ({
    ok: true;
} | {
    ok: false;
    error: string;
});
|
|
12
|
+
/** Optional hooks invoked by the batch runner while it processes jobs. */
type PageIndexBatchCallbacks = {
    /** Called with the job and its zero-based position in the submitted job list. */
    onJobStart?: (job: PageIndexBatchJob, index: number) => void;
};
|
|
15
|
+
/**
 * Runs PageIndex for a list of jobs using a pool of worker processes.
 *
 * @param jobs Input/output path pairs to process.
 * @param options PageIndex options (runner mode, CLI path, model, ...).
 * @param callbacks Optional hooks; `onJobStart` receives the job and its zero-based index.
 * @returns One result per job, in the same order as `jobs`; a failed job is
 *   reported with `ok: false` and an `error` message.
 */
export declare function runPageIndexBatchPool(jobs: PageIndexBatchJob[], options?: PageIndexOptions, callbacks?: PageIndexBatchCallbacks): Promise<PageIndexBatchResult[]>;
|
|
2
16
|
/**
 * Runs PageIndex for a single input file.
 *
 * @param inputPath Path of the document to index.
 * @param outputPath Path for the resulting index file.
 * @param options PageIndex options.
 * @returns A promise that resolves with no value.
 *   NOTE(review): failures presumably reject the promise — callers in this
 *   package wrap the call in try/catch; confirm against the implementation.
 */
export declare function runPageIndex(inputPath: string, outputPath: string, options?: PageIndexOptions): Promise<void>;
|
|
3
17
|
/**
 * Reads the summary stored in a PageIndex index file.
 *
 * @param indexPath Path of the index file to read.
 * @returns The summary string, or undefined.
 *   NOTE(review): the conditions that produce undefined (or a rejection) are
 *   not visible from this declaration — confirm against the implementation.
 */
export declare function readPageIndexSummary(indexPath: string): Promise<string | undefined>;
|
|
18
|
+
export {};
|
|
@@ -3,17 +3,78 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
3
3
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
4
|
};
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.runPageIndexBatchPool = runPageIndexBatchPool;
|
|
6
7
|
exports.runPageIndex = runPageIndex;
|
|
7
8
|
exports.readPageIndexSummary = readPageIndexSummary;
|
|
8
9
|
const node_child_process_1 = require("node:child_process");
|
|
9
10
|
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
10
11
|
const node_path_1 = __importDefault(require("node:path"));
|
|
11
12
|
const node_os_1 = __importDefault(require("node:os"));
|
|
13
|
+
const node_readline_1 = __importDefault(require("node:readline"));
|
|
12
14
|
const config_1 = require("./config");
|
|
13
15
|
const path_utils_1 = require("./path-utils");
|
|
14
16
|
const MAX_CAPTURED_OUTPUT = 64 * 1024;
|
|
15
17
|
const DEFAULT_MARKDOWN_ARGS = ["--if-add-node-text", "yes", "--if-add-node-id", "yes"];
|
|
16
18
|
const unsupportedOutputArgs = new Set();
|
|
19
|
+
// Python source for a long-lived PageIndex worker, passed to `python -u -c`.
//
// Protocol (newline-delimited JSON):
//   - The worker keeps the real stdout for protocol messages only; anything the
//     Python code prints is redirected to stderr.
//   - On startup it sends {"type": "ready"}, or {"type": "startup-error"} when
//     the pageindex imports fail (and then exits with code 0).
//   - Each stdin line is a request. {"type": "stop"} ends the loop. Any other
//     request is treated as a markdown indexing job and is answered with
//     {"type": "done", "id"} or {"type": "error", "id", "error"}.
//
// The error handler guards against a request that parsed to a non-object JSON
// value (for example `[]`), so that reporting the error cannot itself raise
// and terminate the worker.
const BATCH_WORKER_CODE = String.raw `
import asyncio
import json
import os
import sys
import traceback

_protocol_stdout = sys.stdout
sys.stdout = sys.stderr

def send(message):
    _protocol_stdout.write(json.dumps(message, ensure_ascii=False) + "\n")
    _protocol_stdout.flush()

try:
    from pageindex.page_index_md import md_to_tree
    from pageindex.utils import ConfigLoader
except Exception:
    send({"type": "startup-error", "error": traceback.format_exc()})
    raise SystemExit(0)

send({"type": "ready"})

for line in sys.stdin:
    request = {}
    try:
        request = json.loads(line)
        if request.get("type") == "stop":
            break
        request_id = request["id"]
        user_opt = {
            "model": request.get("model"),
            "if_add_node_summary": request.get("ifAddNodeSummary"),
            "if_add_doc_description": request.get("ifAddDocDescription"),
            "if_add_node_text": request.get("ifAddNodeText"),
            "if_add_node_id": request.get("ifAddNodeId"),
        }
        opt = ConfigLoader().load(user_opt)
        tree = asyncio.run(md_to_tree(
            md_path=request["inputPath"],
            if_thinning=bool(request.get("ifThinning", False)),
            min_token_threshold=int(request.get("thinningThreshold", 5000)),
            if_add_node_summary=opt.if_add_node_summary,
            summary_token_threshold=int(request.get("summaryTokenThreshold", 200)),
            model=opt.model,
            if_add_doc_description=opt.if_add_doc_description,
            if_add_node_text=opt.if_add_node_text,
            if_add_node_id=opt.if_add_node_id,
        ))
        output_path = request["outputPath"]
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(tree, f, indent=2, ensure_ascii=False)
        send({"type": "done", "id": request_id})
    except Exception:
        failed_id = request.get("id") if isinstance(request, dict) else None
        send({"type": "error", "id": failed_id, "error": traceback.format_exc()})
`;
|
|
17
78
|
class PageIndexRunError extends Error {
|
|
18
79
|
stdout;
|
|
19
80
|
stderr;
|
|
@@ -27,11 +88,264 @@ function appendCapturedOutput(current, chunk) {
|
|
|
27
88
|
const next = current + chunk.toString("utf8");
|
|
28
89
|
return next.length > MAX_CAPTURED_OUTPUT ? next.slice(-MAX_CAPTURED_OUTPUT) : next;
|
|
29
90
|
}
|
|
91
|
+
/**
 * Extracts a human-readable message from a thrown value.
 *
 * Besides real `Error` instances, error-like objects that carry a string
 * `message` (for example errors created in another realm) are supported, so
 * they no longer collapse to "[object Object]".
 *
 * @param error Any thrown value.
 * @returns The error's message, or `String(error)` for other values.
 */
function errorMessage(error) {
    if (error instanceof Error) {
        return error.message;
    }
    if (typeof error === "object" && error !== null && typeof error.message === "string") {
        return error.message;
    }
    return String(error);
}
|
|
30
94
|
/**
 * Builds the error for a failed PageIndex process.
 *
 * @param code Exit code of the process; "unknown" is shown when it is null or undefined.
 * @param stdout Captured standard output.
 * @param stderr Captured standard error.
 * @returns A PageIndexRunError whose message embeds both trimmed streams; the
 *   trimmed streams are also passed to the error constructor.
 */
function pageIndexRunError(code, stdout, stderr) {
    const [trimmedStdout, trimmedStderr] = [stdout, stderr].map((text) => text.trim());
    const message = `PageIndex failed with exit code ${code ?? "unknown"}\nSTDOUT:\n${trimmedStdout}\nSTDERR:\n${trimmedStderr}`;
    return new PageIndexRunError(message, trimmedStdout, trimmedStderr);
}
|
|
99
|
+
/**
 * Parses a command-line style integer argument strictly.
 *
 * Only an optional sign followed by decimal digits is accepted (surrounding
 * whitespace is ignored). Inputs with trailing garbage or a fractional part,
 * such as "12abc" or "3.7", are rejected instead of being truncated the way
 * `Number.parseInt` would.
 *
 * @param value Raw argument value; may be undefined.
 * @returns The parsed integer, or undefined when the value is missing or is
 *   not a valid (safe) integer.
 */
function parseIntegerArg(value) {
    if (value === undefined || value === null) {
        return undefined;
    }
    const text = String(value).trim();
    if (!/^[+-]?\d+$/.test(text)) {
        return undefined;
    }
    const parsed = Number(text);
    return Number.isSafeInteger(parsed) ? parsed : undefined;
}
|
|
106
|
+
/**
 * Translates PageIndex CLI-style extra arguments into the options object sent
 * to the batch worker.
 *
 * Every recognised flag takes exactly one value (`--flag value`). A flag is
 * reported as unsupported when it is unknown, when its value is missing, when
 * the next token is itself a `--flag` (so a flag can never swallow the
 * following flag as its value), or when an integer value does not parse.
 *
 * @param extraArgs Tokenised extra arguments; undefined is treated as empty.
 * @returns `args` — the parsed options, starting from the defaults below — and
 *   `unsupported` — the flags/tokens that could not be handled.
 */
function parseBatchMarkdownArgs(extraArgs) {
    const args = {
        ifAddNodeId: "yes",
        ifAddNodeText: "yes",
        ifThinning: false,
        summaryTokenThreshold: 200,
        thinningThreshold: 5000
    };
    const asIs = (raw) => raw;
    const asBoolean = (raw) => ["yes", "true", "1"].includes(String(raw).toLowerCase());
    // flag -> target field on `args` and the converter for its value
    // (a converter returning undefined marks the value as invalid).
    const flags = new Map([
        ["--if-thinning", { field: "ifThinning", parse: asBoolean }],
        ["--thinning-threshold", { field: "thinningThreshold", parse: parseIntegerArg }],
        ["--summary-token-threshold", { field: "summaryTokenThreshold", parse: parseIntegerArg }],
        ["--if-add-node-summary", { field: "ifAddNodeSummary", parse: asIs }],
        ["--if-add-doc-description", { field: "ifAddDocDescription", parse: asIs }],
        ["--if-add-node-text", { field: "ifAddNodeText", parse: asIs }],
        ["--if-add-node-id", { field: "ifAddNodeId", parse: asIs }]
    ]);
    const unsupported = [];
    const values = extraArgs ?? [];
    for (let index = 0; index < values.length; index += 1) {
        const key = values[index];
        const flag = flags.get(key);
        const raw = values[index + 1];
        const valueMissing = raw === undefined || (typeof raw === "string" && raw.startsWith("--"));
        if (!flag || valueMissing) {
            unsupported.push(key);
            continue;
        }
        const parsed = flag.parse(raw);
        if (parsed === undefined) {
            // Invalid value: do not consume it, so it is examined as its own token.
            unsupported.push(key);
            continue;
        }
        args[flag.field] = parsed;
        index += 1; // the value token has been consumed
    }
    return { args, unsupported };
}
|
|
187
|
+
/**
 * Builds the environment for a batch worker process.
 *
 * Later sources win: the current process environment, then `config.env`, then
 * the OpenAI base URL / API key from the config (the API key falls back to the
 * process environment and finally to an empty string). `cliDir` is prepended
 * to any existing PYTHONPATH.
 *
 * @param config Loaded PageIndex configuration.
 * @param cliDir Directory that contains the PageIndex CLI script.
 * @returns The environment object for the child process.
 */
function workerEnv(config, cliDir) {
    const merged = Object.assign({}, process.env, config.env, {
        OPENAI_BASE_URL: config.baseUrl,
        OPENAI_API_KEY: config.apiKey ?? process.env.OPENAI_API_KEY ?? ""
    });
    const searchPath = [];
    for (const entry of [cliDir, merged.PYTHONPATH]) {
        if (entry) {
            searchPath.push(entry);
        }
    }
    merged.PYTHONPATH = searchPath.join(node_path_1.default.delimiter);
    return merged;
}
|
|
200
|
+
/**
 * One long-lived Python worker process that runs PageIndex jobs over a
 * newline-delimited JSON protocol: requests are written to the child's stdin
 * and replies are read line by line from its stdout.
 *
 * Lifecycle: `start()` spawns the process and resolves once it reports
 * "ready"; `run()` sends one job and resolves with `{ ok: true }` or
 * `{ ok: false, error }`; `stop()` asks the process to exit and kills it if it
 * has not closed within one second.
 *
 * `run()` rejects (rather than resolving with `ok: false`) when the worker
 * itself is unusable: not running, the process exited, or stdin failed.
 */
class PageIndexBatchWorker {
    /** Numeric label used in error messages. */
    workerId;
    /** Loaded PageIndex configuration (python path, model, env, ...). */
    config;
    /** Directory of the PageIndex CLI; handed to workerEnv for the child environment. */
    cliDir;
    /** Working directory of the child process (created on start). */
    cwd;
    /** The spawned child process, set by start(). */
    child;
    /** True once the child's "close" event has fired. */
    closed = false;
    /** Resolves when the child's "close" event fires. */
    closedPromise;
    /** Id assigned to the next request. */
    nextId = 1;
    /** In-flight requests: id -> { resolve, reject }. */
    pending = new Map();
    /** Captured stderr plus non-protocol stdout lines (bounded by appendCapturedOutput). */
    stderr = "";
    constructor(workerId, config, cliDir, cwd) {
        this.workerId = workerId;
        this.config = config;
        this.cliDir = cliDir;
        this.cwd = cwd;
    }
    /**
     * Spawns the worker and waits for its startup handshake.
     *
     * @returns A promise that resolves on the "ready" message and rejects on a
     *   "startup-error" message, a spawn error, or an exit before "ready".
     */
    async start() {
        await promises_1.default.mkdir(this.cwd, { recursive: true });
        const child = (0, node_child_process_1.spawn)(this.config.pythonPath, ["-u", "-c", BATCH_WORKER_CODE], {
            cwd: this.cwd,
            env: workerEnv(this.config, this.cliDir)
        });
        this.child = child;
        this.closedPromise = new Promise((resolve) => {
            child.on("close", () => resolve());
        });
        child.stderr.on("data", (chunk) => {
            this.stderr = appendCapturedOutput(this.stderr, chunk);
        });
        // A write to a dead worker surfaces as an "error" event on stdin
        // (e.g. EPIPE). Without a listener that event becomes an uncaught
        // exception, so handle it and fail whatever is in flight.
        child.stdin.on("error", (error) => {
            this.rejectPending(error);
        });
        return await new Promise((resolve, reject) => {
            let readySettled = false;
            const reader = node_readline_1.default.createInterface({ input: child.stdout });
            // Settles the startup promise exactly once.
            const settleReady = (error) => {
                if (readySettled) {
                    return;
                }
                readySettled = true;
                if (error) {
                    reject(error);
                }
                else {
                    resolve();
                }
            };
            reader.on("line", (line) => {
                let message;
                try {
                    message = JSON.parse(line);
                }
                catch {
                    message = undefined;
                }
                if (typeof message !== "object" || message === null) {
                    // Not a protocol message (unparseable, or JSON such as
                    // `null`/`42`): keep the line as diagnostic output.
                    this.stderr = appendCapturedOutput(this.stderr, Buffer.from(`${line}\n`, "utf8"));
                    return;
                }
                if (message.type === "ready") {
                    settleReady();
                    return;
                }
                if (message.type === "startup-error") {
                    settleReady(new Error(typeof message.error === "string" ? message.error : "PageIndex worker failed to start"));
                    return;
                }
                const id = typeof message.id === "number" ? message.id : undefined;
                const pending = id === undefined ? undefined : this.pending.get(id);
                if (!pending) {
                    // Reply for an unknown or already-settled request.
                    return;
                }
                this.pending.delete(id);
                if (message.type === "done") {
                    pending.resolve({ ok: true });
                    return;
                }
                pending.resolve({
                    ok: false,
                    error: typeof message.error === "string" ? message.error : "PageIndex worker returned an unknown error"
                });
            });
            child.on("error", (error) => {
                settleReady(error);
                this.rejectPending(error);
            });
            child.on("close", (code) => {
                this.closed = true;
                const error = new Error(`PageIndex worker ${this.workerId} exited with code ${code ?? "unknown"}${this.stderr.trim() ? `\n${this.stderr.trim()}` : ""}`);
                settleReady(error);
                this.rejectPending(error);
            });
        });
    }
    /**
     * Sends one job to the worker.
     *
     * @param job Job with `inputPath` and `outputPath`.
     * @param markdownArgs Parsed markdown options forwarded in the request.
     * @returns `{ ok: true }` on success or `{ ok: false, error }` when the
     *   worker reports a job-level error.
     * @throws Error when the worker is not running; the promise also rejects
     *   when the request cannot be written or the worker exits meanwhile.
     */
    async run(job, markdownArgs) {
        if (!this.child || this.closed) {
            throw new Error(`PageIndex worker ${this.workerId} is not running`);
        }
        const id = this.nextId;
        this.nextId += 1;
        return await new Promise((resolve, reject) => {
            this.pending.set(id, { reject, resolve });
            const payload = {
                type: "run",
                id,
                inputPath: job.inputPath,
                outputPath: job.outputPath,
                model: this.config.model,
                ifAddDocDescription: markdownArgs.ifAddDocDescription,
                ifAddNodeId: markdownArgs.ifAddNodeId,
                ifAddNodeSummary: markdownArgs.ifAddNodeSummary,
                ifAddNodeText: markdownArgs.ifAddNodeText,
                ifThinning: markdownArgs.ifThinning,
                summaryTokenThreshold: markdownArgs.summaryTokenThreshold,
                thinningThreshold: markdownArgs.thinningThreshold
            };
            this.child?.stdin.write(`${JSON.stringify(payload)}\n`, (error) => {
                if (error) {
                    this.pending.delete(id);
                    reject(error);
                }
            });
        });
    }
    /**
     * Stops the worker: sends a "stop" request, closes stdin, and waits for
     * the process to close. If it has not closed after one second the process
     * is killed and this method returns without waiting further.
     */
    async stop() {
        if (!this.child || this.closed) {
            return;
        }
        try {
            this.child.stdin.write(`${JSON.stringify({ type: "stop" })}\n`);
            this.child.stdin.end();
        }
        catch {
            // Closing a failed worker is best-effort.
        }
        let killTimer;
        const forcedExit = new Promise((resolve) => {
            killTimer = setTimeout(() => {
                if (this.child && !this.closed) {
                    this.child.kill();
                }
                resolve();
            }, 1000);
        });
        try {
            await Promise.race([this.closedPromise, forcedExit]);
        }
        finally {
            // Do not leave the timer armed after a clean exit; a pending timer
            // keeps the event loop alive for up to a second.
            clearTimeout(killTimer);
        }
    }
    /** Rejects every in-flight request with `error` and clears the table. */
    rejectPending(error) {
        for (const pending of this.pending.values()) {
            pending.reject(error);
        }
        this.pending.clear();
    }
}
|
|
35
349
|
/**
 * Builds a lookup key from the python path, CLI path and output argument by
 * joining the three values with NUL characters.
 *
 * @returns The composite key string.
 */
function unsupportedOutputArgKey(pythonPath, cliPath, outputArg) {
    const separator = "\0";
    return "".concat(pythonPath, separator, cliPath, separator, outputArg);
}
|
|
@@ -89,6 +403,128 @@ async function locatePageIndexResult(searchRoots, startedAtMs) {
|
|
|
89
403
|
candidates.sort((left, right) => right.mtimeMs - left.mtimeMs);
|
|
90
404
|
return candidates[0]?.filePath;
|
|
91
405
|
}
|
|
406
|
+
/**
 * Returns a copy of the job list in which both paths of every job have been
 * resolved to absolute paths. Only `inputPath` and `outputPath` are carried
 * over to the new job objects.
 *
 * @param jobs Jobs with `inputPath` and `outputPath`.
 * @returns New job objects; the input array is not modified.
 */
function normalizeBatchJobs(jobs) {
    const toAbsolute = (filePath) => node_path_1.default.resolve(filePath);
    return jobs.map(({ inputPath, outputPath }) => {
        return {
            inputPath: toAbsolute(inputPath),
            outputPath: toAbsolute(outputPath)
        };
    });
}
|
|
412
|
+
function failedBatchResults(jobs, error, callbacks) {
|
|
413
|
+
return jobs.map((job, index) => {
|
|
414
|
+
callbacks.onJobStart?.(job, index);
|
|
415
|
+
return {
|
|
416
|
+
...job,
|
|
417
|
+
ok: false,
|
|
418
|
+
error
|
|
419
|
+
};
|
|
420
|
+
});
|
|
421
|
+
}
|
|
422
|
+
/**
 * Runs one job through `runPageIndex` and converts the outcome into a batch
 * result instead of letting a failure propagate.
 *
 * @param job Job with `inputPath` and `outputPath`.
 * @param options Options forwarded to `runPageIndex`.
 * @returns `{ ...job, ok: true }` on success, otherwise
 *   `{ ...job, ok: false, error }` with the failure's message.
 */
async function runPageIndexSingleFallback(job, options) {
    const { inputPath, outputPath } = job;
    let failure;
    let failed = false;
    try {
        await runPageIndex(inputPath, outputPath, options);
    }
    catch (error) {
        failed = true;
        failure = errorMessage(error);
    }
    if (failed) {
        return { ...job, ok: false, error: failure };
    }
    return { ...job, ok: true };
}
|
|
438
|
+
/**
 * Runs every job through the single-file runner, used when the batch pool
 * cannot be used.
 *
 * Jobs run with bounded parallelism taken from `options.concurrency` (at least
 * one lane, at most one per job), so falling back from the batch runner does
 * not degrade to strictly serial execution. Results keep the order of `jobs`
 * regardless of completion order.
 *
 * @param jobs Jobs to run.
 * @param options Loaded PageIndex configuration; forwarded to the single runner.
 * @param callbacks Callback object; `onJobStart` (optional) is invoked right
 *   before each job starts.
 * @returns One result per job, in job order.
 */
async function runAllSingleFallback(jobs, options, callbacks) {
    const results = new Array(jobs.length);
    const requested = Math.floor(Number(options?.concurrency));
    // A missing or invalid concurrency falls back to a single lane.
    const laneCount = Math.min(Number.isNaN(requested) ? 1 : Math.max(1, requested), jobs.length);
    let nextIndex = 0;
    // Each lane claims the next unclaimed job until the queue is empty.
    async function drain() {
        while (nextIndex < jobs.length) {
            const current = nextIndex;
            nextIndex += 1;
            callbacks?.onJobStart?.(jobs[current], current);
            results[current] = await runPageIndexSingleFallback(jobs[current], options);
        }
    }
    await Promise.all(Array.from({ length: laneCount }, () => drain()));
    return results;
}
|
|
446
|
+
/**
 * Runs PageIndex jobs on a pool of long-lived Python workers.
 *
 * Behaviour by runner mode (`config.pageIndexRunner`):
 * - "auto": whenever the batch path cannot be used (unsupported extra args,
 *   workers fail to start, or a worker breaks while running a job) the
 *   affected jobs are executed with the single-file runner instead.
 * - otherwise: the same situations are reported as failed results.
 *
 * Per-job problems are returned as `ok: false` results.
 *
 * In non-"auto" mode a worker whose process has exited stops claiming jobs, so
 * the remaining queue is left to the healthy workers instead of being failed
 * instantly by the dead one. Jobs that no worker could pick up are returned as
 * failed.
 *
 * @param jobs Jobs to run; paths are resolved to absolute paths.
 * @param options PageIndex options passed to `loadPageIndexConfig`.
 * @param callbacks Optional hooks; `onJobStart(job, index)` is invoked once per job.
 * @returns One result per job, in the order of `jobs` (empty for no jobs).
 */
async function runPageIndexBatchPool(jobs, options = {}, callbacks = {}) {
    const config = (0, config_1.loadPageIndexConfig)(options);
    const normalizedJobs = normalizeBatchJobs(jobs);
    if (normalizedJobs.length === 0) {
        return [];
    }
    if (!config.cliPath) {
        return failedBatchResults(normalizedJobs, "PAGEINDEX_CLI is required to run PageIndex", callbacks);
    }
    const autoFallback = config.pageIndexRunner === "auto";
    const parsedArgs = parseBatchMarkdownArgs(config.extraArgs);
    if (parsedArgs.unsupported.length > 0) {
        if (autoFallback) {
            return await runAllSingleFallback(normalizedJobs, config, callbacks);
        }
        return failedBatchResults(normalizedJobs, `PageIndex batch runner does not support extra args: ${parsedArgs.unsupported.join(", ")}`, callbacks);
    }
    const cliPath = node_path_1.default.resolve(config.cliPath);
    const cliDir = node_path_1.default.dirname(cliPath);
    // At least one worker, at most one per job. A missing or invalid
    // concurrency must not produce NaN, which would start no workers at all.
    const requestedWorkers = Math.floor(config.concurrency);
    const workerCount = Math.min(Number.isNaN(requestedWorkers) ? 1 : Math.max(1, requestedWorkers), normalizedJobs.length);
    const tempDir = await promises_1.default.mkdtemp(node_path_1.default.join(node_os_1.default.tmpdir(), "ragbox-batch-"));
    const workers = [];
    try {
        for (let index = 0; index < workerCount; index += 1) {
            const worker = new PageIndexBatchWorker(index + 1, config, cliDir, node_path_1.default.join(tempDir, `worker-${index + 1}`));
            workers.push(worker);
        }
        try {
            await Promise.all(workers.map((worker) => worker.start()));
        }
        catch (error) {
            await Promise.allSettled(workers.map((worker) => worker.stop()));
            if (autoFallback) {
                return await runAllSingleFallback(normalizedJobs, config, callbacks);
            }
            return failedBatchResults(normalizedJobs, errorMessage(error), callbacks);
        }
        const results = new Array(normalizedJobs.length);
        let nextIndex = 0;
        let lastWorkerError;
        // Each worker claims the next unclaimed job until the queue is empty.
        async function runWorkerLoop(worker) {
            while (nextIndex < normalizedJobs.length) {
                if (worker.closed && !autoFallback) {
                    // An exited worker cannot run anything; in "auto" mode it
                    // keeps going because each claimed job falls back to the
                    // single runner.
                    return;
                }
                const currentIndex = nextIndex;
                nextIndex += 1;
                const job = normalizedJobs[currentIndex];
                callbacks.onJobStart?.(job, currentIndex);
                try {
                    const result = await worker.run(job, parsedArgs.args);
                    results[currentIndex] = result.ok
                        ? { ...job, ok: true }
                        : { ...job, ok: false, error: result.error };
                }
                catch (error) {
                    // A rejection means the worker itself broke (as opposed to
                    // a job-level error reported by the worker).
                    if (autoFallback) {
                        results[currentIndex] = await runPageIndexSingleFallback(job, config);
                    }
                    else {
                        lastWorkerError = errorMessage(error);
                        results[currentIndex] = { ...job, ok: false, error: lastWorkerError };
                    }
                }
            }
        }
        await Promise.all(workers.map((worker) => runWorkerLoop(worker)));
        // Only reachable in non-"auto" mode when every worker exited before
        // the queue drained: report the jobs nobody could claim.
        for (let index = nextIndex; index < normalizedJobs.length; index += 1) {
            const job = normalizedJobs[index];
            callbacks.onJobStart?.(job, index);
            results[index] = {
                ...job,
                ok: false,
                error: lastWorkerError ?? "PageIndex batch workers exited before this job could run"
            };
        }
        return results;
    }
    finally {
        await Promise.allSettled(workers.map((worker) => worker.stop()));
        // Only delete the scratch directory when it really lives under the OS temp dir.
        if ((0, path_utils_1.isSubPath)(node_os_1.default.tmpdir(), tempDir)) {
            await promises_1.default.rm(tempDir, { recursive: true, force: true });
        }
    }
}
|
|
92
528
|
async function runPageIndex(inputPath, outputPath, options = {}) {
|
|
93
529
|
const config = (0, config_1.loadPageIndexConfig)(options);
|
|
94
530
|
if (!config.cliPath) {
|
|
@@ -39,9 +39,11 @@ export type LlmChatRequest = {
|
|
|
39
39
|
/** Client abstraction for issuing chat requests to a language model. */
export type LlmClient = {
    /** Sends a chat request and resolves with a string. */
    chatCompletion: (request: LlmChatRequest) => Promise<string>;
};
|
|
42
|
+
/**
 * Strategy used to run PageIndex over the files to index.
 *
 * - "single": one PageIndex invocation per file.
 * - "batch": files are sent to a pool of long-lived worker processes; when the
 *   pool cannot be used the affected files are reported as failed.
 * - "auto": like "batch", but falls back to the single runner when the pool
 *   cannot be used.
 */
export type PageIndexRunner = "auto" | "single" | "batch";
|
|
42
43
|
export type PageIndexOptions = {
|
|
43
44
|
pythonPath?: string;
|
|
44
45
|
cliPath?: string;
|
|
46
|
+
pageIndexRunner?: PageIndexRunner;
|
|
45
47
|
model?: string;
|
|
46
48
|
baseUrl?: string;
|
|
47
49
|
apiKey?: string;
|
package/dist/src/index.d.ts
CHANGED
|
@@ -3,4 +3,5 @@ export { startServe } from "./serve";
|
|
|
3
3
|
export type { RagboxConfig, RagboxConfigSource, RagboxIndexConfig, RagboxLlmConfig, RagboxPageIndexConfig } from "./config-file";
|
|
4
4
|
export type { CreateIndexOptions, CreateIndexResult, IndexCounts, InspectIndexDocument, InspectIndexResult, LlmChatRequest, LlmClient, QueryIndexOptions, QueryResult, SdkOptions, ValidateIndexResult, ValidationIssue, WatchIndexHandle, WatchIndexOptions, WatchIndexReadyResult } from "./sdk";
|
|
5
5
|
export type { ServeHandle, ServeHealthResult, ServeIndexesResult, ServeIndexSummary, ServeOptions } from "./serve";
|
|
6
|
+
export type { PageIndexRunner } from "./folder-index/types";
|
|
6
7
|
export * as advanced from "./advanced";
|
package/dist/src/sdk.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { DocumentRecord, IndexCounts, IndexProgressEvent, LlmClient, Manifest, QueryResult, RootTreeNode, WatchProgressEvent } from "./folder-index/types";
|
|
1
|
+
import { DocumentRecord, IndexCounts, IndexProgressEvent, LlmClient, Manifest, PageIndexRunner, QueryResult, RootTreeNode, WatchProgressEvent } from "./folder-index/types";
|
|
2
2
|
export type { IndexCounts, IndexProgressEvent, LlmChatRequest, LlmClient, QueryResult, WatchProgressEvent } from "./folder-index/types";
|
|
3
3
|
export type SdkOptions = {
|
|
4
4
|
apiKey?: string;
|
|
@@ -17,6 +17,7 @@ export type CreateIndexOptions = SdkOptions & {
|
|
|
17
17
|
pageIndexPython?: string;
|
|
18
18
|
pageIndexOutputArg?: string;
|
|
19
19
|
pageIndexExtraArgs?: string[];
|
|
20
|
+
pageIndexRunner?: PageIndexRunner;
|
|
20
21
|
concurrency?: number;
|
|
21
22
|
onProgress?: (event: IndexProgressEvent) => void;
|
|
22
23
|
};
|
package/dist/src/sdk.js
CHANGED
|
@@ -47,6 +47,7 @@ async function toPageIndexOptions(options = {}) {
|
|
|
47
47
|
model: options.model,
|
|
48
48
|
outputArg: createOptions.pageIndexOutputArg,
|
|
49
49
|
outputDir: options.outputDir,
|
|
50
|
+
pageIndexRunner: createOptions.pageIndexRunner,
|
|
50
51
|
progress: createOptions.onProgress,
|
|
51
52
|
pythonPath: createOptions.pageIndexPython,
|
|
52
53
|
trace: queryOptions.trace,
|
package/dist/src/serve.js
CHANGED
|
@@ -10,8 +10,6 @@ const config_file_1 = require("./config-file");
|
|
|
10
10
|
const multi_query_1 = require("./folder-index/multi-query");
|
|
11
11
|
const query_1 = require("./folder-index/query");
|
|
12
12
|
const sdk_1 = require("./sdk");
|
|
13
|
-
const DEFAULT_HOST = "127.0.0.1";
|
|
14
|
-
const DEFAULT_PORT = 8787;
|
|
15
13
|
const MAX_JSON_BODY_BYTES = 1024 * 1024;
|
|
16
14
|
class ServeHttpError extends Error {
|
|
17
15
|
status;
|
|
@@ -34,16 +32,6 @@ function mergeDefined(...values) {
|
|
|
34
32
|
}
|
|
35
33
|
return merged;
|
|
36
34
|
}
|
|
37
|
-
/**
 * Parses a TCP port number for the serve command.
 *
 * The value must consist of decimal digits only (an optional leading "+" and
 * surrounding whitespace are tolerated) and lie in the range 0-65535. Inputs
 * with trailing garbage or a fractional part, such as "80abc" or "8787.5", are
 * rejected instead of being truncated the way `Number.parseInt` would.
 *
 * @param value Raw port value; a falsy value (undefined, empty string, ...)
 *   selects the fallback.
 * @param fallback Port returned when no value is given.
 * @returns The parsed port, or `fallback`.
 * @throws Error with message `Invalid serve port: <value>` for invalid input.
 */
function parsePositivePort(value, fallback) {
    if (!value) {
        return fallback;
    }
    const text = String(value).trim();
    const parsed = /^\+?\d+$/.test(text) ? Number(text) : Number.NaN;
    if (!Number.isInteger(parsed) || parsed < 0 || parsed > 65535) {
        throw new Error(`Invalid serve port: ${value}`);
    }
    return parsed;
}
|
|
47
35
|
function parseSourceNames(source) {
|
|
48
36
|
if (Array.isArray(source)) {
|
|
49
37
|
return source.map((name) => name.trim()).filter(Boolean);
|
|
@@ -147,6 +135,21 @@ async function buildIndexes(targets) {
|
|
|
147
135
|
indexes
|
|
148
136
|
};
|
|
149
137
|
}
|
|
138
|
+
/**
 * Verifies that every query target passes index validation before a query is
 * executed.
 *
 * Targets are validated concurrently, since the validations are independent
 * and this check runs on the query path; the reported order still follows the
 * order of `targets`.
 *
 * @param targets Array of resolved targets; each has a `target` and an
 *   optional `source` name used in the error label.
 * @throws ServeHttpError 503 "index_not_ready" listing every target that failed
 *   validation together with its first validation error.
 */
async function assertTargetsReadyForQuery(targets) {
    const validations = await Promise.all(targets.map((target) => (0, sdk_1.validateIndex)(target.target)));
    const notReady = [];
    validations.forEach((validation, position) => {
        if (validation.ok) {
            return;
        }
        const target = targets[position];
        const label = target.source ? `${target.source} (${target.target})` : target.target;
        // Tolerate a validation result without an errors list.
        const firstError = validation.errors?.[0]?.message ?? "index is not query-ready";
        notReady.push(`${label}: ${firstError}`);
    });
    if (notReady.length > 0) {
        throw new ServeHttpError(503, "index_not_ready", `Index is not ready: ${notReady.join("; ")}`);
    }
}
|
|
150
153
|
function healthFromIndexes(startedAt, lastReloadAt, indexes) {
|
|
151
154
|
const ready = indexes.indexes.filter((index) => index.ok).length;
|
|
152
155
|
const failed = indexes.indexes.length - ready;
|
|
@@ -344,9 +347,14 @@ async function queryTargets(targets, question, options) {
|
|
|
344
347
|
}
|
|
345
348
|
async function startServe(options = {}) {
|
|
346
349
|
const env = options.env ?? process.env;
|
|
347
|
-
const
|
|
348
|
-
const
|
|
349
|
-
|
|
350
|
+
const { config } = await (0, config_file_1.readRagboxConfig)(options.configPath);
|
|
351
|
+
const { authToken, host, port } = (0, config_file_1.resolveRagboxServeConfig)({
|
|
352
|
+
config,
|
|
353
|
+
env,
|
|
354
|
+
authToken: options.authToken,
|
|
355
|
+
host: options.host,
|
|
356
|
+
port: options.port
|
|
357
|
+
});
|
|
350
358
|
const serverOptions = {
|
|
351
359
|
...options,
|
|
352
360
|
authToken,
|
|
@@ -426,6 +434,7 @@ async function startServe(options = {}) {
|
|
|
426
434
|
...resolvedTarget,
|
|
427
435
|
options: queryOptionsFromServeOptions(resolvedTarget.options, serverOptions, trace)
|
|
428
436
|
}));
|
|
437
|
+
await assertTargetsReadyForQuery(targets);
|
|
429
438
|
writeJson(response, 200, await queryTargets(targets, question, serverOptions));
|
|
430
439
|
return;
|
|
431
440
|
}
|