pageindex 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js ADDED
@@ -0,0 +1,2178 @@
1
+ #!/usr/bin/env node
3
+
4
+ // src/cli.ts
5
+ import { parseArgs } from "util";
6
+
7
+ // src/pdf.ts
8
+ import { PDFParse } from "pdf-parse";
9
+
10
+ // src/utils.ts
11
// Rough token estimate for LLM budgeting: ~4 characters per token, rounded up.
function countTokens(text) {
  return text ? Math.ceil(text.length / 4) : 0;
}
15
// Extract the JSON payload from a raw model response: strips <think> blocks
// and <output> tags, then prefers a fenced ```json block, then any fenced
// block, then the first {...} or [...] span, falling back to the bare text.
function getJsonContent(response) {
  const text = response
    .replace(/<think>[\s\S]*?<\/think>/g, "")
    .trim()
    .replace(/<\/?output>/gi, "")
    .trim();
  const candidates = [
    /```json\s*([\s\S]*?)```/,
    /```\s*([\s\S]*?)```/,
    /(\{[\s\S]*\}|\[[\s\S]*\])/
  ];
  for (const pattern of candidates) {
    const found = text.match(pattern);
    if (found && found[1]) {
      return found[1].trim();
    }
  }
  return text.trim();
}
33
// Normalize near-JSON (often Python-flavored) model output so JSON.parse can
// accept it: Python literals -> JSON literals, trailing commas removed,
// single quotes converted to double quotes.
// BUGFIX: the literal replacements are now word-bounded so substrings inside
// words (e.g. "Nonetheless") are no longer corrupted.
// NOTE(review): the blanket ' -> " swap still corrupts apostrophes inside
// values (e.g. "don't"); kept because callers rely on it to fix quoting.
function cleanJsonString(jsonContent) {
  return jsonContent
    .replace(/\bNone\b/g, "null")
    .replace(/\bTrue\b/g, "true")
    .replace(/\bFalse\b/g, "false")
    .replace(/,\s*]/g, "]")
    .replace(/,\s*}/g, "}")
    .replace(/'/g, '"');
}
36
// Last-resort parser: pulls known fields out of malformed JSON-ish model
// output with regexes. Each recognized field is stored under both its
// normalized snake_case key and its original spelling (string values are
// lowercased). Returns null when nothing at all was recognized.
function extractFieldsFromMalformedJson(content) {
  const result = {};
  const stringFields = [
    "toc_detected",
    "tocDetected",
    "answer",
    "start_begin",
    "startBegin",
    "completed",
    "page_index_given",
    "pageIndexGiven",
    "page_index_given_in_toc",
    "pageIndexGivenInToc",
    "reasoning",
    "thinking",
    "structure",
    "appear_start",
    "appearStart"
  ];
  const numberFields = ["confidence", "page", "physical_index", "physicalIndex"];
  for (const field of stringFields) {
    const match = content.match(new RegExp(`["']${field}["']\\s*:\\s*["']([^"']*?)["']`, "i"));
    if (match && match[1]) {
      const normalizedField = field.replace(/([A-Z])/g, "_$1").toLowerCase();
      result[normalizedField] = match[1].trim().toLowerCase();
      result[field] = match[1].trim().toLowerCase();
    }
  }
  for (const field of numberFields) {
    const match = content.match(new RegExp(`["']${field}["']\\s*:\\s*([\\d.]+)`, "i"));
    if (match && match[1]) {
      const normalizedField = field.replace(/([A-Z])/g, "_$1").toLowerCase();
      result[normalizedField] = parseFloat(match[1]);
      result[field] = parseFloat(match[1]);
    }
  }
  const titleMatch = content.match(/["']title["']\s*:\s*["']([^"']+)["']/i);
  if (titleMatch && titleMatch[1]) {
    result.title = titleMatch[1];
  }
  const tocMatch = content.match(/["']table_of_contents["']\s*:\s*\[/i);
  if (tocMatch) {
    // BUGFIX: search for the array opener *after* the matched key, not from
    // the start of content — an earlier "[" elsewhere grabbed the wrong array.
    const arrayContent = content.slice(content.indexOf("[", tocMatch.index));
    try {
      // Bracket-depth scan to find the matching closing "]".
      let depth = 0;
      let endIndex = 0;
      for (let i = 0; i < arrayContent.length; i++) {
        if (arrayContent[i] === "[") depth++;
        else if (arrayContent[i] === "]") {
          depth--;
          if (depth === 0) {
            endIndex = i + 1;
            break;
          }
        }
      }
      if (endIndex > 0) {
        const arrayStr = cleanJsonString(arrayContent.slice(0, endIndex));
        result.table_of_contents = JSON.parse(arrayStr);
      }
    } catch {
      // Malformed array: leave table_of_contents unset rather than fail.
    }
  }
  return Object.keys(result).length > 0 ? result : null;
}
101
// Parse JSON from a model response with graceful degradation:
//   1. cleaned fenced/inline JSON (getJsonContent + cleanJsonString),
//   2. the outermost {...} or [...] substring,
//   3. regex field extraction from malformed output.
// Returns the parsed value, or null (with a logged error) when all fail.
function extractJson(content) {
  try {
    let jsonContent = getJsonContent(content);
    jsonContent = cleanJsonString(jsonContent);
    return JSON.parse(jsonContent);
  } catch (error) {
    try {
      const cleanContent = content.replace(/<think>[\s\S]*?<\/think>/g, "").trim();
      const startBrace = cleanContent.indexOf("{");
      const startBracket = cleanContent.indexOf("[");
      let start = -1;
      let end = -1;
      // Pick whichever delimiter appears first to decide object vs array.
      if (startBrace !== -1 && (startBracket === -1 || startBrace < startBracket)) {
        start = startBrace;
        end = cleanContent.lastIndexOf("}");
      } else if (startBracket !== -1) {
        start = startBracket;
        end = cleanContent.lastIndexOf("]");
      }
      if (start !== -1 && end !== -1 && end > start) {
        return JSON.parse(cleanJsonString(cleanContent.slice(start, end + 1)));
      }
    } catch {
      // fall through to field-level extraction below
    }
    // BUGFIX: previously this fallback only ran when the substring parse
    // threw; it now also runs when no JSON delimiters were found at all.
    const extracted = extractFieldsFromMalformedJson(content);
    if (extracted) {
      return extracted;
    }
    console.error("Failed to extract JSON:", error);
    return null;
  }
}
135
// Depth-first assignment of zero-padded 4-digit nodeId values ("0000",
// "0001", ...) onto every object in the tree. Mutates the nodes in place and
// returns the next unused id.
function writeNodeId(data, nodeId = 0) {
  if (Array.isArray(data)) {
    return data.reduce((next, item) => writeNodeId(item, next), nodeId);
  }
  if (data && typeof data === "object") {
    data.nodeId = String(nodeId).padStart(4, "0");
    let next = nodeId + 1;
    if (data.nodes) {
      next = writeNodeId(data.nodes, next);
    }
    return next;
  }
  return nodeId;
}
149
// Flatten a node tree (or array of trees) into a pre-order list of the
// original node objects (no copies are made).
function structureToList(structure) {
  if (Array.isArray(structure)) {
    return structure.flatMap((item) => structureToList(item));
  }
  const flattened = [structure];
  if (structure.nodes) {
    flattened.push(...structureToList(structure.nodes));
  }
  return flattened;
}
163
// Build a tree from a flat TOC list using dotted structure indices
// ("1", "1.2", "1.2.3"): an item whose parent index was already seen becomes
// that parent's child; everything else becomes a root. Empty child lists are
// removed from the result.
function listToTree(data) {
  const parentOf = (structure) => {
    if (!structure) return null;
    const parts = structure.split(".");
    return parts.length > 1 ? parts.slice(0, -1).join(".") : null;
  };
  const byStructure = /* @__PURE__ */ new Map();
  const roots = [];
  for (const item of data) {
    const node = {
      title: item.title,
      startIndex: item.physicalIndex,
      endIndex: void 0,
      nodes: []
    };
    if (item.structure) {
      byStructure.set(item.structure, node);
    }
    const parentKey = parentOf(item.structure);
    const parent = parentKey ? byStructure.get(parentKey) : void 0;
    if (parent) {
      parent.nodes.push(node);
    } else {
      roots.push(node);
    }
  }
  const prune = (node) => {
    if (node.nodes && node.nodes.length > 0) {
      node.nodes.forEach(prune);
    } else {
      delete node.nodes;
    }
    return node;
  };
  return roots.map(prune);
}
199
// If the first TOC entry starts after page 1, prepend a synthetic "Preface"
// entry covering the pages before it. Mutates and returns data.
function addPrefaceIfNeeded(data) {
  if (!Array.isArray(data) || data.length === 0) return data;
  const [first] = data;
  if (first && first.physicalIndex && first.physicalIndex > 1) {
    data.unshift({ structure: "0", title: "Preface", physicalIndex: 1 });
  }
  return data;
}
211
// Assign start/end page indices to a flat TOC list, then convert it to a
// tree. Each section ends where the next one begins (minus one page when the
// next section starts at the very top of its page, i.e. appearStart "yes");
// the final section ends at endPhysicalIndex. Returns the tree, or the flat
// list when no tree could be built.
function postProcessing(structure, endPhysicalIndex) {
  structure.forEach((item, i) => {
    item.startIndex = item.physicalIndex;
    const next = structure[i + 1];
    if (!next) {
      item.endIndex = endPhysicalIndex;
    } else if (next.appearStart === "yes") {
      item.endIndex = (next.physicalIndex || 0) - 1;
    } else {
      item.endIndex = next.physicalIndex;
    }
  });
  const tree = listToTree(structure);
  return tree.length > 0 ? tree : structure;
}
229
// Recursively copy data, dropping the named keys (default: "text") from every
// object along the way. Non-object values are returned unchanged.
function removeFields(data, fields = ["text"]) {
  if (Array.isArray(data)) {
    return data.map((item) => removeFields(item, fields));
  }
  if (data && typeof data === "object") {
    return Object.fromEntries(
      Object.entries(data)
        .filter(([key]) => !fields.includes(key))
        .map(([key, value]) => [key, removeFields(value, fields)])
    );
  }
  return data;
}
244
// Convert a physical-index value to an integer. Accepts "<physical_index_X>",
// "physical_index_X", or a plain numeric string (null when unparseable).
// Given an array, normalizes each item's string physicalIndex in place and
// returns the array.
function convertPhysicalIndexToInt(data) {
  if (typeof data === "string") {
    if (data.startsWith("<physical_index_")) {
      return parseInt(data.split("_").pop().replace(">", "").trim(), 10);
    }
    if (data.startsWith("physical_index_")) {
      return parseInt(data.split("_").pop().trim(), 10);
    }
    const parsed = Number.parseInt(data, 10);
    return Number.isNaN(parsed) ? null : parsed;
  }
  if (Array.isArray(data)) {
    for (const item of data) {
      if (typeof item.physicalIndex !== "string") continue;
      const value = convertPhysicalIndexToInt(item.physicalIndex);
      if (typeof value === "number") {
        item.physicalIndex = value;
      }
    }
  }
  return data;
}
267
// Parse string "page" values into integers in place; values that do not
// parse (or are already numbers) are left untouched. Returns the same array.
function convertPageToInt(data) {
  for (const item of data) {
    if (typeof item.page !== "string") continue;
    const parsed = Number.parseInt(item.page, 10);
    if (!Number.isNaN(parsed)) {
      item.page = parsed;
    }
  }
  return data;
}
278
// Return a copy of data containing keyOrder's keys in that order; an empty
// keyOrder returns data itself.
// NOTE(review): keys absent from keyOrder are dropped, not appended — callers
// appear to use this as a projection as well as a reorder.
function reorderDict(data, keyOrder) {
  if (!keyOrder.length) return data;
  return keyOrder.reduce((ordered, key) => {
    if (key in data) {
      ordered[key] = data[key];
    }
    return ordered;
  }, {});
}
288
// Recursively reorder every node's keys to match `order` (no-op when order is
// falsy). Empty `nodes` arrays are deleted. Child lists are replaced in place
// on the input, while each node itself is returned as a reordered copy.
function formatStructure(structure, order) {
  if (!order) return structure;
  if (Array.isArray(structure)) {
    return structure.map((item) => formatStructure(item, order));
  }
  if (structure.nodes) {
    structure.nodes = formatStructure(structure.nodes, order);
  }
  const hasChildren = structure.nodes && structure.nodes.length > 0;
  if (!hasChildren) {
    delete structure.nodes;
  }
  return reorderDict(structure, order);
}
301
// Produce a slimmed copy of the tree keeping only the fields needed for
// description generation (title, nodeId, summary, prefixSummary) plus
// non-empty child lists.
function createCleanStructureForDescription(structure) {
  if (Array.isArray(structure)) {
    return structure.map((item) => createCleanStructureForDescription(item));
  }
  const keep = ["title", "nodeId", "summary", "prefixSummary"];
  const cleanNode = {};
  for (const key of keep) {
    if (key in structure) {
      cleanNode[key] = structure[key];
    }
  }
  if (structure.nodes && structure.nodes.length > 0) {
    cleanNode.nodes = createCleanStructureForDescription(structure.nodes);
  }
  return cleanNode;
}
326
+
327
+ // src/pdf.ts
328
+ import * as fs from "fs/promises";
329
// Parse a PDF (file path, ArrayBuffer, or byte view) with pdf-parse: returns
// the document title (metadata Title, falling back to "Untitled"), the page
// count, and per-page text with an estimated token count.
async function parsePdf(input) {
  let data;
  if (typeof input === "string") {
    data = new Uint8Array(await fs.readFile(input));
  } else if (input instanceof ArrayBuffer) {
    data = new Uint8Array(input);
  } else {
    data = new Uint8Array(input);
  }
  const parser = new PDFParse({ data });
  const textResult = await parser.getText();
  const infoResult = await parser.getInfo();
  const title = infoResult?.info?.Title ? infoResult.info.Title : "Untitled";
  const pages = textResult.pages.map((pageText) => {
    const text = pageText.text;
    return { text, tokenCount: countTokens(text) };
  });
  await parser.destroy();
  return {
    title,
    numPages: textResult.pages.length,
    pages
  };
}
359
// Derive a display name from a PDF path: the basename with any trailing
// ".pdf" extension stripped (case-insensitive); "Untitled" for empty input.
// BUGFIX: splits on both "/" and "\" so Windows-style paths no longer yield
// the entire path as the name.
function getPdfName(pdfPath) {
  const segments = pdfPath.split(/[\\/]/);
  const basename3 = segments[segments.length - 1] || "Untitled";
  return basename3.replace(/\.pdf$/i, "");
}
364
+
365
+ // src/ocr.ts
366
+ import { exec } from "child_process";
367
+ import { promisify } from "util";
368
+ import * as path from "path";
369
+ import * as os from "os";
370
+ import * as fs2 from "fs/promises";
371
+ import OpenAI from "openai";
372
// Promisified child_process.exec, used to shell out to poppler tools.
var execAsync = promisify(exec);
// Defaults for the OCR pipeline: a local GLM-OCR model behind an
// OpenAI-compatible endpoint, 150-dpi PNG page renders, 3 pages in flight.
var DEFAULT_OCR_OPTIONS = {
  ocrModel: "mlx-community/GLM-OCR-bf16",
  imageFormat: "png",
  imageDpi: 150,
  ocrPromptType: "text",
  concurrency: 3
};
// Task-specific instruction prefixes sent alongside each page image.
var OCR_PROMPTS = {
  text: "Text Recognition:",
  formula: "Formula Recognition:",
  table: "Table Recognition:"
};
385
// True when poppler's pdftocairo binary is on PATH (probed via `which`).
async function checkPopplerInstalled() {
  return execAsync("which pdftocairo").then(
    () => true,
    () => false
  );
}
393
// Render each page of a PDF file to an image with poppler's pdftocairo.
// Images land in a fresh temp directory; returns their paths sorted by page
// number. Throws when poppler is missing or conversion fails (the temp dir
// is cleaned up on failure — successful output is the caller's to delete).
async function pdfToImages(pdfPath, options = {}) {
  const format = options.imageFormat || DEFAULT_OCR_OPTIONS.imageFormat;
  const dpi = options.imageDpi || DEFAULT_OCR_OPTIONS.imageDpi;
  const installed = await checkPopplerInstalled();
  if (!installed) {
    throw new Error(
      "Poppler tools not installed. Install with:\n macOS: brew install poppler\n Ubuntu: sudo apt-get install poppler-utils"
    );
  }
  const tempDir = await fs2.mkdtemp(path.join(os.tmpdir(), "pageindex-ocr-"));
  const outputPrefix = path.join(tempDir, "page");
  const formatFlag = format === "png" ? "-png" : "-jpeg";
  // NOTE(review): pdfPath is quoted but not escaped; a path containing a
  // double quote would break (or inject into) this shell command.
  const cmd = `pdftocairo ${formatFlag} -r ${dpi} "${pdfPath}" "${outputPrefix}"`;
  try {
    await execAsync(cmd);
  } catch (error) {
    // Best-effort cleanup before surfacing the conversion failure.
    await fs2.rm(tempDir, { recursive: true, force: true }).catch(() => {
    });
    throw new Error(`PDF conversion failed: ${error instanceof Error ? error.message : String(error)}`);
  }
  const files = await fs2.readdir(tempDir);
  // pdftocairo names files "page-1.png", "page-2.png", ...: sort numerically
  // by the page number embedded in the filename.
  const imageFiles = files.filter((f) => f.endsWith(`.${format}`)).sort((a, b) => {
    const numA = parseInt(a.match(/-(\d+)\./)?.[1] || "0");
    const numB = parseInt(b.match(/-(\d+)\./)?.[1] || "0");
    return numA - numB;
  }).map((f) => path.join(tempDir, f));
  return imageFiles;
}
421
// Write a PDF buffer to a temp file, render its pages to images via
// pdfToImages, then remove the temp PDF and its directory (the images live
// in a separate temp directory created by pdfToImages and survive).
async function pdfBufferToImages(pdfBuffer, options = {}) {
  const tempDir = await fs2.mkdtemp(path.join(os.tmpdir(), "pageindex-pdf-"));
  const tempPdfPath = path.join(tempDir, "input.pdf");
  const buffer = pdfBuffer instanceof ArrayBuffer ? Buffer.from(pdfBuffer) : pdfBuffer;
  await fs2.writeFile(tempPdfPath, buffer);
  try {
    return await pdfToImages(tempPdfPath, options);
  } finally {
    // Best-effort cleanup; failures here must not mask the real result/error.
    await fs2.unlink(tempPdfPath).catch(() => {
    });
    await fs2.rmdir(tempDir).catch(() => {
    });
  }
}
435
// OCR a single page image via an OpenAI-compatible chat endpoint: the image
// is sent as a base64 data URL together with a task prompt. Returns the
// recognized text, or "" on any API error (logged, never thrown).
async function ocrImage(imagePath, options = {}) {
  const model = options.ocrModel || DEFAULT_OCR_OPTIONS.ocrModel;
  const promptType = options.ocrPromptType || DEFAULT_OCR_OPTIONS.ocrPromptType;
  const prompt = OCR_PROMPTS[promptType] ?? OCR_PROMPTS.text ?? "Text Recognition:";
  // Defaults target a local LM Studio server when no credentials are given.
  const apiKey = options.apiKey || process.env.OPENAI_API_KEY || "lm-studio";
  const baseUrl = options.baseUrl || process.env.OPENAI_BASE_URL || "http://localhost:1234/v1";
  // NOTE(review): a new client is constructed per image; consider reusing one
  // per batch if this ever shows up in profiles.
  const client = new OpenAI({
    apiKey,
    baseURL: baseUrl
  });
  const imageData = await fs2.readFile(imagePath);
  const base64Image = imageData.toString("base64");
  const mimeType = imagePath.endsWith(".png") ? "image/png" : "image/jpeg";
  try {
    const contentParts = [
      {
        type: "image_url",
        image_url: {
          url: `data:${mimeType};base64,${base64Image}`
        }
      },
      {
        type: "text",
        text: prompt
      }
    ];
    const response = await client.chat.completions.create({
      model,
      messages: [
        {
          role: "user",
          content: contentParts
        }
      ],
      max_tokens: 4096
    });
    return response.choices[0]?.message?.content || "";
  } catch (error) {
    // A failed page yields empty text so the rest of the document proceeds.
    console.error(`[OCR Error] Failed to process ${imagePath}:`, error);
    return "";
  }
}
477
// OCR many page images in order, `concurrency` at a time, logging progress
// after each batch. Failed pages come back as "" (see ocrImage) rather than
// aborting the whole run.
async function ocrImages(imagePaths, options = {}) {
  const concurrency = options.concurrency || DEFAULT_OCR_OPTIONS.concurrency;
  const results = [];
  for (let i = 0; i < imagePaths.length; i += concurrency) {
    const batch = imagePaths.slice(i, i + concurrency);
    const batchResults = await Promise.all(
      batch.map((imagePath) => ocrImage(imagePath, options))
    );
    results.push(...batchResults);
    const processed = Math.min(i + concurrency, imagePaths.length);
    console.log(`[OCR] Processed ${processed}/${imagePaths.length} pages`);
  }
  return results;
}
491
// OCR-based PDF parsing: render pages to images, OCR each one, and return
// per-page text with token counts. Temp images are removed afterwards by
// cleanupTempImages (which derives the temp dir from the image paths).
// CLEANUP: removed the unused local `tempDir` that was computed but never
// read.
async function parsePdfWithOcr(input, options = {}) {
  console.log("[OCR Mode] Converting PDF to images...");
  let imagePaths;
  if (typeof input === "string") {
    imagePaths = await pdfToImages(input, options);
  } else {
    const buffer = input instanceof ArrayBuffer ? Buffer.from(input) : input;
    imagePaths = await pdfBufferToImages(buffer, options);
  }
  console.log(`[OCR Mode] Extracted ${imagePaths.length} page images`);
  console.log("[OCR Mode] Running OCR on pages...");
  const texts = await ocrImages(imagePaths, options);
  const pages = texts.map((text) => ({
    text,
    tokenCount: countTokens(text)
  }));
  await cleanupTempImages(imagePaths);
  return { pages };
}
514
// Delete the given page images and then their (now empty) temp directory;
// every removal is best-effort and never throws.
async function cleanupTempImages(imagePaths) {
  const [firstPath] = imagePaths;
  if (!firstPath) return;
  const tempDir = path.dirname(firstPath);
  const removals = imagePaths.map((p) => fs2.unlink(p).catch(() => {
  }));
  await Promise.all(removals);
  await fs2.rmdir(tempDir).catch(() => {
  });
}
526
+
527
+ // src/openai.ts
528
+ import OpenAI2 from "openai";
529
// Promise-based delay helper used by the retry loops below.
var sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
// Cached OpenAI client plus the base URL it was built for (see getClient).
var clientInstance = null;
var currentBaseUrl;
532
// Lazily build (and cache) an OpenAI client. The cached instance is reused
// only while both the base URL and the API key are unchanged.
// BUGFIX: previously only the base URL was compared, so a changed API key
// kept returning the stale cached client. The key check reads the client's
// public `apiKey` property (openai-node v4).
function getClient(config = {}) {
  const apiKey = config.apiKey || process.env.OPENAI_API_KEY || "lm-studio";
  const baseUrl = config.baseUrl || process.env.OPENAI_BASE_URL;
  if (clientInstance && currentBaseUrl === baseUrl && clientInstance.apiKey === apiKey) {
    return clientInstance;
  }
  clientInstance = new OpenAI2({
    apiKey,
    baseURL: baseUrl
  });
  currentBaseUrl = baseUrl;
  return clientInstance;
}
545
// Send a single prompt and return just the text content. LM Studio thinking
// models go through the native endpoint; everything else uses the standard
// chat-completions path.
async function chatGPT(options) {
  const useNative = isLMStudio(options.baseUrl) && isThinkingModel(options.model);
  const result = useNative
    ? await chatLMStudioNative(options)
    : await chatGPTWithFinishReason(options);
  return result.content;
}
553
// Chat-completions call with linear-backoff retries (1s, 2s, ...). Resolves
// with the text and a finish reason ("finished" | "max_output_reached"), or
// { content: "Error", finishReason: "error" } after maxRetries failures.
async function chatGPTWithFinishReason(options) {
  const {
    model,
    prompt,
    apiKey,
    baseUrl,
    chatHistory,
    temperature = 0,
    maxRetries = 10
  } = options;
  const client = getClient({ apiKey, baseUrl });
  const userMessage = { role: "user", content: prompt };
  const messages = chatHistory ? [...chatHistory, userMessage] : [userMessage];
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const response = await client.chat.completions.create({ model, messages, temperature });
      const choice = response.choices[0];
      if (!choice) {
        throw new Error("No response from model");
      }
      return {
        content: choice.message.content || "",
        finishReason: choice.finish_reason === "length" ? "max_output_reached" : "finished"
      };
    } catch (error) {
      console.error(`[Retry ${attempt + 1}/${maxRetries}]`, error);
      if (attempt >= maxRetries - 1) {
        console.error("Max retries reached for prompt:", prompt.slice(0, 100));
        return { content: "Error", finishReason: "error" };
      }
      await sleep(1e3 * (attempt + 1));
    }
  }
  return { content: "Error", finishReason: "error" };
}
593
// Heuristic: treat URLs pointing at the default local LM Studio port (1234)
// as LM Studio servers. Generalized to accept both the "localhost" and
// "127.0.0.1" spellings; undefined/empty input is not LM Studio.
function isLMStudio(baseUrl) {
  if (!baseUrl) return false;
  return baseUrl.includes("localhost:1234") || baseUrl.includes("127.0.0.1:1234");
}
596
// Does this model family emit native reasoning/thinking output? Matched by
// case-insensitive substring: qwen3.5, qwen3-coder, deepseek, and the
// OpenAI o1/o3 series.
function isThinkingModel(model) {
  const families = [/qwen3\.5/i, /qwen3-coder/i, /deepseek/i, /o1/i, /o3/i];
  return families.some((pattern) => pattern.test(model));
}
611
// Models observed to reject the LM Studio "reasoning" parameter; remembered
// so later calls skip the parameter immediately (see chatLMStudioNative).
var modelsWithoutReasoning = /* @__PURE__ */ new Set();
612
// Call LM Studio's native REST endpoint (/api/v1/chat) directly; used for
// "thinking" models. Reasoning is requested "off" unless the model is known
// to reject the parameter (tracked in modelsWithoutReasoning), in which case
// the request is replayed without it. Retries with linear backoff; returns
// { content: "Error", finishReason: "error" } after maxRetries failures.
async function chatLMStudioNative(options) {
  const {
    model,
    prompt,
    temperature = 0,
    maxRetries = 10
  } = options;
  const baseUrl = "http://localhost:1234";
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      const body = {
        model,
        input: prompt,
        temperature
      };
      // Suppress reasoning output unless this model rejects the parameter.
      if (!modelsWithoutReasoning.has(model)) {
        body.reasoning = "off";
      }
      const response = await fetch(`${baseUrl}/api/v1/chat`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json"
        },
        body: JSON.stringify(body)
      });
      const data = await response.json();
      if (!response.ok) {
        // If "reasoning" was the rejected parameter, remember that for this
        // model and replay the same request without it.
        if (data.type === "invalid_request" && data.param === "reasoning") {
          console.log(`[LM Studio] Model ${model} doesn't support reasoning parameter, retrying without it`);
          modelsWithoutReasoning.add(model);
          const retryResponse = await fetch(`${baseUrl}/api/v1/chat`, {
            method: "POST",
            headers: {
              "Content-Type": "application/json"
            },
            body: JSON.stringify({
              model,
              input: prompt,
              temperature
            })
          });
          if (!retryResponse.ok) {
            const retryError = await retryResponse.text();
            throw new Error(`LM Studio API error on retry: ${retryResponse.status} - ${retryError}`);
          }
          const retryData = await retryResponse.json();
          // The native API returns an array of output items; the chat text
          // lives on the item with type "message".
          const messageOutput2 = retryData.output?.find((o) => o.type === "message");
          return {
            content: messageOutput2?.content || "",
            finishReason: "finished"
          };
        }
        throw new Error(`LM Studio API error: ${response.status} - ${data.message || JSON.stringify(data)}`);
      }
      const messageOutput = data.output?.find((o) => o.type === "message");
      const content = messageOutput?.content || "";
      return {
        content,
        finishReason: "finished"
      };
    } catch (error) {
      console.error(`[LM Studio Retry ${attempt + 1}/${maxRetries}]`, error);
      if (attempt < maxRetries - 1) {
        // Linear backoff: 1s, 2s, 3s, ...
        await sleep(1e3 * (attempt + 1));
      } else {
        console.error("Max retries reached for prompt:", prompt.slice(0, 100));
        return { content: "Error", finishReason: "error" };
      }
    }
  }
  return { content: "Error", finishReason: "error" };
}
684
+
685
+ // src/prompts.ts
686
+ function tocDetectorPrompt(content) {
687
+ return `Your job is to detect if there is a table of content provided in the given text.
688
+
689
+ Given text: ${content}
690
+
691
+ return the following JSON format:
692
+ {
693
+ "thinking": <why do you think there is a table of content in the given text>
694
+ "toc_detected": "<yes or no>",
695
+ }
696
+
697
+ Directly return the final JSON structure. Do not output anything else.
698
+ Please note: abstract, summary, notation list, figure list, table list, etc. are not table of contents.`;
699
+ }
700
+ function checkTitleAppearancePrompt(title, pageText) {
701
+ return `Your job is to check if the given section appears or starts in the given page_text.
702
+
703
+ Note: do fuzzy matching, ignore any space inconsistency in the page_text.
704
+
705
+ The given section title is ${title}.
706
+ The given page_text is ${pageText}.
707
+
708
+ Reply format:
709
+ {
710
+ "thinking": <why do you think the section appears or starts in the page_text>
711
+ "answer": "yes or no" (yes if the section appears or starts in the page_text, no otherwise)
712
+ }
713
+ Directly return the final JSON structure. Do not output anything else.`;
714
+ }
715
+ function checkTitleStartAtBeginningPrompt(title, pageText) {
716
+ return `You will be given the current section title and the current page_text.
717
+ Your job is to check if the current section starts in the beginning of the given page_text.
718
+ If there are other contents before the current section title, then the current section does not start in the beginning of the given page_text.
719
+ If the current section title is the first content in the given page_text, then the current section starts in the beginning of the given page_text.
720
+
721
+ Note: do fuzzy matching, ignore any space inconsistency in the page_text.
722
+
723
+ The given section title is ${title}.
724
+ The given page_text is ${pageText}.
725
+
726
+ reply format:
727
+ {
728
+ "thinking": <why do you think the section appears or starts in the page_text>
729
+ "start_begin": "yes or no" (yes if the section starts in the beginning of the page_text, no otherwise)
730
+ }
731
+ Directly return the final JSON structure. Do not output anything else.`;
732
+ }
733
+ function checkTocTransformationCompletePrompt(rawToc, cleanedToc) {
734
+ return `You are given a raw table of contents and a cleaned table of contents.
735
+ Your job is to check if the cleaned table of contents is complete.
736
+
737
+ Reply format:
738
+ {
739
+ "thinking": <why do you think the cleaned table of contents is complete or not>
740
+ "completed": "yes" or "no"
741
+ }
742
+ Directly return the final JSON structure. Do not output anything else.
743
+
744
+ Raw Table of contents:
745
+ ${rawToc}
746
+
747
+ Cleaned Table of contents:
748
+ ${cleanedToc}`;
749
+ }
750
+ function detectPageIndexPrompt(tocContent) {
751
+ return `You will be given a table of contents.
752
+
753
+ Your job is to detect if there are page numbers/indices given within the table of contents.
754
+
755
+ Given text: ${tocContent}
756
+
757
+ Reply format:
758
+ {
759
+ "thinking": <why do you think there are page numbers/indices given within the table of contents>
760
+ "page_index_given_in_toc": "<yes or no>"
761
+ }
762
+ Directly return the final JSON structure. Do not output anything else.`;
763
+ }
764
+ function tocTransformerPrompt(tocContent) {
765
+ return `You are given a table of contents, You job is to transform the whole table of content into a JSON format included table_of_contents.
766
+
767
+ structure is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
768
+
769
+ The response should be in the following JSON format:
770
+ {
771
+ table_of_contents: [
772
+ {
773
+ "structure": <structure index, "x.x.x" or None> (string),
774
+ "title": <title of the section>,
775
+ "page": <page number or None>,
776
+ },
777
+ ...
778
+ ],
779
+ }
780
+ You should transform the full table of contents in one go.
781
+ Directly return the final JSON structure, do not output anything else.
782
+
783
+ Given table of contents:
784
+ ${tocContent}`;
785
+ }
786
+ function tocTransformerContinuePrompt(rawToc, incompleteToc) {
787
+ return `Your task is to continue the table of contents json structure, directly output the remaining part of the json structure.
788
+ The response should be in the following JSON format:
789
+
790
+ The raw table of contents json structure is:
791
+ ${rawToc}
792
+
793
+ The incomplete transformed table of contents json structure is:
794
+ ${incompleteToc}
795
+
796
+ Please continue the json structure, directly output the remaining part of the json structure.`;
797
+ }
798
+ function tocIndexExtractorPrompt(toc, content) {
799
+ return `You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format.
800
+
801
+ The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.
802
+
803
+ The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
804
+
805
+ The response should be in the following JSON format:
806
+ [
807
+ {
808
+ "structure": <structure index, "x.x.x" or None> (string),
809
+ "title": <title of the section>,
810
+ "physical_index": "<physical_index_X>" (keep the format)
811
+ },
812
+ ...
813
+ ]
814
+
815
+ Only add the physical_index to the sections that are in the provided pages.
816
+ If the section is not in the provided pages, do not add the physical_index to it.
817
+ Directly return the final JSON structure. Do not output anything else.
818
+
819
+ Table of contents:
820
+ ${toc}
821
+
822
+ Document pages:
823
+ ${content}`;
824
+ }
825
+ function addPageNumberToTocPrompt(part, structure) {
826
+ return `You are given an JSON structure of a document and a partial part of the document. Your task is to check if the title that is described in the structure is started in the partial given document.
827
+
828
+ The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.
829
+
830
+ If the full target section starts in the partial given document, insert the given JSON structure with the "start": "yes", and "start_index": "<physical_index_X>".
831
+
832
+ If the full target section does not start in the partial given document, insert "start": "no", "start_index": None.
833
+
834
+ The response should be in the following format.
835
+ [
836
+ {
837
+ "structure": <structure index, "x.x.x" or None> (string),
838
+ "title": <title of the section>,
839
+ "start": "<yes or no>",
840
+ "physical_index": "<physical_index_X> (keep the format)" or None
841
+ },
842
+ ...
843
+ ]
844
+ The given structure contains the result of the previous part, you need to fill the result of the current part, do not change the previous result.
845
+ Directly return the final JSON structure. Do not output anything else.
846
+
847
+ Current Partial Document:
848
+ ${part}
849
+
850
+ Given Structure
851
+ ${structure}`;
852
+ }
853
+ function generateTocInitPrompt(part) {
854
+ return `You are an expert in extracting hierarchical tree structure, your task is to generate the tree structure of the document.
855
+
856
+ The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
857
+
858
+ For the title, you need to extract the original title from the text, only fix the space inconsistency.
859
+
860
+ The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X.
861
+
862
+ For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format.
863
+
864
+ The response should be in the following format.
865
+ [
866
+ {
867
+ "structure": <structure index, "x.x.x"> (string),
868
+ "title": <title of the section, keep the original title>,
869
+ "physical_index": "<physical_index_X> (keep the format)"
870
+ },
871
+
872
+ ],
873
+
874
+
875
+ Directly return the final JSON structure. Do not output anything else.
876
+
877
+ Given text:
878
+ ${part}`;
879
+ }
880
// Builds the prompt that asks the model to EXTEND a previously generated TOC
// with entries found in the next document part. The model is instructed to
// return only the additional items, which generateTocContinue appends.
function generateTocContinuePrompt(part, previousStructure) {
  return `You are an expert in extracting hierarchical tree structure.
You are given a tree structure of the previous part and the text of the current part.
Your task is to continue the tree structure from the previous part to include the current part.

The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.

For the title, you need to extract the original title from the text, only fix the space inconsistency.

The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X.

For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format.

The response should be in the following format.
[
{
"structure": <structure index, "x.x.x"> (string),
"title": <title of the section, keep the original title>,
"physical_index": "<physical_index_X> (keep the format)"
},
...
]

Directly return the additional part of the final JSON structure. Do not output anything else.

Given text:
${part}

Previous tree structure:
${previousStructure}`;
}
911
// Builds the prompt used by singleTocItemIndexFixer: given one section title
// and a tagged page range, the model must return the <physical_index_X> of
// the page where the section starts, as a small JSON object.
function singleTocItemIndexFixerPrompt(sectionTitle, content) {
  return `You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.

The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.

Reply in a JSON format:
{
"thinking": <explain which page, started and closed by <physical_index_X>, contains the start of this section>,
"physical_index": "<physical_index_X>" (keep the format)
}
Directly return the final JSON structure. Do not output anything else.

Section Title:
${sectionTitle}

Document pages:
${content}`;
}
929
// Builds the prompt asking the model to summarize the main points of one
// tree node's text (used by generateNodeSummary).
function generateNodeSummaryPrompt(nodeText) {
  return `You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document.

Partial Document Text: ${nodeText}

Directly return the description, do not include any other text.`;
}
936
/**
 * Builds the prompt asking the model for a one-sentence description of a
 * document, given its (cleaned) structure serialized as JSON text.
 *
 * Fix: the original prompt opened with the typo "Your are an expert";
 * corrected to "You are an expert".
 *
 * @param {string} structure - JSON-serialized document structure
 * @returns {string} the complete prompt text
 */
function generateDocDescriptionPrompt(structure) {
  return `You are an expert in generating descriptions for a document.
You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.

Document Structure: ${structure}

Directly return the description, do not include any other text.`;
}
944
+
945
+ // src/toc.ts
946
/**
 * Asks the LLM whether a single page's text looks like a table of contents.
 *
 * @param {string} content - raw text of one page
 * @param {object} options - model/apiKey/baseUrl configuration
 * @returns {Promise<"yes"|"no">}
 */
async function tocDetectorSinglePage(content, options) {
  const { model, apiKey, baseUrl } = options;
  const reply = await chatGPT({
    model,
    prompt: tocDetectorPrompt(content),
    apiKey,
    baseUrl
  });
  const parsed = extractJson(reply);
  if (parsed?.toc_detected === "yes") {
    return "yes";
  }
  return "no";
}
957
/**
 * Scans `pages` from `startPageIndex` and collects the indices of
 * consecutive pages the LLM classifies as table-of-contents pages.
 *
 * Scanning normally stops at `options.tocCheckPageNum`, but may run past
 * that limit while the previous page was still a TOC page (`lastPageIsYes`),
 * so a TOC straddling the cutoff is captured whole. The first "no" after a
 * run of "yes" answers ends the scan.
 *
 * @returns {Promise<number[]>} 0-based indices of detected TOC pages
 */
async function findTocPages(startPageIndex, pages, options) {
  let lastPageIsYes = false;
  const tocPageList = [];
  let i = startPageIndex;
  while (i < pages.length) {
    // Past the page budget and not inside a TOC run: stop scanning.
    if (i >= options.tocCheckPageNum && !lastPageIsYes) {
      break;
    }
    const page = pages[i];
    if (!page) {
      i++;
      continue;
    }
    const detected = await tocDetectorSinglePage(page.text, options);
    if (detected === "yes") {
      tocPageList.push(i);
      lastPageIsYes = true;
    } else if (detected === "no" && lastPageIsYes) {
      // End of a contiguous TOC run.
      break;
    }
    i++;
  }
  return tocPageList;
}
981
/**
 * Normalizes TOC leader dots to ": " so entries like "Title ...... 12" or
 * "Title . . . . . . 12" become "Title: 12", which downstream LLM parsing
 * handles more reliably. Runs of fewer than five dots are left untouched.
 */
function transformDotsToColon(text) {
  return text
    .replace(/\.{5,}/g, ": ")
    .replace(/(?:\. ){5,}\.?/g, ": ");
}
986
/**
 * Asks the LLM whether the extracted TOC text includes printed page numbers.
 *
 * @param {string} tocContent - concatenated, dot-normalized TOC text
 * @returns {Promise<"yes"|"no">}
 */
async function detectPageIndex(tocContent, options) {
  const { model, apiKey, baseUrl } = options;
  const reply = await chatGPT({
    model,
    prompt: detectPageIndexPrompt(tocContent),
    apiKey,
    baseUrl
  });
  const parsed = extractJson(reply);
  if (parsed?.page_index_given_in_toc === "yes") {
    return "yes";
  }
  return "no";
}
997
/**
 * Concatenates the text of the detected TOC pages, normalizes leader dots,
 * and asks the LLM whether the TOC carries printed page numbers.
 *
 * @returns {Promise<{tocContent: string, pageIndexGivenInToc: "yes"|"no"}>}
 */
async function tocExtractor(pages, tocPageList, options) {
  const parts = [];
  for (const pageIndex of tocPageList) {
    const page = pages[pageIndex];
    if (page) {
      parts.push(page.text);
    }
  }
  const tocContent = transformDotsToColon(parts.join(""));
  const pageIndexGivenInToc = await detectPageIndex(tocContent, options);
  return { tocContent, pageIndexGivenInToc };
}
1012
/**
 * Asks the LLM whether `cleanedToc` (the structured transformation) covers
 * all of `rawToc`. Returns true only on an explicit "yes".
 */
async function checkTocTransformationComplete(rawToc, cleanedToc, options) {
  const reply = await chatGPT({
    model: options.model,
    prompt: checkTocTransformationCompletePrompt(rawToc, cleanedToc),
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
  return extractJson(reply)?.completed === "yes";
}
1023
/**
 * Converts raw TOC text into a structured JSON array via the LLM, resuming
 * truncated generations: while the output is incomplete (cut off by the
 * token limit, or judged incomplete by a second LLM call), it re-prompts
 * with the partial JSON and appends the continuation, up to 5 attempts.
 *
 * @returns the parsed `table_of_contents` array (page fields coerced to
 *          int via convertPageToInt), or [] when the accumulated text never
 *          becomes valid JSON.
 */
async function tocTransformer(tocContent, options) {
  const prompt = tocTransformerPrompt(tocContent);
  let { content: lastComplete, finishReason } = await chatGPTWithFinishReason({
    model: options.model,
    prompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
  let isComplete = await checkTocTransformationComplete(tocContent, lastComplete, options);
  // Fast path: single complete generation that parses cleanly.
  if (isComplete && finishReason === "finished") {
    const json = extractJson(lastComplete);
    if (json?.table_of_contents) {
      return convertPageToInt(json.table_of_contents);
    }
  }
  // Strip code fences / surrounding chatter before entering the resume loop.
  lastComplete = getJsonContent(lastComplete);
  let attempts = 0;
  const maxAttempts = 5;
  while (!(isComplete && finishReason === "finished") && attempts < maxAttempts) {
    // Truncate just past the last "}" so the continuation prompt resumes
    // from a clean object boundary.
    // NOTE(review): slice(0, position + 2) keeps one extra character after
    // the final "}" (e.g. a trailing comma); position + 1 would keep only
    // the brace — confirm which is intended.
    const position = lastComplete.lastIndexOf("}");
    if (position !== -1) {
      lastComplete = lastComplete.slice(0, position + 2);
    }
    const continuePrompt = tocTransformerContinuePrompt(tocContent, lastComplete);
    const result = await chatGPTWithFinishReason({
      model: options.model,
      prompt: continuePrompt,
      apiKey: options.apiKey,
      baseUrl: options.baseUrl
    });
    let newContent = result.content;
    finishReason = result.finishReason;
    if (newContent.startsWith("```json")) {
      newContent = getJsonContent(newContent);
    }
    lastComplete = lastComplete + newContent;
    isComplete = await checkTocTransformationComplete(tocContent, lastComplete, options);
    attempts++;
  }
  try {
    const parsed = JSON.parse(lastComplete);
    return convertPageToInt(parsed.table_of_contents || parsed);
  } catch {
    console.error("Failed to parse TOC JSON");
    return [];
  }
}
1070
/**
 * Asks the LLM to attach physical page indices to the given TOC items using
 * the tagged page text in `content`. Returns [] when the reply has no
 * parsable JSON.
 */
async function tocIndexExtractor(toc, content, options) {
  const reply = await chatGPT({
    model: options.model,
    prompt: tocIndexExtractorPrompt(JSON.stringify(toc), content),
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
  return extractJson(reply) || [];
}
1081
/**
 * Generates the initial TOC structure for the first document part.
 *
 * @throws {Error} when the model's generation was cut off before finishing.
 * @returns {Promise<object[]>} parsed TOC items ([] on unparsable JSON)
 */
async function generateTocInit(part, options) {
  const { content, finishReason } = await chatGPTWithFinishReason({
    model: options.model,
    prompt: generateTocInitPrompt(part),
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
  if (finishReason !== "finished") {
    throw new Error(`Generation incomplete: ${finishReason}`);
  }
  return extractJson(content) || [];
}
1095
/**
 * Extends an existing TOC structure with items found in the next document
 * part. Returns only the additional items.
 *
 * @throws {Error} when the model's generation was cut off before finishing.
 */
async function generateTocContinue(tocContent, part, options) {
  const { content, finishReason } = await chatGPTWithFinishReason({
    model: options.model,
    prompt: generateTocContinuePrompt(part, JSON.stringify(tocContent, null, 2)),
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
  if (finishReason !== "finished") {
    throw new Error(`Generation incomplete: ${finishReason}`);
  }
  return extractJson(content) || [];
}
1109
/**
 * Asks the LLM to fill physical_index values into `structure` using the
 * tagged page text in `part`.
 *
 * Fix: the original iterated the parsed reply with for...of unconditionally,
 * so a reply that parsed to a non-array value (extractJson can yield a plain
 * object) threw a TypeError. We now fall back to the unmodified `structure`
 * unless the reply is an array.
 *
 * The `start` field is stripped from each returned item (presumably a
 * transient helper from the prompt format — confirm against the prompt).
 */
async function addPageNumberToToc(part, structure, options) {
  const prompt = addPageNumberToTocPrompt(part, JSON.stringify(structure, null, 2));
  const response = await chatGPT({
    model: options.model,
    prompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
  const json = extractJson(response);
  if (!Array.isArray(json)) return structure;
  for (const item of json) {
    delete item.start;
  }
  return json;
}
1124
/**
 * Verifies that `item.title` actually appears on the page the TOC claims
 * (item.physicalIndex), by showing that page's text to the LLM.
 * Items without a physicalIndex are immediately answered "no".
 *
 * @param item       TOC entry carrying listIndex/title/physicalIndex.
 * @param pages      0-based page array; physicalIndex is startIndex-based,
 *                   hence the pages[pageNumber - startIndex] lookup.
 * @param startIndex physical index of pages[0].
 * @returns {{listIndex, answer: "yes"|"no", title, pageNumber}}
 */
async function checkTitleAppearance(item, pages, startIndex, options) {
  const title = item.title;
  if (!item.physicalIndex) {
    return { listIndex: item.listIndex, answer: "no", title, pageNumber: void 0 };
  }
  const pageNumber = item.physicalIndex;
  // Out-of-range indices degrade to an empty page, which the LLM answers "no" to.
  const pageText = pages[pageNumber - startIndex]?.text || "";
  const prompt = checkTitleAppearancePrompt(title, pageText);
  const response = await chatGPT({
    model: options.model,
    prompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
  const json = extractJson(response);
  const answer = json?.answer === "yes" ? "yes" : "no";
  return { listIndex: item.listIndex, answer, title, pageNumber };
}
1142
/**
 * Asks the LLM whether `title` appears at the very beginning of `pageText`.
 *
 * @returns {Promise<"yes"|"no">}
 */
async function checkTitleAppearanceInStart(title, pageText, options) {
  const reply = await chatGPT({
    model: options.model,
    prompt: checkTitleStartAtBeginningPrompt(title, pageText),
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
  if (extractJson(reply)?.start_begin === "yes") {
    return "yes";
  }
  return "no";
}
1153
/**
 * For every TOC item, checks whether its title appears at the beginning of
 * its target page, and annotates the item with `appearStart: "yes"|"no"`.
 *
 * Fix: despite its name, the original awaited each LLM call sequentially in
 * a loop. The checks are independent, so they now run in parallel via
 * Promise.all; result order still matches the input order.
 *
 * NOTE(review): page lookup is pages[item.physicalIndex - 1], i.e. it
 * assumes physical indices are 1-based offsets into `pages` — confirm this
 * matches callers that use a non-1 startIndex.
 */
async function checkTitleAppearanceInStartConcurrent(structure, pages, options) {
  return Promise.all(
    structure.map(async (item) => {
      if (!item.physicalIndex) {
        return { ...item, appearStart: "no" };
      }
      const pageText = pages[item.physicalIndex - 1]?.text || "";
      const appearStart = await checkTitleAppearanceInStart(item.title, pageText, options);
      return { ...item, appearStart };
    })
  );
}
1166
/**
 * Locates the table of contents within the leading pages of the document.
 *
 * Strategy: find an initial run of TOC pages. If that TOC carries printed
 * page numbers, return it immediately. Otherwise keep scanning (still
 * bounded by options.tocCheckPageNum) for a later TOC section that does
 * carry page numbers, and fall back to the first TOC found — flagged
 * pageIndexGivenInToc: "no" — when none does.
 *
 * @returns {{tocContent: string|null, tocPageList: number[], pageIndexGivenInToc: "yes"|"no"}}
 */
async function checkToc(pages, options) {
  const tocPageList = await findTocPages(0, pages, options);
  if (tocPageList.length === 0) {
    // No TOC pages detected at all.
    return {
      tocContent: null,
      tocPageList: [],
      pageIndexGivenInToc: "no"
    };
  }
  const tocResult = await tocExtractor(pages, tocPageList, options);
  if (tocResult.pageIndexGivenInToc === "yes") {
    return {
      tocContent: tocResult.tocContent,
      tocPageList,
      pageIndexGivenInToc: "yes"
    };
  }
  // First TOC lacks page numbers: scan onward for a numbered TOC section.
  const lastTocPage = tocPageList[tocPageList.length - 1];
  let currentStartIndex = lastTocPage !== void 0 ? lastTocPage + 1 : 0;
  while (currentStartIndex < pages.length && currentStartIndex < options.tocCheckPageNum) {
    const additionalTocPages = await findTocPages(currentStartIndex, pages, options);
    if (additionalTocPages.length === 0) {
      break;
    }
    const additionalTocResult = await tocExtractor(pages, additionalTocPages, options);
    if (additionalTocResult.pageIndexGivenInToc === "yes") {
      return {
        tocContent: additionalTocResult.tocContent,
        tocPageList: additionalTocPages,
        pageIndexGivenInToc: "yes"
      };
    }
    const lastAdditionalPage = additionalTocPages[additionalTocPages.length - 1];
    currentStartIndex = lastAdditionalPage !== void 0 ? lastAdditionalPage + 1 : pages.length;
  }
  // Fall back to the first (un-numbered) TOC.
  return {
    tocContent: tocResult.tocContent,
    tocPageList,
    pageIndexGivenInToc: "no"
  };
}
1207
/**
 * Asks the LLM to locate the start page of `sectionTitle` inside `content`
 * (a range of <physical_index_N>-tagged pages).
 *
 * @returns {Promise<number|null>} numeric physical index, or null when the
 *          reply has no parsable physical_index.
 */
async function singleTocItemIndexFixer(sectionTitle, content, options) {
  const reply = await chatGPT({
    model: options.model,
    prompt: singleTocItemIndexFixerPrompt(sectionTitle, content),
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
  const parsed = extractJson(reply);
  if (!parsed?.physical_index) {
    return null;
  }
  const value = convertPhysicalIndexToInt(parsed.physical_index);
  return typeof value === "number" ? value : null;
}
1220
+
1221
+ // src/tree.ts
1222
/**
 * Splits per-page text chunks into groups of roughly `maxTokens` tokens,
 * carrying the last `overlapPage` pages into the next group for context.
 *
 * Fix: the original pushed an empty group when the very first page alone
 * exceeded the per-part budget (currentSubset was still empty at the flush
 * point); empty groups are now skipped.
 *
 * @param {string[]} pageContents - one string per page
 * @param {number[]} tokenLengths - token count per page (parallel array)
 * @param {number} [maxTokens=2e4] - hard budget per group
 * @param {number} [overlapPage=1] - pages repeated at each group boundary
 * @returns {string[]} concatenated page groups
 */
function pageListToGroupText(pageContents, tokenLengths, maxTokens = 2e4, overlapPage = 1) {
  const numTokens = tokenLengths.reduce((a, b) => a + b, 0);
  if (numTokens <= maxTokens) {
    return [pageContents.join("")];
  }
  const subsets = [];
  let currentSubset = [];
  let currentTokenCount = 0;
  // Aim for evenly sized parts: average of the ideal split and the hard cap.
  const expectedPartsNum = Math.ceil(numTokens / maxTokens);
  const averageTokensPerPart = Math.ceil(
    (numTokens / expectedPartsNum + maxTokens) / 2
  );
  for (let i = 0; i < pageContents.length; i++) {
    const pageContent = pageContents[i] || "";
    const pageTokens = tokenLengths[i] || 0;
    if (currentTokenCount + pageTokens > averageTokensPerPart) {
      if (currentSubset.length > 0) {
        subsets.push(currentSubset.join(""));
      }
      // Seed the next group with the trailing overlap pages.
      const overlapStart = Math.max(i - overlapPage, 0);
      currentSubset = pageContents.slice(overlapStart, i);
      currentTokenCount = tokenLengths.slice(overlapStart, i).reduce((a, b) => a + b, 0);
    }
    currentSubset.push(pageContent);
    currentTokenCount += pageTokens;
  }
  if (currentSubset.length > 0) {
    subsets.push(currentSubset.join(""));
  }
  return subsets;
}
1251
/**
 * Joins the printed-page TOC (`tocPage`) with the physical-index TOC
 * (`tocPhysicalIndex`) on exact title equality, keeping only entries whose
 * physical index is at or after `startPageIndex`. Duplicate titles yield one
 * pair per matching combination, same as the original nested scan.
 *
 * @returns {{title, page, physicalIndex}[]}
 */
function extractMatchingPagePairs(tocPage, tocPhysicalIndex, startPageIndex) {
  const pairs = [];
  for (const phyItem of tocPhysicalIndex) {
    const { title, physicalIndex } = phyItem;
    if (physicalIndex === void 0 || physicalIndex < startPageIndex) {
      continue;
    }
    for (const pageItem of tocPage) {
      if (pageItem.title === title) {
        pairs.push({ title, page: pageItem.page || 0, physicalIndex });
      }
    }
  }
  return pairs;
}
1269
/**
 * Returns the most common difference (physicalIndex - page) across the given
 * pairs — the document's printed-page-to-physical-page offset — or null when
 * there are no pairs. Ties resolve to the difference encountered first.
 */
function calculatePageOffset(pairs) {
  if (pairs.length === 0) {
    return null;
  }
  const counts = new Map();
  for (const { physicalIndex, page } of pairs) {
    const diff = physicalIndex - page;
    counts.set(diff, (counts.get(diff) || 0) + 1);
  }
  let best = 0;
  let bestCount = 0;
  for (const [diff, count] of counts) {
    if (count > bestCount) {
      bestCount = count;
      best = diff;
    }
  }
  return best;
}
1290
/**
 * Mutates each TOC item in place: replaces a numeric `page` field with
 * `physicalIndex = page + offset`. Items without a numeric page are left
 * untouched. Returns the same (mutated) array.
 */
function addPageOffsetToTocJson(data, offset) {
  for (const item of data) {
    if (typeof item.page !== "number") {
      continue;
    }
    item.physicalIndex = item.page + offset;
    delete item.page;
  }
  return data;
}
1299
/**
 * Builds a TOC purely from the document body: tags every page with
 * <physical_index_N> markers, splits the pages into token-budgeted groups,
 * then asks the LLM to create (and group-by-group extend) the structure.
 */
async function processNoToc(pages, startIndex, options) {
  const pageContents = [];
  const tokenLengths = [];
  for (let offset = 0; offset < pages.length; offset++) {
    const physicalIndex = startIndex + offset;
    const pageText = `<physical_index_${physicalIndex}>
${pages[offset]?.text || ""}
<physical_index_${physicalIndex}>

`;
    pageContents.push(pageText);
    tokenLengths.push(countTokens(pageText));
  }
  const groupTexts = pageListToGroupText(pageContents, tokenLengths);
  const toc = await generateTocInit(groupTexts[0] || "", options);
  for (let i = 1; i < groupTexts.length; i++) {
    const additional = await generateTocContinue(toc, groupTexts[i] || "", options);
    toc.push(...additional);
  }
  return toc;
}
1319
/**
 * Handles a TOC without printed page numbers: transforms the raw TOC text to
 * structured items, then asks the LLM — one token-budgeted page group at a
 * time — to attach physical indices using <physical_index_N>-tagged pages.
 */
async function processTocNoPageNumbers(tocContent, pages, startIndex, options) {
  const tocItems = await tocTransformer(tocContent, options);
  const pageContents = [];
  const tokenLengths = [];
  for (let offset = 0; offset < pages.length; offset++) {
    const physicalIndex = startIndex + offset;
    const pageText = `<physical_index_${physicalIndex}>
${pages[offset]?.text || ""}
<physical_index_${physicalIndex}>

`;
    pageContents.push(pageText);
    tokenLengths.push(countTokens(pageText));
  }
  let tocWithPageNumber = [...tocItems];
  for (const groupText of pageListToGroupText(pageContents, tokenLengths)) {
    tocWithPageNumber = await addPageNumberToToc(groupText, tocWithPageNumber, options);
  }
  return tocWithPageNumber;
}
1339
/**
 * Handles a TOC that carries printed page numbers.
 *
 * Pipeline: transform the raw TOC text to JSON; ask the LLM to locate
 * physical indices for the entries appearing in the pages right after the
 * TOC; derive the most common printed-page -> physical-page offset from
 * entries found both ways; apply that offset to the whole TOC.
 *
 * NOTE(review): pages here are tagged <physical_index_${pageIndex + 1}>
 * (0-based loop variable converted to 1-based), while processNoToc tags
 * pages with its startIndex-based counter directly — confirm the two
 * numbering schemes agree when startIndex !== 1.
 */
async function processTocWithPageNumbers(tocContent, tocPageList, pages, options) {
  const tocWithPageNumber = await tocTransformer(tocContent, options);
  // Copy without printed page numbers for the physical-index extraction pass.
  const tocNoPageNumber = tocWithPageNumber.map((item) => {
    const newItem = { ...item };
    delete newItem.page;
    return newItem;
  });
  const startPageIndex = (tocPageList[tocPageList.length - 1] || 0) + 1;
  let mainContent = "";
  for (let pageIndex = startPageIndex; pageIndex < Math.min(startPageIndex + options.tocCheckPageNum, pages.length); pageIndex++) {
    mainContent += `<physical_index_${pageIndex + 1}>
${pages[pageIndex]?.text || ""}
<physical_index_${pageIndex + 1}>

`;
  }
  const tocWithPhysicalIndex = await tocIndexExtractor(tocNoPageNumber, mainContent, options);
  const matchingPairs = extractMatchingPagePairs(
    tocWithPageNumber,
    tocWithPhysicalIndex,
    startPageIndex
  );
  const offset = calculatePageOffset(matchingPairs);
  if (offset !== null) {
    addPageOffsetToTocJson(tocWithPageNumber, offset);
  }
  return tocWithPageNumber;
}
1367
/**
 * Recursively attaches the concatenated page text to each tree node (and to
 * every node of an array of trees). A node's text spans pages
 * [startIndex - 1, endIndex) of the 0-based `pages` array; nodes without
 * both indices are left untouched. Mutates the nodes in place.
 */
function addNodeText(node, pages) {
  if (Array.isArray(node)) {
    for (const child of node) {
      addNodeText(child, pages);
    }
    return;
  }
  const first = node.startIndex;
  const last = node.endIndex;
  if (first !== void 0 && last !== void 0) {
    const pieces = [];
    const stop = Math.min(last, pages.length);
    for (let i = first - 1; i < stop; i++) {
      pieces.push(pages[i]?.text || "");
    }
    node.text = pieces.join("");
  }
  if (node.nodes) {
    addNodeText(node.nodes, pages);
  }
}
1387
/**
 * Generates an LLM summary for one tree node's text. Nodes without text get
 * an empty string without any API call.
 */
async function generateNodeSummary(node, options) {
  if (!node.text) {
    return "";
  }
  return chatGPT({
    model: options.model,
    prompt: generateNodeSummaryPrompt(node.text),
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
}
1397
/**
 * Generates summaries for every node in the tree, five nodes at a time
 * (batches run in parallel, batches themselves are sequential to bound
 * concurrent API calls). Mutates the nodes in place via their shared
 * references from structureToList.
 */
async function generateSummariesForStructure(structure, options) {
  const nodes = structureToList(structure);
  const batchSize = 5;
  for (let start = 0; start < nodes.length; start += batchSize) {
    const batch = nodes.slice(start, start + batchSize);
    const summaries = await Promise.all(
      batch.map((node) => generateNodeSummary(node, options))
    );
    summaries.forEach((summary, idx) => {
      batch[idx].summary = summary;
    });
  }
}
1410
/**
 * Generates a one-sentence LLM description of the whole document from a
 * cleaned copy of its structure.
 */
async function generateDocDescription(structure, options) {
  const cleaned = createCleanStructureForDescription(structure);
  return chatGPT({
    model: options.model,
    prompt: generateDocDescriptionPrompt(JSON.stringify(cleaned)),
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
}
1420
/**
 * Validates each TOC entry: asks the LLM whether the entry's title actually
 * appears on the page its physicalIndex points to. Entries are checked
 * sequentially; incorrect entries keep their original array position
 * (listIndex) so fixIncorrectToc can bracket them between trusted neighbors.
 *
 * @returns {{correct: object[], incorrect: {listIndex, title, physicalIndex}[]}}
 */
async function verifyToc(pages, listResult, startIndex, options) {
  const correct = [];
  const incorrect = [];
  for (let i = 0; i < listResult.length; i++) {
    const item = listResult[i];
    if (!item) continue;
    const itemWithIndex = { ...item, listIndex: i };
    const result = await checkTitleAppearance(itemWithIndex, pages, startIndex, options);
    if (result.answer === "yes") {
      correct.push(item);
    } else {
      incorrect.push({
        listIndex: i,
        title: item.title,
        physicalIndex: item.physicalIndex
      });
    }
  }
  return { correct, incorrect };
}
1440
/**
 * Attempts to repair TOC entries whose physicalIndex failed verification.
 *
 * For each bad entry, the search window is bracketed by the nearest trusted
 * (verified) entries before and after it; every page in that window is
 * tagged with <physical_index_N> markers and handed to the LLM to re-locate
 * the section's start page. A repaired entry is re-verified; entries that
 * still fail (or could not be repaired) are reported in `stillIncorrect`.
 *
 * Mutates the item objects inside the returned `fixed` array — they are the
 * same references held by `tocWithPageNumber`.
 */
async function fixIncorrectToc(tocWithPageNumber, pages, incorrectResults, startIndex, options) {
  const fixed = [...tocWithPageNumber];
  const stillIncorrect = [];
  const incorrectIndices = new Set(incorrectResults.map((r) => r.listIndex));
  // Last physical index covered by `pages` (physical indices are startIndex-based).
  const endIndex = pages.length + startIndex - 1;
  for (const incorrectItem of incorrectResults) {
    const { listIndex } = incorrectItem;
    // Nearest trusted entry before this one (default: just before the document).
    let prevCorrect = startIndex - 1;
    for (let i = listIndex - 1; i >= 0; i--) {
      if (!incorrectIndices.has(i)) {
        const item = tocWithPageNumber[i];
        if (item?.physicalIndex !== void 0) {
          prevCorrect = item.physicalIndex;
          break;
        }
      }
    }
    // Nearest trusted entry after this one (default: end of document).
    let nextCorrect = endIndex;
    for (let i = listIndex + 1; i < tocWithPageNumber.length; i++) {
      if (!incorrectIndices.has(i)) {
        const item = tocWithPageNumber[i];
        if (item?.physicalIndex !== void 0) {
          nextCorrect = item.physicalIndex;
          break;
        }
      }
    }
    // Collect the tagged text of every page in [prevCorrect, nextCorrect].
    const pageContents = [];
    for (let pageIndex = prevCorrect; pageIndex <= nextCorrect; pageIndex++) {
      const idx = pageIndex - startIndex;
      if (idx >= 0 && idx < pages.length) {
        pageContents.push(
          `<physical_index_${pageIndex}>
${pages[idx]?.text || ""}
<physical_index_${pageIndex}>

`
        );
      }
    }
    const contentRange = pageContents.join("");
    const physicalIndexInt = await singleTocItemIndexFixer(
      incorrectItem.title,
      contentRange,
      options
    );
    if (physicalIndexInt !== null && fixed[listIndex]) {
      fixed[listIndex].physicalIndex = physicalIndexInt;
      // Re-verify the repaired entry before trusting it.
      const checkItem = { ...fixed[listIndex], listIndex };
      const checkResult = await checkTitleAppearance(checkItem, pages, startIndex, options);
      if (checkResult.answer !== "yes") {
        stillIncorrect.push({
          listIndex,
          title: incorrectItem.title,
          physicalIndex: physicalIndexInt
        });
      }
    } else {
      stillIncorrect.push(incorrectItem);
    }
  }
  return { fixed, stillIncorrect };
}
1503
/**
 * Converts flat TOC items into the final nested tree: inserts a preface node
 * when needed, computes start/end ranges up to `endPhysicalIndex`, assigns
 * node IDs when options.addNodeId is set, and normalizes key order.
 */
function buildTree(tocItems, endPhysicalIndex, options) {
  const tree = postProcessing(addPrefaceIfNeeded(tocItems), endPhysicalIndex);
  if (options.addNodeId) {
    writeNodeId(tree);
  }
  return formatStructure(tree, [
    "title",
    "nodeId",
    "startIndex",
    "endIndex",
    "summary",
    "text",
    "nodes"
  ]);
}
1512
+
1513
+ // src/pageindex.ts
1514
// Baseline configuration for PageIndex; the constructor merges user-supplied
// options on top of these values.
var DEFAULT_OPTIONS = {
  model: "gpt-4o-2024-11-20",
  // Number of leading pages scanned when looking for a table of contents.
  tocCheckPageNum: 20,
  maxPageNumEachNode: 10,
  maxTokenNumEachNode: 2e4,
  addNodeId: true,
  addNodeSummary: true,
  addDocDescription: false,
  addNodeText: false,
  // OCR defaults
  extractionMode: "text",
  ocrModel: "mlx-community/GLM-OCR-bf16",
  ocrPromptType: "text",
  imageDpi: 150,
  imageFormat: "png",
  // Maximum pages OCR'd in parallel.
  ocrConcurrency: 3
};
1531
/**
 * Builds a hierarchical tree index for a PDF document using an
 * OpenAI-compatible chat model, optionally running OCR for scanned PDFs.
 *
 * NOTE(review): numeric options are merged with `||`, so an explicit 0
 * (e.g. tocCheckPageNum: 0) silently falls back to the default — confirm
 * whether `??` was intended.
 */
var PageIndex = class {
  options;
  constructor(options = {}) {
    this.options = {
      model: options.model || DEFAULT_OPTIONS.model,
      tocCheckPageNum: options.tocCheckPageNum || DEFAULT_OPTIONS.tocCheckPageNum,
      maxPageNumEachNode: options.maxPageNumEachNode || DEFAULT_OPTIONS.maxPageNumEachNode,
      maxTokenNumEachNode: options.maxTokenNumEachNode || DEFAULT_OPTIONS.maxTokenNumEachNode,
      addNodeId: options.addNodeId ?? DEFAULT_OPTIONS.addNodeId,
      addNodeSummary: options.addNodeSummary ?? DEFAULT_OPTIONS.addNodeSummary,
      addDocDescription: options.addDocDescription ?? DEFAULT_OPTIONS.addDocDescription,
      addNodeText: options.addNodeText ?? DEFAULT_OPTIONS.addNodeText,
      apiKey: options.apiKey,
      baseUrl: options.baseUrl,
      // OCR options
      extractionMode: options.extractionMode || DEFAULT_OPTIONS.extractionMode,
      ocrModel: options.ocrModel || DEFAULT_OPTIONS.ocrModel,
      ocrPromptType: options.ocrPromptType || DEFAULT_OPTIONS.ocrPromptType,
      imageDpi: options.imageDpi || DEFAULT_OPTIONS.imageDpi,
      imageFormat: options.imageFormat || DEFAULT_OPTIONS.imageFormat,
      ocrConcurrency: options.ocrConcurrency || DEFAULT_OPTIONS.ocrConcurrency
    };
  }
  /**
   * Set base URL for OpenAI-compatible API (e.g., LM Studio)
   */
  setBaseUrl(baseUrl) {
    this.options.baseUrl = baseUrl;
    return this;
  }
  /**
   * Use LM Studio configuration
   */
  useLMStudio() {
    this.options.baseUrl = "http://localhost:1234/v1";
    this.options.apiKey = "lm-studio";
    return this;
  }
  /**
   * Use Ollama configuration
   */
  useOllama() {
    this.options.baseUrl = "http://localhost:11434/v1";
    this.options.apiKey = "ollama";
    return this;
  }
  /**
   * Enable OCR mode for scanned PDFs
   */
  useOcrMode(ocrModel) {
    this.options.extractionMode = "ocr";
    if (ocrModel) {
      this.options.ocrModel = ocrModel;
    }
    return this;
  }
  /**
   * Process a PDF file and build tree index
   *
   * @param input  file path (string) or an in-memory PDF; when a path is
   *               given, the document name is derived from the filename.
   */
  async fromPdf(input) {
    let pages;
    let pdfName;
    if (this.options.extractionMode === "ocr") {
      console.log("[OCR Mode] Processing PDF with OCR...");
      const ocrOptions = {
        ocrModel: this.options.ocrModel,
        apiKey: this.options.apiKey,
        baseUrl: this.options.baseUrl,
        imageFormat: this.options.imageFormat,
        imageDpi: this.options.imageDpi,
        ocrPromptType: this.options.ocrPromptType,
        concurrency: this.options.ocrConcurrency
      };
      const result = await parsePdfWithOcr(input, ocrOptions);
      pages = result.pages;
      // OCR path has no embedded metadata title; non-path input falls back
      // to "Untitled".
      pdfName = typeof input === "string" ? getPdfName(input) : "Untitled";
    } else {
      const pdfInfo = await parsePdf(input);
      pages = pdfInfo.pages;
      pdfName = typeof input === "string" ? getPdfName(input) : pdfInfo.title;
    }
    return this.processPdfPages(pages, pdfName);
  }
  /**
   * Process PDF pages directly
   *
   * Pipeline: detect/extract the TOC, build TOC items via one of three
   * strategies (no TOC / TOC without page numbers / TOC with page numbers),
   * verify and repair item page indices, then assemble the tree and optional
   * summaries/description.
   */
  async processPdfPages(pages, docName) {
    // Physical indices are 1-based throughout this pipeline.
    const startIndex = 1;
    const endPhysicalIndex = pages.length;
    const tocResult = await checkToc(pages, this.options);
    console.log(
      `TOC found: ${tocResult.tocContent !== null}, Pages: ${tocResult.tocPageList.length}, Has page numbers: ${tocResult.pageIndexGivenInToc}`
    );
    let tocItems;
    if (tocResult.tocContent === null) {
      console.log("Generating structure from document content...");
      tocItems = await processNoToc(pages, startIndex, this.options);
    } else if (tocResult.pageIndexGivenInToc === "no") {
      console.log("Processing TOC without page numbers...");
      tocItems = await processTocNoPageNumbers(
        tocResult.tocContent,
        pages,
        startIndex,
        this.options
      );
    } else {
      console.log("Processing TOC with page numbers...");
      tocItems = await processTocWithPageNumbers(
        tocResult.tocContent,
        tocResult.tocPageList,
        pages,
        this.options
      );
    }
    tocItems = convertPhysicalIndexToInt(tocItems);
    tocItems = await checkTitleAppearanceInStartConcurrent(tocItems, pages, this.options);
    console.log("Verifying TOC...");
    const { incorrect } = await verifyToc(pages, tocItems, startIndex, this.options);
    if (incorrect.length > 0) {
      console.log(`Fixing ${incorrect.length} incorrect TOC items...`);
      const { fixed } = await fixIncorrectToc(
        tocItems,
        pages,
        incorrect,
        startIndex,
        this.options
      );
      tocItems = fixed;
    }
    const tree = buildTree(tocItems, endPhysicalIndex, this.options);
    // Node text is needed transiently for summaries even when the caller
    // did not ask for it in the output.
    if (this.options.addNodeText || this.options.addNodeSummary) {
      addNodeText(tree, pages);
    }
    if (this.options.addNodeSummary) {
      console.log("Generating summaries...");
      await generateSummariesForStructure(tree, this.options);
    }
    let docDescription;
    if (this.options.addDocDescription) {
      console.log("Generating document description...");
      docDescription = await generateDocDescription(tree, this.options);
    }
    let finalStructure = tree;
    if (!this.options.addNodeText) {
      finalStructure = removeFields(tree, ["text"]);
    }
    return {
      docName,
      docDescription,
      structure: finalStructure
    };
  }
};
1684
+
1685
+ // src/markdown.ts
1686
+ import * as path2 from "path";
1687
+ import * as fs3 from "fs/promises";
1688
// Defaults for the markdown indexing pipeline; shares the common fields with
// DEFAULT_OPTIONS and adds tree-thinning knobs.
var DEFAULT_MARKDOWN_OPTIONS = {
  model: "gpt-4o-2024-11-20",
  tocCheckPageNum: 20,
  maxPageNumEachNode: 10,
  maxTokenNumEachNode: 2e4,
  addNodeId: true,
  addNodeSummary: true,
  addDocDescription: false,
  addNodeText: false,
  // When enabled, small nodes are merged into parents during thinning
  // (see treeThinningForIndex); threshold is in tokens.
  thinning: false,
  thinningThreshold: 5e3,
  // Presumably the minimum token count for generating a node summary —
  // TODO confirm against the consumer of this field (outside this chunk).
  summaryTokenThreshold: 200
};
1701
/**
 * Scans markdown text for ATX headers (#, ##, ... up to ######) outside of
 * fenced code blocks.
 *
 * @param {string} markdownContent
 * @returns {{nodeList: {nodeTitle: string, lineNum: number}[], lines: string[]}}
 *          header titles with their 1-based line numbers, plus the split lines
 */
function extractNodesFromMarkdown(markdownContent) {
  const headerPattern = /^(#{1,6})\s+(.+)$/;
  const codeBlockPattern = /^```/;
  const nodeList = [];
  const lines = markdownContent.split("\n");
  let inCodeBlock = false;
  lines.forEach((rawLine, index) => {
    const stripped = rawLine.trim();
    // A fence line toggles code-block state and is never itself a header.
    if (codeBlockPattern.test(stripped)) {
      inCodeBlock = !inCodeBlock;
      return;
    }
    if (!stripped || inCodeBlock) {
      return;
    }
    const match = stripped.match(headerPattern);
    if (match) {
      nodeList.push({ nodeTitle: match[2].trim(), lineNum: index + 1 });
    }
  });
  return { nodeList, lines };
}
1727
/**
 * Expands header locations into full nodes: records each header's level
 * (number of leading #) and attaches the text from the header line up to
 * (excluding) the next header. Entries whose line is missing or not a valid
 * header are skipped with a console warning.
 *
 * @returns {{title, lineNum, level, text}[]}
 */
function extractNodeTextContent(nodeList, markdownLines) {
  const allNodes = [];
  for (const node of nodeList) {
    const lineContent = markdownLines[node.lineNum - 1];
    if (!lineContent) continue;
    const headerMatch = lineContent.match(/^(#{1,6})/);
    if (!headerMatch) {
      console.warn(
        `Warning: Line ${node.lineNum} does not contain a valid header: '${lineContent}'`
      );
      continue;
    }
    allNodes.push({
      title: node.nodeTitle,
      lineNum: node.lineNum,
      level: headerMatch[1].length
    });
  }
  allNodes.forEach((node, i) => {
    const startLine = node.lineNum - 1;
    const endLine = i + 1 < allNodes.length ? allNodes[i + 1].lineNum - 1 : markdownLines.length;
    node.text = markdownLines.slice(startLine, endLine).join("\n").trim();
  });
  return allNodes;
}
1759
/**
 * Collect the indices of every descendant of the node at `parentIndex`:
 * the contiguous run of nodes after it whose level is strictly deeper.
 * Stops at the first node at the same or a shallower level.
 *
 * @param {number} parentIndex - Index of the parent in `nodeList`.
 * @param {number} parentLevel - Header level of the parent.
 * @param {Array<{level?: number}>} nodeList - Flat, document-ordered node list.
 * @returns {number[]} Indices of all descendants, in document order.
 */
function findAllChildren(parentIndex, parentLevel, nodeList) {
  const childrenIndices = [];
  let cursor = parentIndex + 1;
  // Missing levels are treated as 0, which always terminates the run.
  while (cursor < nodeList.length && (nodeList[cursor].level || 0) > parentLevel) {
    childrenIndices.push(cursor);
    cursor += 1;
  }
  return childrenIndices;
}
1770
/**
 * Return a copy of `nodeList` where each node carries `textTokenCount`:
 * the token estimate for its own text plus the text of all descendants.
 * The input list and its node objects are not mutated.
 *
 * @param {Array<{level?: number, text?: string}>} nodeList - Flat node list in document order.
 * @returns {Array<object>} Shallow-copied nodes annotated with `textTokenCount`.
 */
function updateNodeListWithTextTokenCount(nodeList) {
  // Shallow-copy every node so callers keep their originals untouched.
  const annotated = nodeList.map((node) => ({ ...node }));
  // Walk bottom-up; child indices come from findAllChildren on the copy.
  for (let idx = annotated.length - 1; idx >= 0; idx--) {
    const node = annotated[idx];
    const pieces = [node.text || ""];
    for (const childIdx of findAllChildren(idx, node.level || 0, annotated)) {
      const childText = annotated[childIdx]?.text || "";
      if (childText) {
        pieces.push(childText);
      }
    }
    node.textTokenCount = countTokens(pieces.join("\n"));
  }
  return annotated;
}
1788
/**
 * Thin the flat node list: any node whose aggregated token count
 * (`textTokenCount`, computed by updateNodeListWithTextTokenCount) falls
 * below `minNodeToken` absorbs all of its descendants' text and the
 * descendants are removed from the list. Operates on shallow copies; the
 * input list is not mutated.
 *
 * @param {Array<{level?: number, text?: string, textTokenCount?: number}>} nodeList
 *   Flat node list in document order, annotated with token counts.
 * @param {number} minNodeToken - Threshold below which a subtree is merged.
 * @returns {Array<object>} New list with merged parents and children removed.
 */
function treeThinningForIndex(nodeList, minNodeToken) {
  const resultList = nodeList.map((n) => ({ ...n }));
  const nodesToRemove = /* @__PURE__ */ new Set();
  // Iterate bottom-up so deeper subtrees are merged before their ancestors
  // are considered; indices stay valid because removal is deferred.
  for (let i = resultList.length - 1; i >= 0; i--) {
    // Skip nodes already absorbed into a (deeper) parent.
    if (nodesToRemove.has(i)) {
      continue;
    }
    const currentNode = resultList[i];
    const currentLevel = currentNode.level || 0;
    const totalTokens = currentNode.textTokenCount || 0;
    if (totalTokens < minNodeToken) {
      const childrenIndices = findAllChildren(i, currentLevel, resultList);
      const childrenTexts = [];
      // Visit children in document order so merged text reads naturally.
      for (const childIndex of childrenIndices.sort((a, b) => a - b)) {
        if (!nodesToRemove.has(childIndex)) {
          const childText = resultList[childIndex]?.text || "";
          if (childText.trim()) {
            childrenTexts.push(childText);
          }
          // Mark the child for removal even when its text is blank.
          nodesToRemove.add(childIndex);
        }
      }
      if (childrenTexts.length > 0) {
        let parentText = currentNode.text || "";
        let mergedText = parentText;
        for (const childText of childrenTexts) {
          // Separate sections with a blank line unless the accumulated
          // text is empty or already ends in a newline.
          if (mergedText && !mergedText.endsWith("\n")) {
            mergedText += "\n\n";
          }
          mergedText += childText;
        }
        resultList[i].text = mergedText;
        resultList[i].textTokenCount = countTokens(mergedText);
      }
    }
  }
  // Splice from highest index to lowest so earlier removals do not shift
  // the positions of later ones.
  const indicesToRemove = Array.from(nodesToRemove).sort((a, b) => b - a);
  for (const index of indicesToRemove) {
    resultList.splice(index, 1);
  }
  return resultList;
}
1830
/**
 * Convert the flat, document-ordered node list into a forest of tree nodes.
 * Parentage follows header depth: each node becomes a child of the nearest
 * preceding node with a strictly shallower level. Node IDs are assigned
 * sequentially in document order as zero-padded 4-digit strings.
 *
 * @param {Array<{title: string, level?: number, text?: string, lineNum?: number}>} nodeList
 * @returns {Array<{title: string, nodeId: string, text?: string, lineNum?: number, nodes: Array}>}
 *   Root-level tree nodes (may be more than one).
 */
function buildTreeFromNodes(nodeList) {
  const rootNodes = [];
  if (nodeList.length === 0) {
    return rootNodes;
  }
  // Stack of [node, level] pairs describing the current ancestor chain.
  const ancestors = [];
  nodeList.forEach((source, index) => {
    const level = source.level || 1;
    const treeNode = {
      title: source.title,
      nodeId: String(index + 1).padStart(4, "0"),
      text: source.text,
      lineNum: source.lineNum,
      nodes: []
    };
    // Unwind to the nearest ancestor that is strictly shallower.
    while (ancestors.length > 0 && ancestors[ancestors.length - 1][1] >= level) {
      ancestors.pop();
    }
    if (ancestors.length > 0) {
      ancestors[ancestors.length - 1][0].nodes.push(treeNode);
    } else {
      rootNodes.push(treeNode);
    }
    ancestors.push([treeNode, level]);
  });
  return rootNodes;
}
1860
/**
 * Produce a summary for one node. Nodes whose text estimate falls below
 * `summaryTokenThreshold` are returned verbatim; longer nodes are sent to
 * the LLM via `chatGPT` with a generated summary prompt.
 *
 * @param {{text?: string}} node - Node whose text may be summarized.
 * @param {number} summaryTokenThreshold - Minimum token count that triggers an LLM call.
 * @param {{model: string, apiKey?: string, baseUrl?: string}} options - LLM connection settings.
 * @returns {Promise<string>} The node text itself, or the model's summary.
 */
async function getNodeSummary(node, summaryTokenThreshold, options) {
  const nodeText = node.text || "";
  if (countTokens(nodeText) < summaryTokenThreshold) {
    return nodeText;
  }
  return chatGPT({
    model: options.model,
    prompt: generateNodeSummaryPrompt(nodeText),
    apiKey: options.apiKey,
    baseUrl: options.baseUrl
  });
}
1874
/**
 * Generate summaries for every node in `structure`, mutating the nodes in
 * place: leaves receive `summary`, internal nodes receive `prefixSummary`.
 * Nodes are processed in batches of 5 to bound concurrent LLM requests.
 *
 * @param {Array<object>} structure - Tree structure (flattened via structureToList).
 * @param {number} summaryTokenThreshold - Passed through to getNodeSummary.
 * @param {{model: string, apiKey?: string, baseUrl?: string}} options - LLM connection settings.
 * @returns {Promise<void>}
 */
async function generateSummariesForStructureMd(structure, summaryTokenThreshold, options) {
  const nodes = structureToList(structure);
  const BATCH_SIZE = 5;
  for (let start = 0; start < nodes.length; start += BATCH_SIZE) {
    const batch = nodes.slice(start, start + BATCH_SIZE);
    // Summarize the whole batch in parallel, then attach results.
    const summaries = await Promise.all(
      batch.map((node) => getNodeSummary(node, summaryTokenThreshold, options))
    );
    batch.forEach((node, idx) => {
      const isLeaf = !node.nodes || node.nodes.length === 0;
      if (isLeaf) {
        node.summary = summaries[idx];
      } else {
        node.prefixSummary = summaries[idx];
      }
    });
  }
}
1894
/**
 * Ask the LLM for a whole-document description derived from a cleaned copy
 * of the tree structure (stripped down by createCleanStructureForDescription).
 *
 * @param {Array<object>} structure - Document tree structure.
 * @param {{model: string, apiKey?: string, baseUrl?: string}} options - LLM connection settings.
 * @returns {Promise<string>} The model-generated description.
 */
async function generateDocDescriptionMd(structure, options) {
  const cleanStructure = createCleanStructureForDescription(structure);
  const prompt = generateDocDescriptionPrompt(JSON.stringify(cleanStructure));
  const { model, apiKey, baseUrl } = options;
  return chatGPT({ model, prompt, apiKey, baseUrl });
}
1906
/**
 * Build a hierarchical tree structure from a Markdown file.
 *
 * Pipeline: read file -> extract headers -> slice section text ->
 * optional thinning -> build tree -> optional node IDs -> optional
 * summaries -> optional document description.
 *
 * Fixes over the previous version:
 *  - `addDocDescription` is now honored even when `addNodeSummary` is
 *    false (previously the description branch was nested inside the
 *    summary branch and silently skipped).
 *  - A caller-supplied `options.baseUrl` is forwarded to the LLM helpers
 *    instead of being hardcoded to undefined (backward-compatible: absent
 *    baseUrl still yields undefined).
 *
 * @param {string} mdPath - Path to the .md/.markdown file.
 * @param {object} [options] - Overrides merged over DEFAULT_MARKDOWN_OPTIONS
 *   (model, apiKey, baseUrl, addNodeId, addNodeSummary, addDocDescription,
 *   addNodeText, thinning, thinningThreshold, summaryTokenThreshold).
 * @returns {Promise<{docName: string, docDescription?: string, structure: Array<object>}>}
 */
async function mdToTree(mdPath, options = {}) {
  const opts = {
    ...DEFAULT_MARKDOWN_OPTIONS,
    ...options
  };
  const markdownContent = await fs3.readFile(mdPath, "utf-8");
  console.log("Extracting nodes from markdown...");
  const { nodeList, lines: markdownLines } = extractNodesFromMarkdown(markdownContent);
  console.log("Extracting text content from nodes...");
  let nodesWithContent = extractNodeTextContent(nodeList, markdownLines);
  if (opts.thinning) {
    // Thinning needs aggregated token counts before merging small subtrees.
    nodesWithContent = updateNodeListWithTextTokenCount(nodesWithContent);
    console.log("Thinning nodes...");
    nodesWithContent = treeThinningForIndex(nodesWithContent, opts.thinningThreshold);
  }
  console.log("Building tree from nodes...");
  let treeStructure = buildTreeFromNodes(nodesWithContent);
  if (opts.addNodeId) {
    writeNodeId(treeStructure);
  }
  console.log("Formatting tree structure...");
  const keyOrder = [
    "title",
    "nodeId",
    "summary",
    "prefixSummary",
    "text",
    "lineNum",
    "nodes"
  ];
  // Forward the caller's baseUrl (previously hardcoded to undefined).
  const llmOptions = {
    model: opts.model,
    apiKey: opts.apiKey,
    baseUrl: opts.baseUrl
  };
  if (opts.addNodeSummary) {
    treeStructure = formatStructure(treeStructure, keyOrder);
    console.log("Generating summaries for each node...");
    await generateSummariesForStructureMd(
      treeStructure,
      opts.summaryTokenThreshold,
      llmOptions
    );
  }
  if (!opts.addNodeText) {
    // Strip raw text from the output unless explicitly requested.
    const orderWithoutText = keyOrder.filter((k) => k !== "text");
    treeStructure = formatStructure(treeStructure, orderWithoutText);
  } else if (!opts.addNodeSummary) {
    // No summary pass ran, so the structure has not been formatted yet.
    treeStructure = formatStructure(treeStructure, keyOrder);
  }
  const docName = path2.basename(mdPath, path2.extname(mdPath));
  if (opts.addDocDescription) {
    console.log("Generating document description...");
    const docDescription = await generateDocDescriptionMd(treeStructure, llmOptions);
    return {
      docName,
      docDescription,
      structure: treeStructure
    };
  }
  return {
    docName,
    structure: treeStructure
  };
}
1974
+
1975
+ // src/cli.ts
1976
+ import * as path3 from "path";
1977
+ import * as fs4 from "fs";
1978
+ import * as fsp from "fs/promises";
1979
/**
 * Print CLI usage information to stdout.
 *
 * Covers input selection (--pdf / --md), model/backend options (--model,
 * --lmstudio, --ollama, --base-url), PDF segmentation options, OCR options
 * for scanned PDFs, markdown thinning/summary options, and output toggles.
 * The literal text below is user-facing output; keep option names in sync
 * with parseCliArgs.
 */
function printHelp() {
  console.log(`
bun-pageindex - Vectorless, reasoning-based RAG for document understanding

USAGE:
bun-pageindex --pdf <path> Process a PDF file
bun-pageindex --md <path> Process a Markdown file

OPTIONS:
--pdf <path> Path to PDF file
--md <path> Path to Markdown file
--output, -o <path> Output file path (default: ./results/<name>_structure.json)

MODEL OPTIONS:
--model <name> Model to use (default: gpt-4o-2024-11-20)
--lmstudio Use LM Studio (localhost:1234)
--ollama Use Ollama (localhost:11434)
--base-url <url> Custom OpenAI-compatible API URL

PDF OPTIONS:
--toc-check-pages <n> Pages to check for TOC (default: 20)
--max-pages-per-node <n> Max pages per node (default: 10)
--max-tokens-per-node <n> Max tokens per node (default: 20000)

OCR OPTIONS (for scanned PDFs):
--ocr Enable OCR mode for scanned PDFs
--ocr-model <name> OCR model (default: mlx-community/GLM-OCR-bf16)
--ocr-prompt-type <type> OCR prompt: text, formula, table (default: text)
--image-dpi <n> Image DPI for OCR (default: 150)

MARKDOWN OPTIONS:
--thinning Apply tree thinning
--thinning-threshold <n> Min tokens for thinning (default: 5000)
--summary-token-threshold <n> Token threshold for summaries (default: 200)

OUTPUT OPTIONS:
--add-node-id Add node IDs (default: true)
--no-node-id Don't add node IDs
--add-node-summary Add node summaries (default: true)
--no-node-summary Don't add node summaries
--add-doc-description Add document description
--add-node-text Include raw text in output

--help, -h Show this help message

EXAMPLES:
bun-pageindex --pdf document.pdf
bun-pageindex --md README.md --add-doc-description
bun-pageindex --pdf paper.pdf --lmstudio --model llama3
bun-pageindex --pdf report.pdf --base-url http://localhost:8080/v1
bun-pageindex --pdf scanned.pdf --ocr --lmstudio --model qwen/qwen3-vl-30b
`);
}
2032
/**
 * Parse process.argv into a normalized options object for main().
 *
 * Uses node:util parseArgs with string-typed numeric options (parsed to
 * int afterwards, radix 10). Negative flags (--no-node-id,
 * --no-node-summary) take precedence over their positive counterparts,
 * which default to true.
 *
 * @returns {object} Normalized CLI options: input paths (pdf/md), model
 *   and backend selection, PDF/OCR/markdown tuning values, and output
 *   toggles. All numeric fields are already parsed to numbers.
 */
function parseCliArgs() {
  const { values } = parseArgs({
    args: process.argv.slice(2),
    options: {
      pdf: { type: "string" },
      md: { type: "string" },
      model: { type: "string", default: "gpt-4o-2024-11-20" },
      "toc-check-pages": { type: "string", default: "20" },
      "max-pages-per-node": { type: "string", default: "10" },
      "max-tokens-per-node": { type: "string", default: "20000" },
      "add-node-id": { type: "boolean", default: true },
      "no-node-id": { type: "boolean", default: false },
      "add-node-summary": { type: "boolean", default: true },
      "no-node-summary": { type: "boolean", default: false },
      "add-doc-description": { type: "boolean", default: false },
      "add-node-text": { type: "boolean", default: false },
      thinning: { type: "boolean", default: false },
      "thinning-threshold": { type: "string", default: "5000" },
      "summary-token-threshold": { type: "string", default: "200" },
      output: { type: "string", short: "o" },
      lmstudio: { type: "boolean", default: false },
      ollama: { type: "boolean", default: false },
      "base-url": { type: "string" },
      // OCR options
      ocr: { type: "boolean", default: false },
      "ocr-model": { type: "string", default: "mlx-community/GLM-OCR-bf16" },
      "ocr-prompt-type": { type: "string", default: "text" },
      "image-dpi": { type: "string", default: "150" },
      help: { type: "boolean", short: "h", default: false }
    },
    allowPositionals: true
  });
  return {
    pdf: values.pdf,
    md: values.md,
    model: values.model || "gpt-4o-2024-11-20",
    tocCheckPages: parseInt(values["toc-check-pages"] || "20", 10),
    maxPagesPerNode: parseInt(values["max-pages-per-node"] || "10", 10),
    maxTokensPerNode: parseInt(values["max-tokens-per-node"] || "20000", 10),
    // The --no-* flag wins over the positive flag when both are present.
    addNodeId: values["no-node-id"] ? false : values["add-node-id"] ?? true,
    addNodeSummary: values["no-node-summary"] ? false : values["add-node-summary"] ?? true,
    addDocDescription: values["add-doc-description"] ?? false,
    addNodeText: values["add-node-text"] ?? false,
    thinning: values.thinning ?? false,
    thinningThreshold: parseInt(values["thinning-threshold"] || "5000", 10),
    summaryTokenThreshold: parseInt(values["summary-token-threshold"] || "200", 10),
    output: values.output,
    lmstudio: values.lmstudio ?? false,
    ollama: values.ollama ?? false,
    baseUrl: values["base-url"],
    // OCR options
    ocr: values.ocr ?? false,
    ocrModel: values["ocr-model"] || "mlx-community/GLM-OCR-bf16",
    ocrPromptType: values["ocr-prompt-type"] || "text",
    imageDpi: parseInt(values["image-dpi"] || "150", 10),
    help: values.help ?? false
  };
}
2090
/**
 * CLI entry point: validate arguments, dispatch to the PDF (PageIndex) or
 * Markdown (mdToTree) pipeline, and write the resulting tree structure as
 * pretty-printed JSON.
 *
 * Exits 0 after --help, exits 1 on invalid/missing input. Output defaults
 * to ./results/<name>_structure.json; the results directory is created if
 * absent.
 *
 * NOTE(review): --lmstudio/--ollama/--base-url are only applied on the PDF
 * path; the markdown path does not forward them — confirm whether that is
 * intentional.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseCliArgs();
  if (args.help) {
    printHelp();
    process.exit(0);
  }
  // Exactly one input source must be given.
  if (!args.pdf && !args.md) {
    console.error("Error: Either --pdf or --md must be specified");
    console.error("Use --help for usage information");
    process.exit(1);
  }
  if (args.pdf && args.md) {
    console.error("Error: Only one of --pdf or --md can be specified");
    process.exit(1);
  }
  const inputPath = args.pdf || args.md;
  const inputName = path3.basename(inputPath, path3.extname(inputPath));
  const outputDir = "./results";
  const outputPath = args.output || path3.join(outputDir, `${inputName}_structure.json`);
  if (!fs4.existsSync(outputDir)) {
    fs4.mkdirSync(outputDir, { recursive: true });
  }
  let result;
  if (args.pdf) {
    // --- PDF pipeline ---
    if (!args.pdf.toLowerCase().endsWith(".pdf")) {
      console.error("Error: PDF file must have .pdf extension");
      process.exit(1);
    }
    if (!fs4.existsSync(args.pdf)) {
      console.error(`Error: PDF file not found: ${args.pdf}`);
      process.exit(1);
    }
    console.log(`Processing PDF: ${args.pdf}`);
    if (args.ocr) {
      console.log(`[OCR Mode] Using OCR model: ${args.ocrModel}`);
    }
    const pageIndex = new PageIndex({
      model: args.model,
      tocCheckPageNum: args.tocCheckPages,
      maxPageNumEachNode: args.maxPagesPerNode,
      maxTokenNumEachNode: args.maxTokensPerNode,
      addNodeId: args.addNodeId,
      addNodeSummary: args.addNodeSummary,
      addDocDescription: args.addDocDescription,
      addNodeText: args.addNodeText,
      // OCR options
      extractionMode: args.ocr ? "ocr" : "text",
      ocrModel: args.ocrModel,
      ocrPromptType: args.ocrPromptType,
      imageDpi: args.imageDpi
    });
    // Backend selection: lmstudio takes precedence over ollama over base-url.
    if (args.lmstudio) {
      pageIndex.useLMStudio();
    } else if (args.ollama) {
      pageIndex.useOllama();
    } else if (args.baseUrl) {
      pageIndex.setBaseUrl(args.baseUrl);
    }
    result = await pageIndex.fromPdf(args.pdf);
  } else {
    // --- Markdown pipeline ---
    const mdPath = args.md;
    if (!mdPath.toLowerCase().endsWith(".md") && !mdPath.toLowerCase().endsWith(".markdown")) {
      console.error("Error: Markdown file must have .md or .markdown extension");
      process.exit(1);
    }
    if (!fs4.existsSync(mdPath)) {
      console.error(`Error: Markdown file not found: ${mdPath}`);
      process.exit(1);
    }
    console.log(`Processing Markdown: ${mdPath}`);
    result = await mdToTree(mdPath, {
      model: args.model,
      addNodeId: args.addNodeId,
      addNodeSummary: args.addNodeSummary,
      addDocDescription: args.addDocDescription,
      addNodeText: args.addNodeText,
      thinning: args.thinning,
      thinningThreshold: args.thinningThreshold,
      summaryTokenThreshold: args.summaryTokenThreshold
    });
  }
  console.log("Parsing done, saving to file...");
  await fsp.writeFile(outputPath, JSON.stringify(result, null, 2));
  console.log(`Tree structure saved to: ${outputPath}`);
}
2175
// Top-level launcher: surface any unhandled failure from main() on stderr
// and exit with a non-zero status.
main().catch((err) => {
  console.error("Error:", err.message);
  process.exit(1);
});