react-native-pageindex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/LICENSE +21 -0
- package/README.md +405 -0
- package/dist/config.d.ts +4 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +22 -0
- package/dist/config.js.map +1 -0
- package/dist/index.d.ts +49 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +75 -0
- package/dist/index.js.map +1 -0
- package/dist/pageIndex.d.ts +48 -0
- package/dist/pageIndex.d.ts.map +1 -0
- package/dist/pageIndex.js +962 -0
- package/dist/pageIndex.js.map +1 -0
- package/dist/pageIndexDocument.d.ts +85 -0
- package/dist/pageIndexDocument.d.ts.map +1 -0
- package/dist/pageIndexDocument.js +145 -0
- package/dist/pageIndexDocument.js.map +1 -0
- package/dist/pageIndexMd.d.ts +31 -0
- package/dist/pageIndexMd.d.ts.map +1 -0
- package/dist/pageIndexMd.js +260 -0
- package/dist/pageIndexMd.js.map +1 -0
- package/dist/parsers/csv.d.ts +17 -0
- package/dist/parsers/csv.d.ts.map +1 -0
- package/dist/parsers/csv.js +147 -0
- package/dist/parsers/csv.js.map +1 -0
- package/dist/parsers/docx.d.ts +20 -0
- package/dist/parsers/docx.d.ts.map +1 -0
- package/dist/parsers/docx.js +134 -0
- package/dist/parsers/docx.js.map +1 -0
- package/dist/parsers/xlsx.d.ts +19 -0
- package/dist/parsers/xlsx.d.ts.map +1 -0
- package/dist/parsers/xlsx.js +121 -0
- package/dist/parsers/xlsx.js.map +1 -0
- package/dist/reverseIndex.d.ts +39 -0
- package/dist/reverseIndex.d.ts.map +1 -0
- package/dist/reverseIndex.js +248 -0
- package/dist/reverseIndex.js.map +1 -0
- package/dist/types.d.ts +190 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +4 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/json.d.ts +13 -0
- package/dist/utils/json.d.ts.map +1 -0
- package/dist/utils/json.js +69 -0
- package/dist/utils/json.js.map +1 -0
- package/dist/utils/pdf.d.ts +20 -0
- package/dist/utils/pdf.d.ts.map +1 -0
- package/dist/utils/pdf.js +96 -0
- package/dist/utils/pdf.js.map +1 -0
- package/dist/utils/progress.d.ts +29 -0
- package/dist/utils/progress.d.ts.map +1 -0
- package/dist/utils/progress.js +59 -0
- package/dist/utils/progress.js.map +1 -0
- package/dist/utils/tokens.d.ts +7 -0
- package/dist/utils/tokens.d.ts.map +1 -0
- package/dist/utils/tokens.js +12 -0
- package/dist/utils/tokens.js.map +1 -0
- package/dist/utils/tree.d.ts +88 -0
- package/dist/utils/tree.d.ts.map +1 -0
- package/dist/utils/tree.js +365 -0
- package/dist/utils/tree.js.map +1 -0
- package/package.json +76 -0
|
@@ -0,0 +1,962 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* PDF pipeline — port of pageindex/page_index.py
|
|
4
|
+
*
|
|
5
|
+
* Processes PDF pages (as pre-extracted text + token counts) and builds a
|
|
6
|
+
* hierarchical tree index using LLM reasoning. No PDF parser is included
|
|
7
|
+
* here — pass `PageData[]` directly, or use the `extractPdfPages()` helper
|
|
8
|
+
* from `./utils/pdf` (requires pdfjs-dist to be installed).
|
|
9
|
+
*/
|
|
10
|
+
// ── TypeScript-emitted CommonJS interop helpers ──────────────────────────────
// Standard tsc boilerplate backing `import * as ns from '...'` semantics.
// Generated code: kept verbatim; do not hand-edit.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
|
|
43
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
44
|
+
exports.pageIndex = pageIndex;
|
|
45
|
+
const config_1 = require("./config");
|
|
46
|
+
const tokens_1 = require("./utils/tokens");
|
|
47
|
+
const json_1 = require("./utils/json");
|
|
48
|
+
const progress_1 = require("./utils/progress");
|
|
49
|
+
const tree_1 = require("./utils/tree");
|
|
50
|
+
// ─── PDF Pipeline Steps (in order) ───────────────────────────────────────────
|
|
51
|
+
// Ordered names of the pipeline phases, reported through the progress
// reporter as processing advances; 'Done' is always the final step.
const PDF_STEPS = [
    'Initializing',
    'Extracting PDF pages',
    'Scanning for table of contents',
    'Transforming TOC to structured format',
    'Mapping TOC entries to page numbers',
    'Building tree from document text',
    'Verifying TOC accuracy',
    'Fixing inaccurate TOC entries',
    'Resolving large sections',
    'Attaching page text to nodes',
    'Generating node summaries',
    'Generating document description',
    'Done',
];
|
|
66
|
+
// ─── LLM Wrappers ─────────────────────────────────────────────────────────────
|
|
67
|
+
/**
 * Invoke the LLM with retry: up to 10 attempts, pausing 1s between failures.
 * Optional chat history is forwarded as `{ chatHistory }` options.
 * Resolves with the response text; throws after all attempts fail.
 */
async function llmCall(llm, prompt, chatHistory) {
    const MAX_RETRIES = 10;
    let attempt = 0;
    while (attempt < MAX_RETRIES) {
        try {
            const options = chatHistory ? { chatHistory } : undefined;
            const { content } = await llm(prompt, options);
            return content;
        }
        catch (err) {
            console.warn(`[PageIndex] LLM call failed (attempt ${attempt + 1}/${MAX_RETRIES}):`, err);
            // No sleep after the final failed attempt.
            if (attempt < MAX_RETRIES - 1)
                await sleep(1000);
        }
        attempt++;
    }
    throw new Error('[PageIndex] Max retries reached for LLM call');
}
|
|
82
|
+
/**
 * Same retry wrapper as `llmCall`, but also surfaces the provider's
 * finish reason so callers can detect truncated ('length') responses.
 */
async function llmCallWithFinishReason(llm, prompt, chatHistory) {
    const MAX_RETRIES = 10;
    for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
        try {
            const reply = await llm(prompt, chatHistory ? { chatHistory } : undefined);
            return { content: reply.content, finishReason: reply.finishReason };
        }
        catch (err) {
            console.warn(`[PageIndex] LLM call failed (attempt ${attempt}/${MAX_RETRIES}):`, err);
            // Pause before every retry except after the last attempt.
            if (attempt < MAX_RETRIES)
                await sleep(1000);
        }
    }
    throw new Error('[PageIndex] Max retries reached for LLM call');
}
|
|
97
|
+
// Promise-based delay helper used between LLM retry attempts.
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
100
|
+
// ─── TOC Detection ────────────────────────────────────────────────────────────
|
|
101
|
+
// Ask the LLM whether one page's text contains a table of contents.
// Returns 'yes' or 'no' (falls back to 'no' if the model omits the field).
async function tocDetectorSinglePage(content, llm) {
    const prompt = `Your job is to detect if there is a table of content provided in the given text.

Given text: ${content}

return the following JSON format:
{
    "thinking": <why do you think there is a table of content in the given text>
    "toc_detected": "<yes or no>",
}

Directly return the final JSON structure. Do not output anything else.
Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents.`;
    const response = await llmCall(llm, prompt);
    const json = (0, json_1.extractJson)(response);
    return json['toc_detected'] ?? 'no';
}
|
|
118
|
+
// Ask the LLM whether the extracted TOC text includes printed page numbers.
// Returns 'yes' or 'no' (defaults to 'no' when the field is missing).
async function detectPageIndex(tocContent, llm) {
    const prompt = `You will be given a table of contents.

Your job is to detect if there are page numbers/indices given within the table of contents.

Given text: ${tocContent}

Reply format:
{
    "thinking": <why do you think there are page numbers/indices given within the table of contents>
    "page_index_given_in_toc": "<yes or no>"
}
Directly return the final JSON structure. Do not output anything else.`;
    const response = await llmCall(llm, prompt);
    const json = (0, json_1.extractJson)(response);
    return json['page_index_given_in_toc'] ?? 'no';
}
|
|
135
|
+
// Normalize TOC "dot leader" runs (e.g. "Intro......12" or "Intro . . . . . 12")
// into ": " so titles and page numbers are cleanly separated for the LLM.
function transformDotsToColon(text) {
    // 5+ consecutive dots.
    const solidLeader = /\.{5,}/g;
    // 5+ spaced ". " pairs, with an optional trailing dot.
    const spacedLeader = /(?:\. ){5,}\.?/g;
    let normalized = text.replace(solidLeader, ': ');
    normalized = normalized.replace(spacedLeader, ': ');
    return normalized;
}
|
|
138
|
+
// Concatenate the text of all detected TOC pages, normalize dot leaders,
// and report whether the combined TOC carries printed page numbers.
async function tocExtractorHelper(pageList, tocPageList, llm) {
    const rawParts = tocPageList.map((idx) => pageList[idx].text);
    const tocContent = transformDotsToColon(rawParts.join(''));
    const hasPageIndex = await detectPageIndex(tocContent, llm);
    return { toc_content: tocContent, page_index_given_in_toc: hasPageIndex };
}
|
|
146
|
+
// Scan forward from `startPageIndex` collecting a contiguous run of TOC pages.
// Stops at the first 'no' that follows a 'yes' (end of the run). The scan
// budget `opts.tocCheckPageNum` limits how far we look, except that a run
// already in progress is allowed to extend past the budget.
// Returns the 0-based page indices detected as TOC pages.
async function findTocPages(startPageIndex, pageList, opts, llm, pr) {
    const tocPageList = [];
    let lastPageIsYes = false;
    let i = startPageIndex;
    while (i < pageList.length) {
        // Past the scan budget and not inside a TOC run — give up.
        if (i >= opts.tocCheckPageNum && !lastPageIsYes)
            break;
        pr.advance('Scanning for table of contents', `Checking page ${i + 1} / ${Math.min(opts.tocCheckPageNum, pageList.length)}`);
        const result = await tocDetectorSinglePage(pageList[i].text, llm);
        if (result === 'yes') {
            tocPageList.push(i);
            lastPageIsYes = true;
        }
        else if (result === 'no' && lastPageIsYes) {
            // TOC run just ended.
            break;
        }
        i++;
    }
    return tocPageList;
}
|
|
166
|
+
// Locate the document's table of contents, if any.
// Returns { toc_content, toc_page_list, page_index_given_in_toc }:
// `toc_content` is the concatenated, dot-leader-normalized TOC text (or null
// when no TOC was found), `toc_page_list` holds 0-based page indices, and
// `page_index_given_in_toc` is 'yes'/'no' for printed page numbers.
async function checkToc(pageList, opts, llm, pr) {
    pr.report('Scanning for table of contents', `Checking up to ${opts.tocCheckPageNum} pages`);
    const tocPageList = await findTocPages(0, pageList, opts, llm, pr);
    if (tocPageList.length === 0) {
        console.log('[PageIndex] No TOC found — will extract structure from content');
        return { toc_content: null, toc_page_list: [], page_index_given_in_toc: 'no' };
    }
    console.log(`[PageIndex] TOC found on pages: ${tocPageList.map((p) => p + 1).join(', ')}`);
    const tocJson = await tocExtractorHelper(pageList, tocPageList, llm);
    if (tocJson.page_index_given_in_toc === 'yes') {
        return {
            toc_content: tocJson.toc_content,
            toc_page_list: tocPageList,
            page_index_given_in_toc: 'yes',
        };
    }
    // First TOC run lacked page numbers — keep scanning later pages within the
    // budget in case a second TOC (e.g. a detailed contents list) carries them.
    let currentStartIndex = tocPageList[tocPageList.length - 1] + 1;
    while (tocJson.page_index_given_in_toc === 'no' &&
        currentStartIndex < pageList.length &&
        currentStartIndex < opts.tocCheckPageNum) {
        const additionalTocPages = await findTocPages(currentStartIndex, pageList, opts, llm, pr);
        if (additionalTocPages.length === 0)
            break;
        const additionalTocJson = await tocExtractorHelper(pageList, additionalTocPages, llm);
        if (additionalTocJson.page_index_given_in_toc === 'yes') {
            return {
                toc_content: additionalTocJson.toc_content,
                toc_page_list: additionalTocPages,
                page_index_given_in_toc: 'yes',
            };
        }
        currentStartIndex = additionalTocPages[additionalTocPages.length - 1] + 1;
    }
    // Fall back to the first TOC found, without page numbers.
    return {
        toc_content: tocJson.toc_content,
        toc_page_list: tocPageList,
        page_index_given_in_toc: 'no',
    };
}
|
|
205
|
+
// ─── TOC Transformation ───────────────────────────────────────────────────────
|
|
206
|
+
// Ask the LLM whether the JSON-transformed TOC covers the full raw TOC text.
// Returns 'yes' or 'no' (defaults to 'no' when the field is missing).
async function checkIfTocTransformationIsComplete(content, toc, llm) {
    const prompt = `You are given a raw table of contents and a table of contents.
Your job is to check if the table of contents is complete.

Reply format:
{
    "thinking": <why do you think the cleaned table of contents is complete or not>
    "completed": "yes" or "no"
}
Directly return the final JSON structure. Do not output anything else.

Raw Table of contents:
${content}

Cleaned Table of contents:
${toc}`;
    const response = await llmCall(llm, prompt);
    const json = (0, json_1.extractJson)(response);
    return json['completed'] ?? 'no';
}
|
|
226
|
+
// Transform raw TOC text into a structured list of {structure, title, page}
// entries. If the first LLM response is incomplete or truncated
// (finishReason === 'length'), repeatedly prompts the model to continue the
// JSON from where it stopped, stitching the fragments together (max 5 tries).
async function tocTransformer(tocContent, llm, pr) {
    pr.report('Transforming TOC to structured format', 'Converting TOC to JSON hierarchy');
    const initPrompt = `You are given a table of contents, You job is to transform the whole table of content into a JSON format included table_of_contents.

structure is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.

The response should be in the following JSON format:
{
    table_of_contents: [
        {
            "structure": <structure index, "x.x.x" or None> (string),
            "title": <title of the section>,
            "page": <page number or None>,
        },
        ...
    ],
}
You should transform the full table of contents in one go.
Directly return the final JSON structure, do not output anything else.

Given table of contents:
${tocContent}`;
    let { content: lastComplete, finishReason } = await llmCallWithFinishReason(llm, initPrompt);
    let ifComplete = await checkIfTocTransformationIsComplete(tocContent, lastComplete, llm);
    if (ifComplete === 'yes' && finishReason !== 'length') {
        // Happy path: a single complete response.
        const parsed = (0, json_1.extractJson)(lastComplete);
        return (0, tree_1.convertPageToInt)((parsed['table_of_contents'] ?? []));
    }
    // Strip any markdown fence before attempting continuation.
    lastComplete = (0, json_1.getJsonContent)(lastComplete);
    let attempts = 0;
    const MAX_ATTEMPTS = 5;
    while (!(ifComplete === 'yes' && finishReason !== 'length') && attempts < MAX_ATTEMPTS) {
        pr.advance('Transforming TOC to structured format', `Completing TOC (attempt ${attempts + 2})`);
        // Cut back to just past the last complete object so the model can
        // continue cleanly. NOTE(review): position + 2 also keeps the single
        // character after '}' (typically the comma) — assumed intentional,
        // mirroring the Python original; confirm against page_index.py.
        const position = lastComplete.lastIndexOf('}');
        if (position !== -1)
            lastComplete = lastComplete.slice(0, position + 2);
        const continuePrompt = `Your task is to continue the table of contents json structure, directly output the remaining part of the json structure.

The raw table of contents json structure is:
${tocContent}

The incomplete transformed table of contents json structure is:
${lastComplete}

Please continue the json structure, directly output the remaining part of the json structure.`;
        let newComplete;
        ({ content: newComplete, finishReason } = await llmCallWithFinishReason(llm, continuePrompt));
        if (newComplete.startsWith('```json'))
            newComplete = (0, json_1.getJsonContent)(newComplete);
        lastComplete = lastComplete + newComplete;
        ifComplete = await checkIfTocTransformationIsComplete(tocContent, lastComplete, llm);
        attempts++;
    }
    // Strict parse of the stitched result; throws if still malformed.
    const parsed = JSON.parse(lastComplete);
    return (0, tree_1.convertPageToInt)((parsed['table_of_contents'] ?? []));
}
|
|
282
|
+
// ─── TOC Index Extraction ─────────────────────────────────────────────────────
|
|
283
|
+
// Given a TOC (without physical indices) and tagged document pages, ask the
// LLM to annotate each TOC entry found in the pages with its
// "<physical_index_X>" location. Entries not present in the pages are left
// unannotated. Returns the parsed JSON list.
async function tocIndexExtractor(toc, content, llm) {
    const prompt = `You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format.

The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.

The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.

The response should be in the following JSON format:
[
    {
        "structure": <structure index, "x.x.x" or None> (string),
        "title": <title of the section>,
        "physical_index": "<physical_index_X>" (keep the format)
    },
    ...
]

Only add the physical_index to the sections that are in the provided pages.
If the section is not in the provided pages, do not add the physical_index to it.
Directly return the final JSON structure. Do not output anything else.

Table of contents:
${JSON.stringify(toc)}

Document pages:
${content}`;
    const response = await llmCall(llm, prompt);
    return (0, json_1.extractJson)(response);
}
|
|
312
|
+
// ─── Page Grouping ────────────────────────────────────────────────────────────
|
|
313
|
+
/**
 * Merge per-page strings into token-bounded groups for LLM consumption,
 * overlapping `overlapPage` trailing pages between consecutive groups so
 * section boundaries are not cut in half. Returns the joined group strings.
 */
function pageListToGroupText(pageContents, tokenLengths, maxTokens = 20000, overlapPage = 1) {
    const totalTokens = tokenLengths.reduce((sum, len) => sum + len, 0);
    if (totalTokens <= maxTokens) {
        // Everything fits into a single group.
        return [pageContents.join('')];
    }
    // Aim midway between a perfectly even split and the hard cap.
    const expectedParts = Math.ceil(totalTokens / maxTokens);
    const targetTokens = Math.ceil((totalTokens / expectedParts + maxTokens) / 2);
    const groups = [];
    let pending = [];
    let pendingTokens = 0;
    pageContents.forEach((pageText, idx) => {
        if (pendingTokens + tokenLengths[idx] > targetTokens) {
            groups.push(pending.join(''));
            // Re-seed the next group with the trailing overlap pages.
            const from = Math.max(idx - overlapPage, 0);
            pending = pageContents.slice(from, idx);
            pendingTokens = tokenLengths.slice(from, idx).reduce((sum, len) => sum + len, 0);
        }
        pending.push(pageText);
        pendingTokens += tokenLengths[idx];
    });
    if (pending.length > 0)
        groups.push(pending.join(''));
    console.log(`[PageIndex] Split into ${groups.length} text group(s)`);
    return groups;
}
|
|
337
|
+
// ─── TOC Generation (no existing TOC) ────────────────────────────────────────
|
|
338
|
+
// First pass of content-based structure extraction (no existing TOC):
// ask the LLM to produce {structure, title, physical_index} entries from the
// first tagged text group. Throws if the model's output was truncated.
async function generateTocInit(part, llm, pr, groupIndex, totalGroups) {
    pr.report('Building tree from document text', `Extracting structure from group ${groupIndex + 1} / ${totalGroups}`);
    const prompt = `You are an expert in extracting hierarchical tree structure, your task is to generate the tree structure of the document.

The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.

For the title, you need to extract the original title from the text, only fix the space inconsistency.

The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X.

For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format.

The response should be in the following format.
[
    {
        "structure": <structure index, "x.x.x"> (string),
        "title": <title of the section, keep the original title>,
        "physical_index": "<physical_index_X> (keep the format)"
    },
],

Directly return the final JSON structure. Do not output anything else.

Given text:
${part}`;
    const { content: response, finishReason } = await llmCallWithFinishReason(llm, prompt);
    if (finishReason !== 'length')
        return (0, json_1.extractJson)(response);
    throw new Error('[PageIndex] TOC generation truncated (output too long)');
}
|
|
368
|
+
// Continuation pass of content-based structure extraction: given the tree
// built from earlier groups, ask the LLM for the ADDITIONAL entries found in
// the next text group. Throws if the model's output was truncated.
async function generateTocContinue(tocContent, part, llm, pr, groupIndex, totalGroups) {
    pr.advance('Building tree from document text', `Continuing structure extraction — group ${groupIndex + 1} / ${totalGroups}`);
    const prompt = `You are an expert in extracting hierarchical tree structure.
You are given a tree structure of the previous part and the text of the current part.
Your task is to continue the tree structure from the previous part to include the current part.

The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.

For the title, you need to extract the original title from the text, only fix the space inconsistency.

The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X.

For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format.

The response should be in the following format.
[
    {
        "structure": <structure index, "x.x.x"> (string),
        "title": <title of the section, keep the original title>,
        "physical_index": "<physical_index_X> (keep the format)"
    },
    ...
]

Directly return the additional part of the final JSON structure. Do not output anything else.

Given text:
${part}

Previous tree structure:
${JSON.stringify(tocContent, null, 2)}`;
    const { content: response, finishReason } = await llmCallWithFinishReason(llm, prompt);
    if (finishReason !== 'length')
        return (0, json_1.extractJson)(response);
    throw new Error('[PageIndex] TOC continuation truncated (output too long)');
}
|
|
404
|
+
// ─── TOC with Page Numbers Processing ────────────────────────────────────────
|
|
405
|
+
// Recursively strip printed `page` fields from a TOC structure, in place.
// Descends into arrays and into each node's `nodes` children. Returns `data`
// for call-chaining convenience.
function removePageNumber(data) {
    if (Array.isArray(data)) {
        data.forEach((entry) => removePageNumber(entry));
        return data;
    }
    if (data !== null && typeof data === 'object') {
        delete data.page;
        if (data.nodes)
            removePageNumber(data.nodes);
    }
    return data;
}
|
|
417
|
+
/**
 * Pair TOC entries present in both representations by exact title match.
 * Only physical-index entries at or beyond `startPageIndex` qualify; each
 * qualifying entry is paired with EVERY same-titled printed-page entry
 * (duplicate titles produce multiple pairs, as before).
 *
 * Returns [{ title, page, physical_index }] with `page` null when absent.
 */
function extractMatchingPagePairs(tocPage, tocPhysicalIndex, startPageIndex) {
    // Index printed-page entries by title once instead of rescanning tocPage
    // for every physical-index entry (was accidentally O(n*m)). Bucket order
    // preserves tocPage order, so pair ordering is unchanged.
    const pagesByTitle = new Map();
    for (const pageItem of tocPage) {
        const bucket = pagesByTitle.get(pageItem.title);
        if (bucket)
            bucket.push(pageItem);
        else
            pagesByTitle.set(pageItem.title, [pageItem]);
    }
    const pairs = [];
    for (const phyItem of tocPhysicalIndex) {
        const physIdx = phyItem.physical_index;
        // Skip entries with no physical index or ones before the search window.
        if (physIdx == null || physIdx < startPageIndex)
            continue;
        for (const pageItem of pagesByTitle.get(phyItem.title) ?? []) {
            pairs.push({ title: phyItem.title, page: pageItem.page ?? null, physical_index: physIdx });
        }
    }
    return pairs;
}
|
|
431
|
+
// Estimate the constant offset between printed page numbers and physical
// page positions: the most common (physical_index - page) difference across
// all pairs that have both values. Returns null when no such pair exists.
function calculatePageOffset(pairs) {
    const differences = pairs
        .filter((pair) => pair.physical_index != null && pair.page != null)
        .map((pair) => pair.physical_index - pair.page);
    if (differences.length === 0)
        return null;
    // Tally each difference, then take the most frequent one.
    const counts = {};
    for (const diff of differences)
        counts[diff] = (counts[diff] ?? 0) + 1;
    const [mostCommon] = Object.entries(counts).sort((a, b) => b[1] - a[1])[0];
    return Number(mostCommon);
}
|
|
445
|
+
// Convert printed `page` numbers into physical indices, in place, by adding
// `offset`; entries without a numeric page are left untouched. Returns `data`.
function addPageOffsetToTocJson(data, offset) {
    data.forEach((entry) => {
        // typeof check alone excludes null/undefined pages.
        if (entry.page != null && typeof entry.page === 'number') {
            entry.physical_index = entry.page + offset;
            delete entry.page;
        }
    });
    return data;
}
|
|
454
|
+
// Ask the LLM which TOC entries BEGIN inside the given tagged document part,
// filling in their "<physical_index_X>" start positions. The passed-in
// structure may already be partially filled from earlier parts; the prompt
// instructs the model not to change those. The transient "start" field is
// stripped before returning.
async function addPageNumberToToc(part, structure, llm) {
    const prompt = `You are given an JSON structure of a document and a partial part of the document. Your task is to check if the title that is described in the structure is started in the partial given document.

The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.

If the full target section starts in the partial given document, insert the given JSON structure with the "start": "yes", and "start_index": "<physical_index_X>".

If the full target section does not start in the partial given document, insert "start": "no", "start_index": None.

The response should be in the following format.
[
    {
        "structure": <structure index, "x.x.x" or None> (string),
        "title": <title of the section>,
        "start": "<yes or no>",
        "physical_index": "<physical_index_X> (keep the format)" or None
    },
    ...
]
The given structure contains the result of the previous part, you need to fill the result of the current part, do not change the previous result.
Directly return the final JSON structure. Do not output anything else.

Current Partial Document:
${part}

Given Structure:
${JSON.stringify(structure, null, 2)}`;
    const response = await llmCall(llm, prompt);
    const jsonResult = (0, json_1.extractJson)(response);
    // Drop the helper flag; only physical_index is carried forward.
    for (const item of jsonResult) {
        delete item['start'];
    }
    return jsonResult;
}
|
|
488
|
+
// Fill in missing physical_index values on TOC entries, one at a time.
// For each unresolved entry, the search window is bracketed by the nearest
// resolved neighbors (or document start/end), the pages in that window are
// tagged and sent to the LLM, and any "<physical_index_X>" answer is parsed
// back into an integer. Mutates `tocItems` in place and returns it.
async function processNonePageNumbers(tocItems, pageList, startIndex, llm) {
    for (let i = 0; i < tocItems.length; i++) {
        const item = tocItems[i];
        if (item.physical_index == null) {
            // Nearest resolved entry before this one (0 = document start).
            let prevPhysicalIndex = 0;
            for (let j = i - 1; j >= 0; j--) {
                if (tocItems[j].physical_index != null) {
                    prevPhysicalIndex = tocItems[j].physical_index;
                    break;
                }
            }
            // Nearest resolved entry after this one (default: last physical page).
            let nextPhysicalIndex = pageList.length + startIndex - 1;
            for (let j = i + 1; j < tocItems.length; j++) {
                if (tocItems[j].physical_index != null) {
                    nextPhysicalIndex = tocItems[j].physical_index;
                    break;
                }
            }
            // Tag every page in the [prev, next] window that exists in pageList.
            const pageContents = [];
            for (let p = prevPhysicalIndex; p <= nextPhysicalIndex; p++) {
                const li = p - startIndex;
                if (li >= 0 && li < pageList.length) {
                    pageContents.push(`<physical_index_${p}>\n${pageList[li].text}\n<physical_index_${p}>\n\n`);
                }
            }
            // Query with a copy so the original entry is not mutated by the helper.
            const itemCopy = (0, tree_1.deepClone)(item);
            delete itemCopy['page'];
            const result = await addPageNumberToToc(pageContents.join(''), [itemCopy], llm);
            const phyStr = result[0]?.physical_index;
            if (typeof phyStr === 'string' && phyStr.startsWith('<physical_index')) {
                const match = phyStr.match(/physical_index_(\d+)/);
                if (match) {
                    tocItems[i].physical_index = parseInt(match[1], 10);
                    delete tocItems[i].page;
                }
            }
        }
    }
    return tocItems;
}
|
|
528
|
+
// ─── Processing Modes ─────────────────────────────────────────────────────────
|
|
529
|
+
// No TOC exists: build the structure directly from the document text.
// Pages are wrapped in <physical_index_X> markers, split into token-bounded
// groups, and the LLM extracts/extends the tree one group at a time.
async function processNoToc(pageList, startIndex, opts, llm, pr) {
    const taggedPages = [];
    const tagTokenCounts = [];
    pageList.forEach((page, idx) => {
        const physical = idx + startIndex;
        const tagged = `<physical_index_${physical}>\n${page.text}\n<physical_index_${physical}>\n\n`;
        taggedPages.push(tagged);
        tagTokenCounts.push(opts.counter(tagged));
    });
    const groupTexts = pageListToGroupText(taggedPages, tagTokenCounts, opts.maxTokenNumEachNode);
    // First group seeds the structure; later groups extend it incrementally.
    let tocWithPageNumber = await generateTocInit(groupTexts[0], llm, pr, 0, groupTexts.length);
    for (let g = 1; g < groupTexts.length; g++) {
        const additional = await generateTocContinue(tocWithPageNumber, groupTexts[g], llm, pr, g, groupTexts.length);
        tocWithPageNumber = [...tocWithPageNumber, ...additional];
    }
    (0, tree_1.convertPhysicalIndexToInt)(tocWithPageNumber);
    return tocWithPageNumber;
}
|
|
547
|
+
// A TOC exists but carries no printed page numbers: transform it to JSON,
// then locate each entry's physical page by walking the tagged document
// groups through the LLM.
async function processTocNoPageNumbers(tocContent, pageList, startIndex, opts, llm, pr) {
    const transformedToc = await tocTransformer(tocContent, llm, pr);
    const taggedPages = [];
    const tagTokenCounts = [];
    pageList.forEach((page, idx) => {
        const physical = idx + startIndex;
        const tagged = `<physical_index_${physical}>\n${page.text}\n<physical_index_${physical}>\n\n`;
        taggedPages.push(tagged);
        tagTokenCounts.push(opts.counter(tagged));
    });
    const groupTexts = pageListToGroupText(taggedPages, tagTokenCounts, opts.maxTokenNumEachNode);
    pr.report('Mapping TOC entries to page numbers', `Processing ${groupTexts.length} group(s)`);
    // Thread the (partially filled) TOC through each group so results from
    // earlier pages are preserved while later pages fill remaining entries.
    let tocWithPageNumber = (0, tree_1.deepClone)(transformedToc);
    for (let g = 0; g < groupTexts.length; g++) {
        pr.advance('Mapping TOC entries to page numbers', `Group ${g + 1} / ${groupTexts.length}`);
        tocWithPageNumber = await addPageNumberToToc(groupTexts[g], tocWithPageNumber, llm);
    }
    (0, tree_1.convertPhysicalIndexToInt)(tocWithPageNumber);
    return tocWithPageNumber;
}
|
|
567
|
+
// A TOC with printed page numbers exists: calibrate the constant offset
// between printed numbers and physical pages, then apply it to the whole TOC.
// Calibration samples the pages right after the TOC, asks the LLM for the
// physical position of the entries found there, and takes the modal
// (physical - printed) difference. Falls back to the raw LLM mapping when no
// offset can be computed; remaining gaps are filled by processNonePageNumbers.
async function processTocWithPageNumbers(tocContent, tocPageList, pageList, startIndex, opts, llm, pr) {
    const tocWithPageNumber = await tocTransformer(tocContent, llm, pr);
    // Physical-index matching works on a copy stripped of printed pages.
    const tocNoPageNumber = (0, tree_1.deepClone)(tocWithPageNumber);
    removePageNumber(tocNoPageNumber);
    pr.report('Mapping TOC entries to page numbers', 'Matching TOC entries to physical pages');
    const startPageIndex = tocPageList[tocPageList.length - 1] + 1;
    let mainContent = '';
    const end = Math.min(startPageIndex + opts.tocCheckPageNum, pageList.length);
    for (let p = startPageIndex; p < end; p++) {
        // NOTE(review): tags here use p + 1 while other paths tag with
        // (listIndex + startIndex) — assumed equivalent when startIndex is
        // 1-based; confirm against the Python original's indexing.
        mainContent += `<physical_index_${p + 1}>\n${pageList[p].text}\n<physical_index_${p + 1}>\n\n`;
    }
    let tocWithPhysicalIndex = await tocIndexExtractor(tocNoPageNumber, mainContent, llm);
    (0, tree_1.convertPhysicalIndexToInt)(tocWithPhysicalIndex);
    // Pair same-titled entries and take the modal printed→physical offset.
    const matchingPairs = extractMatchingPagePairs(tocWithPageNumber, tocWithPhysicalIndex, startPageIndex);
    const offset = calculatePageOffset(matchingPairs);
    let result;
    if (offset != null) {
        result = addPageOffsetToTocJson((0, tree_1.deepClone)(tocWithPageNumber), offset);
        result = await processNonePageNumbers(result, pageList, startIndex, llm);
    }
    else {
        // No reliable offset — use the LLM's direct physical mapping as-is.
        result = tocWithPhysicalIndex;
    }
    return result;
}
|
|
592
|
+
// ─── TOC Verification & Fixing ────────────────────────────────────────────────
|
|
593
|
+
/**
 * Asks the LLM whether a TOC entry's title actually appears on the page its
 * physical_index points to. Returns a verdict record; entries without a
 * physical index are reported as 'no' without an LLM call.
 */
async function checkTitleAppearance(item, pageList, startIndex, llm) {
    const { title } = item;
    // Entries whose physical index was never resolved cannot be verified.
    if (item.physical_index == null) {
        return { list_index: item.list_index ?? 0, answer: 'no', title, page_number: null };
    }
    const pageNumber = item.physical_index;
    // Translate the physical index into an offset into pageList.
    const offsetIntoList = pageNumber - startIndex;
    let pageText = '';
    if (offsetIntoList >= 0 && offsetIntoList < pageList.length) {
        pageText = pageList[offsetIntoList].text;
    }
    const prompt = `Your job is to check if the given section appears or starts in the given page_text.

Note: do fuzzy matching, ignore any space inconsistency in the page_text.

The given section title is ${title}.
The given page_text is ${pageText}.

Reply format:
{
"thinking": <why do you think the section appears or starts in the page_text>
"answer": "yes or no" (yes if the section appears or starts in the page_text, no otherwise)
}
Directly return the final JSON structure. Do not output anything else.`;
    const response = await llmCall(llm, prompt);
    const parsed = (0, json_1.extractJson)(response);
    return { list_index: item.list_index ?? 0, answer: parsed['answer'] ?? 'no', title, page_number: pageNumber };
}
|
|
618
|
+
/**
 * Asks the LLM whether a section title is the very first content of the
 * given page text. Returns the raw 'yes'/'no' verdict string ('no' when the
 * model's reply lacks the expected field).
 */
async function checkTitleAppearanceInStart(title, pageText, llm) {
    const prompt = `You will be given the current section title and the current page_text.
Your job is to check if the current section starts in the beginning of the given page_text.
If there are other contents before the current section title, then the current section does not start in the beginning of the given page_text.
If the current section title is the first content in the given page_text, then the current section starts in the beginning of the given page_text.

Note: do fuzzy matching, ignore any space inconsistency in the page_text.

The given section title is ${title}.
The given page_text is ${pageText}.

reply format:
{
"thinking": <why do you think the section appears or starts in the page_text>
"start_begin": "yes or no" (yes if the section starts in the beginning of the page_text, no otherwise)
}
Directly return the final JSON structure. Do not output anything else.`;
    const raw = await llmCall(llm, prompt);
    const parsed = (0, json_1.extractJson)(raw);
    const verdict = parsed['start_begin'];
    return verdict ?? 'no';
}
|
|
639
|
+
/**
 * Annotates every item in `structure` with `appear_start` ('yes'/'no'):
 * whether its title begins the page it points to. Items lacking a
 * physical_index are marked 'no'; the rest are checked concurrently.
 * Mutates `structure` in place and returns it.
 */
async function checkTitleAppearanceInStartConcurrent(structure, pageList, llm) {
    // Unresolvable entries are marked up front; only resolved ones are checked.
    structure
        .filter((item) => item.physical_index == null)
        .forEach((item) => { item.appear_start = 'no'; });
    const resolved = structure.filter((item) => item.physical_index != null);
    const checkOne = async (item) => {
        // physical_index is treated as 1-based here — TODO confirm callers
        // always pass the full pageList with startIndex 1.
        const pageIdx = item.physical_index - 1;
        const inRange = pageIdx >= 0 && pageIdx < pageList.length;
        const pageText = inRange ? pageList[pageIdx].text : '';
        try {
            return await checkTitleAppearanceInStart(item.title, pageText, llm);
        }
        catch {
            // Any LLM/parse failure is treated as "does not start here".
            return 'no';
        }
    };
    const verdicts = await Promise.all(resolved.map(checkOne));
    verdicts.forEach((verdict, i) => { resolved[i].appear_start = verdict; });
    return structure;
}
|
|
659
|
+
/**
 * Verifies a candidate TOC by checking, concurrently, that each entry's
 * title appears on the page it claims. Returns the fraction of correct
 * entries plus the list of entries that failed the check. A TOC whose last
 * resolved index doesn't reach the document midpoint is rejected outright
 * (accuracy 0, nothing to fix).
 */
async function verifyToc(pageList, listResult, startIndex, llm, pr) {
    pr.report('Verifying TOC accuracy', `Checking ${listResult.length} TOC entries`);
    // Walk backwards to find the last entry with a resolved physical index.
    let lastPhysicalIndex;
    for (let i = listResult.length - 1; i >= 0; i--) {
        if (listResult[i].physical_index != null) {
            lastPhysicalIndex = listResult[i].physical_index;
            break;
        }
    }
    // A TOC that never reaches the document's midpoint is considered unusable.
    if (lastPhysicalIndex == null || lastPhysicalIndex < pageList.length / 2) {
        return { accuracy: 0, incorrectResults: [] };
    }
    // Remember each entry's position before filtering so fixes can be
    // written back to the right slot later.
    const indexedSample = listResult
        .map((item, idx) => ({ ...item, list_index: idx }))
        .filter((item) => item.physical_index != null);
    let checked = 0;
    const verifyOne = async (item) => {
        const outcome = await checkTitleAppearance(item, pageList, startIndex, llm);
        checked++;
        pr.advance('Verifying TOC accuracy', `Verified ${checked} / ${indexedSample.length} entries`);
        return outcome;
    };
    const results = await Promise.all(indexedSample.map(verifyOne));
    const incorrectResults = results
        .filter((r) => r.answer !== 'yes')
        .map((r) => ({ list_index: r.list_index, title: r.title, physical_index: r.page_number }));
    const correctCount = results.length - incorrectResults.length;
    const accuracy = results.length > 0 ? correctCount / results.length : 0;
    console.log(`[PageIndex] Verification accuracy: ${(accuracy * 100).toFixed(1)}%`);
    return { accuracy, incorrectResults };
}
|
|
687
|
+
/**
 * Asks the LLM to locate the start page of one section within a tagged
 * slice of the document. Returns the parsed physical index as an integer,
 * or null when the reply cannot be parsed into a physical_index_X tag.
 */
async function singleTocItemIndexFixer(sectionTitle, content, llm) {
    const prompt = `You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.

The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.

Reply in a JSON format:
{
"thinking": <explain which page, started and closed by <physical_index_X>, contains the start of this section>,
"physical_index": "<physical_index_X>" (keep the format)
}
Directly return the final JSON structure. Do not output anything else.

Section Title:
${sectionTitle}

Document pages:
${content}`;
    const response = await llmCall(llm, prompt);
    const parsed = (0, json_1.extractJson)(response);
    const rawIndex = parsed['physical_index'];
    // Anything other than a string tag is an unusable reply.
    if (typeof rawIndex !== 'string') {
        return null;
    }
    const match = rawIndex.match(/physical_index_(\d+)/);
    return match ? parseInt(match[1], 10) : null;
}
|
|
714
|
+
/**
 * Attempts to repair TOC entries that failed verification. For each bad
 * entry it finds the nearest *trusted* neighbours (entries not flagged as
 * incorrect, with a resolved index) above and below, feeds the LLM only the
 * pages between them, and re-verifies the proposed index before accepting
 * it. Mutates `tocWithPageNumber` in place for accepted fixes.
 *
 * Returns { toc, stillInvalid } where stillInvalid lists entries whose new
 * index still failed verification (fed back in by the retry wrapper).
 */
async function fixIncorrectToc(tocWithPageNumber, pageList, incorrectResults, startIndex, llm, pr) {
    pr.report('Fixing inaccurate TOC entries', `Correcting ${incorrectResults.length} entry / entries`);
    // Flagged positions — these must not be used as trusted anchors below.
    const incorrectIndices = new Set(incorrectResults.map((r) => r.list_index));
    // Physical index of the last document page.
    const endIndex = pageList.length + startIndex - 1;
    const processItem = async (incorrectItem) => {
        const listIndex = incorrectItem.list_index;
        // Out-of-range slot: nothing we can fix; report as invalid.
        if (listIndex < 0 || listIndex >= tocWithPageNumber.length) {
            return { list_index: listIndex, title: incorrectItem.title, physical_index: null, is_valid: false };
        }
        // Nearest trusted entry before this one (default: just before page 1
        // of the range, so the search window starts at the first page).
        let prevCorrect = startIndex - 1;
        for (let j = listIndex - 1; j >= 0; j--) {
            if (!incorrectIndices.has(j) && tocWithPageNumber[j].physical_index != null) {
                prevCorrect = tocWithPageNumber[j].physical_index;
                break;
            }
        }
        // Nearest trusted entry after this one (default: last page).
        let nextCorrect = endIndex;
        for (let j = listIndex + 1; j < tocWithPageNumber.length; j++) {
            if (!incorrectIndices.has(j) && tocWithPageNumber[j].physical_index != null) {
                nextCorrect = tocWithPageNumber[j].physical_index;
                break;
            }
        }
        // Tag only the pages inside the [prevCorrect, nextCorrect] window.
        const pageContents = [];
        for (let p = prevCorrect; p <= nextCorrect; p++) {
            const li = p - startIndex;
            if (li >= 0 && li < pageList.length)
                pageContents.push(`<physical_index_${p}>\n${pageList[li].text}\n<physical_index_${p}>\n\n`);
        }
        // Ask the LLM for a corrected index, then independently verify it.
        const physicalIndex = await singleTocItemIndexFixer(incorrectItem.title, pageContents.join(''), llm);
        const checkItem = { ...incorrectItem, physical_index: physicalIndex, list_index: listIndex };
        const checkResult = await checkTitleAppearance(checkItem, pageList, startIndex, llm);
        return { list_index: listIndex, title: incorrectItem.title, physical_index: physicalIndex, is_valid: checkResult.answer === 'yes' };
    };
    // All repairs run concurrently; each only reads trusted (unflagged)
    // entries, so the in-place writes below cannot race with the reads.
    const results = await Promise.all(incorrectResults.map(processItem));
    const stillInvalid = [];
    for (const result of results) {
        if (result.is_valid && result.list_index >= 0 && result.list_index < tocWithPageNumber.length) {
            // Verified fix: write it back into the TOC.
            tocWithPageNumber[result.list_index].physical_index = result.physical_index;
        }
        else {
            // Still unverified: hand back to the caller for another attempt.
            stillInvalid.push({ list_index: result.list_index, title: result.title, physical_index: result.physical_index });
        }
    }
    return { toc: tocWithPageNumber, stillInvalid };
}
|
|
760
|
+
/**
 * Repeatedly runs fixIncorrectToc until every flagged entry verifies or the
 * attempt budget is exhausted. Returns the (possibly partially repaired)
 * TOC; entries that never verified keep whatever index they ended up with.
 */
async function fixIncorrectTocWithRetries(tocWithPageNumber, pageList, incorrectResults, startIndex, llm, pr, maxAttempts = 3) {
    let toc = tocWithPageNumber;
    let pending = incorrectResults;
    for (let attempt = 1; pending.length > 0 && attempt <= maxAttempts; attempt++) {
        const outcome = await fixIncorrectToc(toc, pageList, pending, startIndex, llm, pr);
        toc = outcome.toc;
        pending = outcome.stillInvalid;
        if (pending.length > 0) {
            pr.advance('Fixing inaccurate TOC entries', `${pending.length} remaining — attempt ${attempt + 1}`);
        }
    }
    return toc;
}
|
|
775
|
+
/**
 * Top-level TOC-building state machine. Runs the strategy selected by
 * `mode`, verifies the result, repairs it when accuracy is high enough, and
 * otherwise *degrades* to the next-simpler strategy by recursing:
 *   process_toc_with_page_numbers → process_toc_no_page_numbers → process_no_toc.
 * Throws once the simplest strategy also fails verification.
 */
async function metaProcessor(pageList, mode, opts, llm, pr, startIndex, tocContent, tocPageList) {
    console.log(`[PageIndex] Mode: ${mode}, startIndex: ${startIndex}`);
    let tocWithPageNumber;
    if (mode === 'process_toc_with_page_numbers') {
        tocWithPageNumber = await processTocWithPageNumbers(tocContent, tocPageList, pageList, startIndex, opts, llm, pr);
    }
    else if (mode === 'process_toc_no_page_numbers') {
        tocWithPageNumber = await processTocNoPageNumbers(tocContent, pageList, startIndex, opts, llm, pr);
    }
    else {
        // Fallback / default mode: build a TOC from the page text alone.
        tocWithPageNumber = await processNoToc(pageList, startIndex, opts, llm, pr);
    }
    // Drop unresolved entries, then clamp indices to the valid page range.
    tocWithPageNumber = tocWithPageNumber.filter((item) => item.physical_index != null);
    tocWithPageNumber = (0, tree_1.validateAndTruncatePhysicalIndices)(tocWithPageNumber, pageList.length, startIndex);
    const { accuracy, incorrectResults } = await verifyToc(pageList, tocWithPageNumber, startIndex, llm, pr);
    // Perfect score: accept as-is.
    if (accuracy === 1.0 && incorrectResults.length === 0)
        return tocWithPageNumber;
    // Mostly right (>60%): worth repairing the flagged entries.
    if (accuracy > 0.6 && incorrectResults.length > 0) {
        return fixIncorrectTocWithRetries(tocWithPageNumber, pageList, incorrectResults, startIndex, llm, pr);
    }
    // Too inaccurate: fall back to the next-simpler strategy.
    if (mode === 'process_toc_with_page_numbers')
        return metaProcessor(pageList, 'process_toc_no_page_numbers', opts, llm, pr, startIndex, tocContent, tocPageList);
    if (mode === 'process_toc_no_page_numbers')
        return metaProcessor(pageList, 'process_no_toc', opts, llm, pr, startIndex);
    // Already in the simplest mode — nothing left to try.
    throw new Error('[PageIndex] Processing failed: could not build a valid TOC');
}
|
|
801
|
+
// ─── Large Node Processing ────────────────────────────────────────────────────
|
|
802
|
+
/**
 * Recursively splits tree nodes that span too many pages *and* too many
 * tokens by running the no-TOC indexer on just that node's page range,
 * attaching the result as child nodes. Mutates `node` in place (sets
 * `nodes` and may shrink `end_index`) and returns it.
 */
async function processLargeNodeRecursively(node, pageList, opts, llm, pr) {
    const start = node.start_index ?? 1;
    const end = node.end_index ?? pageList.length;
    // Pages covered by this node (start/end are 1-based, slice is 0-based).
    const nodePageList = pageList.slice(start - 1, end);
    const tokenNum = nodePageList.reduce((sum, p) => sum + p.tokenCount, 0);
    // Only sub-index when BOTH the page-count and token-count budgets are
    // exceeded; small-but-dense or long-but-sparse nodes are left alone.
    if (end - start > opts.maxPageNumEachNode && tokenNum >= opts.maxTokenNumEachNode) {
        pr.advance('Resolving large sections', `Sub-indexing "${node.title}" (pages ${start}–${end})`);
        // Build a fresh sub-TOC for just this node's pages; `start` keeps the
        // sub-index's physical indices aligned with the whole document.
        let nodeTocTree = await metaProcessor(nodePageList, 'process_no_toc', opts, llm, pr, start);
        nodeTocTree = await checkTitleAppearanceInStartConcurrent(nodeTocTree, pageList, llm);
        const validItems = nodeTocTree.filter((item) => item.physical_index != null);
        if (validItems.length > 0 && node.title.trim() === validItems[0].title.trim()) {
            // The sub-index re-discovered this node's own heading as its first
            // entry — skip it so the node doesn't become its own child.
            node.nodes = (0, tree_1.postProcessing)(validItems.slice(1), end);
            node.end_index = validItems.length > 1 ? validItems[1].start_index ?? end : end;
        }
        else {
            node.nodes = (0, tree_1.postProcessing)(validItems, end);
            node.end_index = validItems.length > 0 ? validItems[0].start_index ?? end : end;
        }
    }
    // Recurse into children (pre-existing or just created above).
    if (node.nodes && node.nodes.length > 0) {
        await Promise.all(node.nodes.map((child) => processLargeNodeRecursively(child, pageList, opts, llm, pr)));
    }
    return node;
}
|
|
826
|
+
// ─── Summary Generation ───────────────────────────────────────────────────────
|
|
827
|
+
/**
 * Asks the LLM for a free-text description of one tree node's page text.
 * Expects `node.text` to be populated (see addNodeText in the pipeline).
 * Returns the LLM's reply verbatim.
 */
async function generateNodeSummary(node, llm) {
    const prompt = `You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document.

Partial Document Text: ${node.text}

Directly return the description, do not include any other text.`;
    return llmCall(llm, prompt);
}
|
|
835
|
+
/**
 * Generates an LLM summary for every node in the tree, concurrently, and
 * stores each on the node's `summary` field. Mutates the structure in
 * place; resolves when all summaries are attached.
 */
async function generateSummariesForStructure(structure, llm, pr) {
    const nodes = (0, tree_1.structureToList)(structure);
    pr.report('Generating node summaries', `0 / ${nodes.length} nodes`);
    let completed = 0;
    const summarize = async (node) => {
        const text = await generateNodeSummary(node, llm);
        completed++;
        pr.advance('Generating node summaries', `${completed} / ${nodes.length} nodes`);
        return text;
    };
    const summaries = await Promise.all(nodes.map(summarize));
    // Promise.all preserves order, so summaries[i] belongs to nodes[i].
    summaries.forEach((summary, i) => {
        nodes[i].summary = summary;
    });
}
|
|
848
|
+
/**
 * Asks the LLM for a one-sentence description of the whole document, given
 * its (cleaned) structure. Returns the LLM's reply verbatim.
 *
 * Fix: the prompt opened with "Your are an expert" — corrected to
 * "You are an expert" (typo in the user-visible LLM prompt).
 */
async function generateDocDescription(structure, llm, pr) {
    pr.report('Generating document description');
    const prompt = `You are an expert in generating descriptions for a document.
You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.

Document Structure: ${JSON.stringify(structure)}

Directly return the description, do not include any other text.`;
    return llmCall(llm, prompt);
}
|
|
858
|
+
// ─── Tree Parser ──────────────────────────────────────────────────────────────
|
|
859
|
+
/**
 * End-to-end tree construction: detect whether the document has a usable
 * TOC, build the flat TOC via the appropriate strategy, annotate/filter it,
 * convert it into a tree, and sub-index any oversized sections.
 */
async function treeParser(pageList, opts, llm, pr) {
    const tocCheck = await checkToc(pageList, opts, llm, pr);
    // A usable TOC needs non-blank content and explicit page numbers.
    const hasUsableToc = Boolean(tocCheck.toc_content) &&
        Boolean(tocCheck.toc_content.trim()) &&
        tocCheck.page_index_given_in_toc === 'yes';
    let toc = hasUsableToc
        ? await metaProcessor(pageList, 'process_toc_with_page_numbers', opts, llm, pr, 1, tocCheck.toc_content, tocCheck.toc_page_list)
        : await metaProcessor(pageList, 'process_no_toc', opts, llm, pr, 1);
    toc = (0, tree_1.addPrefaceIfNeeded)(toc);
    toc = await checkTitleAppearanceInStartConcurrent(toc, pageList, llm);
    // Only entries with a resolved physical index make it into the tree.
    const resolvedItems = toc.filter((item) => item.physical_index != null);
    pr.report('Resolving large sections', 'Building final tree structure');
    const tocTree = (0, tree_1.postProcessing)(resolvedItems, pageList.length);
    await Promise.all(tocTree.map((node) => processLargeNodeRecursively(node, pageList, opts, llm, pr)));
    return tocTree;
}
|
|
878
|
+
// ─── Public API ───────────────────────────────────────────────────────────────
|
|
879
|
+
/**
|
|
880
|
+
* Builds a hierarchical tree index from a PDF document.
|
|
881
|
+
*
|
|
882
|
+
* Supply either `pdf` (raw PDF bytes, requires pdfjs-dist) or pre-extracted
|
|
883
|
+
* `pages` (array of `{text, tokenCount}` — one entry per page).
|
|
884
|
+
*
|
|
885
|
+
* @example — with OpenAI + progress bar
|
|
886
|
+
* ```ts
|
|
887
|
+
* import { pageIndex } from 'react-native-pageindex';
|
|
888
|
+
* import OpenAI from 'openai';
|
|
889
|
+
*
|
|
890
|
+
* const openai = new OpenAI({ apiKey: '...' });
|
|
891
|
+
*
|
|
892
|
+
* const result = await pageIndex({
|
|
893
|
+
* pages: myExtractedPages,
|
|
894
|
+
* docName: 'annual-report',
|
|
895
|
+
* llm: async (prompt, opts) => {
|
|
896
|
+
* const res = await openai.chat.completions.create({
|
|
897
|
+
* model: 'gpt-4o',
|
|
898
|
+
* messages: [...(opts?.chatHistory ?? []), { role: 'user', content: prompt }],
|
|
899
|
+
* });
|
|
900
|
+
* return { content: res.choices[0].message.content ?? '', finishReason: res.choices[0].finish_reason ?? 'stop' };
|
|
901
|
+
* },
|
|
902
|
+
* options: {
|
|
903
|
+
* onProgress: ({ step, percent, detail }) => {
|
|
904
|
+
* console.log(`[${percent}%] ${step}${detail ? ` — ${detail}` : ''}`);
|
|
905
|
+
* },
|
|
906
|
+
* },
|
|
907
|
+
* });
|
|
908
|
+
* ```
|
|
909
|
+
*/
|
|
910
|
+
async function pageIndex(input) {
    const { pdf, pages: rawPages, llm, docName = 'document', options = {} } = input;
    // Exactly one page source is required; `pages` takes precedence below.
    if (!pdf && !rawPages) {
        throw new Error('[PageIndex] Provide either `pdf` (ArrayBuffer) or `pages` (PageData[])');
    }
    // Merge caller options over package defaults (?? keeps explicit false/0).
    const opts = {
        tocCheckPageNum: options.tocCheckPageNum ?? config_1.DEFAULT_PDF_OPTIONS.tocCheckPageNum,
        maxPageNumEachNode: options.maxPageNumEachNode ?? config_1.DEFAULT_PDF_OPTIONS.maxPageNumEachNode,
        maxTokenNumEachNode: options.maxTokenNumEachNode ?? config_1.DEFAULT_PDF_OPTIONS.maxTokenNumEachNode,
        ifAddNodeId: options.ifAddNodeId ?? config_1.DEFAULT_PDF_OPTIONS.ifAddNodeId,
        ifAddNodeSummary: options.ifAddNodeSummary ?? config_1.DEFAULT_PDF_OPTIONS.ifAddNodeSummary,
        ifAddDocDescription: options.ifAddDocDescription ?? config_1.DEFAULT_PDF_OPTIONS.ifAddDocDescription,
        ifAddNodeText: options.ifAddNodeText ?? config_1.DEFAULT_PDF_OPTIONS.ifAddNodeText,
        counter: options.tokenCounter ?? tokens_1.defaultTokenCounter,
    };
    const pr = new progress_1.ProgressReporter([...PDF_STEPS], options.onProgress);
    pr.report('Initializing');
    let pageList;
    if (rawPages) {
        // Pre-extracted pages win over raw PDF bytes.
        pageList = rawPages;
    }
    else {
        pr.report('Extracting PDF pages');
        // pdfjs-dist is loaded lazily so pre-extracted-pages users don't pay
        // for (or need) the dependency.
        const { extractPdfPages } = await Promise.resolve().then(() => __importStar(require('./utils/pdf')));
        pageList = await extractPdfPages(pdf, opts.counter);
    }
    console.log(`[PageIndex] Processing ${pageList.length} pages`);
    const structure = await treeParser(pageList, opts, llm, pr);
    if (opts.ifAddNodeId)
        (0, tree_1.writeNodeId)(structure);
    if (opts.ifAddNodeSummary) {
        // Summaries need node text; attach it temporarily if the caller
        // didn't ask to keep it.
        if (!opts.ifAddNodeText) {
            pr.report('Attaching page text to nodes');
            (0, tree_1.addNodeText)(structure, pageList);
        }
        await generateSummariesForStructure(structure, llm, pr);
        if (!opts.ifAddNodeText)
            (0, tree_1.removeStructureText)(structure);
        // NOTE(review): ifAddDocDescription is only honored when
        // ifAddNodeSummary is also enabled — verify this nesting is intended.
        if (opts.ifAddDocDescription) {
            const cleanStructure = (0, tree_1.createCleanStructureForDescription)(structure);
            const docDescription = await generateDocDescription(cleanStructure, llm, pr);
            pr.report('Done');
            return { doc_name: docName, doc_description: docDescription, structure };
        }
    }
    else if (opts.ifAddNodeText) {
        pr.report('Attaching page text to nodes');
        (0, tree_1.addNodeText)(structure, pageList);
    }
    pr.report('Done');
    return { doc_name: docName, structure };
}
|
|
962
|
+
//# sourceMappingURL=pageIndex.js.map
|