pageindex 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/toc.ts ADDED
@@ -0,0 +1,468 @@
1
+ /**
2
+ * bun-pageindex: TOC Detection and Extraction
3
+ * Functions for detecting, extracting, and processing table of contents
4
+ */
5
+
6
+ import { chatGPT, chatGPTWithFinishReason, type ClientConfig } from "./openai";
7
+ import type { PdfPage } from "./pdf";
8
+ import type { TocItem, TocCheckResult } from "./types";
9
+ import { extractJson, getJsonContent, countTokens, convertPhysicalIndexToInt, convertPageToInt } from "./utils";
10
+ import * as prompts from "./prompts";
11
+
12
/**
 * Settings shared by every TOC helper in this module.
 */
export interface TocOptions {
  /** Model identifier handed to the chat helpers on every request. */
  model: string;
  /**
   * Page-index limit for TOC scanning: pages at or beyond this index are only
   * examined while a run of consecutive TOC pages is still being extended.
   */
  tocCheckPageNum: number;
  /** Optional API key forwarded unchanged to the chat helpers. */
  apiKey?: string;
  /** Optional endpoint base URL forwarded unchanged to the chat helpers. */
  baseUrl?: string;
}
18
+
19
/**
 * Classify one page of text as table-of-contents or not.
 *
 * Builds the TOC-detector prompt for `content`, sends it through `chatGPT`,
 * and reads the `toc_detected` field of the JSON extracted from the reply.
 *
 * @param content - Text of the page to classify.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns `"yes"` only when the extracted JSON has `toc_detected === "yes"`;
 *          every other outcome (missing JSON, other values) yields `"no"`.
 */
export async function tocDetectorSinglePage(
  content: string,
  options: TocOptions
): Promise<"yes" | "no"> {
  const detectorPrompt = prompts.tocDetectorPrompt(content);
  const request = {
    model: options.model,
    prompt: detectorPrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  };
  const reply = await chatGPT(request);

  const verdict = extractJson<{ toc_detected: string }>(reply);
  if (verdict?.toc_detected === "yes") {
    return "yes";
  }
  return "no";
}
37
+
38
/**
 * Collect the indices of a run of consecutive TOC pages.
 *
 * Scanning starts at `startPageIndex` and classifies pages one at a time with
 * `tocDetectorSinglePage`. It stops as soon as a non-TOC page follows a TOC
 * page, or when the index reaches `options.tocCheckPageNum` without a TOC run
 * being in progress.
 *
 * @param startPageIndex - Index into `pages` at which scanning begins.
 * @param pages - Document pages; falsy slots are skipped without classification.
 * @param options - Supplies the scan limit and the settings for each detector call.
 * @returns Indices of the pages detected as TOC, in ascending order.
 */
export async function findTocPages(
  startPageIndex: number,
  pages: PdfPage[],
  options: TocOptions
): Promise<number[]> {
  const found: number[] = [];
  let insideTocRun = false;

  for (let index = startPageIndex; index < pages.length; index++) {
    // Past the scan limit we only keep going while a TOC run is still open
    if (!insideTocRun && index >= options.tocCheckPageNum) {
      break;
    }

    const current = pages[index];
    if (!current) {
      continue;
    }

    const verdict = await tocDetectorSinglePage(current.text, options);
    if (verdict === "yes") {
      found.push(index);
      insideTocRun = true;
      continue;
    }

    // First non-TOC page after a TOC page closes the run
    if (verdict === "no" && insideTocRun) {
      break;
    }
  }

  return found;
}
76
+
77
/**
 * Collapse TOC dot leaders into a colon separator.
 *
 * Two leader shapes are rewritten to `": "`:
 *  - five or more consecutive dots (`.....`)
 *  - five or more "dot + space" pairs, with an optional trailing dot (`. . . . . .`)
 *
 * Shorter runs are left untouched.
 *
 * @param text - Raw TOC text.
 * @returns The text with every qualifying leader replaced by `": "`.
 */
function transformDotsToColon(text: string): string {
  const solidLeader = /\.{5,}/g;
  const spacedLeader = /(?:\. ){5,}\.?/g;

  // Solid runs first, then spaced runs — same order as two sequential passes
  return text.replace(solidLeader, ": ").replace(spacedLeader, ": ");
}
87
+
88
/**
 * Determine whether the TOC text carries page numbers.
 *
 * Sends the page-index detection prompt for `tocContent` through `chatGPT`
 * and inspects the `page_index_given_in_toc` field of the extracted JSON.
 *
 * @param tocContent - TOC text to inspect.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns `"yes"` only when the field equals `"yes"`; `"no"` in every other case.
 */
export async function detectPageIndex(
  tocContent: string,
  options: TocOptions
): Promise<"yes" | "no"> {
  const detectionPrompt = prompts.detectPageIndexPrompt(tocContent);
  const reply = await chatGPT({
    model: options.model,
    prompt: detectionPrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  });

  const parsed = extractJson<{ page_index_given_in_toc: string }>(reply);
  const hasPageNumbers = parsed?.page_index_given_in_toc === "yes";
  return hasPageNumbers ? "yes" : "no";
}
106
+
107
/**
 * Assemble the TOC text from the given pages and check it for page numbers.
 *
 * The text of each listed page is concatenated in list order (indices with no
 * page are skipped), dot leaders are normalised with `transformDotsToColon`,
 * and the result is passed to `detectPageIndex`.
 *
 * @param pages - All document pages.
 * @param tocPageList - Indices into `pages` that make up the TOC.
 * @param options - Settings forwarded to `detectPageIndex`.
 * @returns The normalised TOC text and the `"yes"`/`"no"` page-number verdict.
 */
export async function tocExtractor(
  pages: PdfPage[],
  tocPageList: number[],
  options: TocOptions
): Promise<{ tocContent: string; pageIndexGivenInToc: "yes" | "no" }> {
  const joined = tocPageList.reduce((acc, idx) => {
    const tocPage = pages[idx];
    return tocPage ? acc + tocPage.text : acc;
  }, "");

  const tocContent = transformDotsToColon(joined);
  const pageIndexGivenInToc = await detectPageIndex(tocContent, options);

  return { tocContent, pageIndexGivenInToc };
}
131
+
132
/**
 * Ask whether a transformed TOC covers the whole raw TOC.
 *
 * Sends the completeness-check prompt built from both texts through `chatGPT`
 * and reads the `completed` field of the extracted JSON.
 *
 * @param rawToc - The original TOC text.
 * @param cleanedToc - The transformed TOC produced so far.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns `true` only when the extracted JSON has `completed === "yes"`.
 */
async function checkTocTransformationComplete(
  rawToc: string,
  cleanedToc: string,
  options: TocOptions
): Promise<boolean> {
  const checkPrompt = prompts.checkTocTransformationCompletePrompt(rawToc, cleanedToc);
  const request = {
    model: options.model,
    prompt: checkPrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  };

  const verdict = extractJson<{ completed: string }>(await chatGPT(request));
  return verdict?.completed === "yes";
}
151
+
152
/**
 * Convert raw TOC text into a list of `TocItem`s.
 *
 * Flow:
 *  1. Request a transformation and check it with `checkTocTransformationComplete`.
 *  2. If it is complete, the finish reason is `"finished"`, and the reply holds
 *     a `table_of_contents` field, return that directly.
 *  3. Otherwise request continuations (at most five) and append each to the
 *     accumulated output until the same completion test passes.
 *  4. Parse the accumulated text as JSON. Any failure in parsing or conversion
 *     is logged and results in an empty list.
 *
 * @param tocContent - Raw TOC text.
 * @param options - Model name plus optional API key / base URL for every request.
 * @returns The TOC items after `convertPageToInt`, or `[]` on parse failure.
 */
export async function tocTransformer(
  tocContent: string,
  options: TocOptions
): Promise<TocItem[]> {
  const MAX_CONTINUATIONS = 5;

  const initialPrompt = prompts.tocTransformerPrompt(tocContent);
  const first = await chatGPTWithFinishReason({
    model: options.model,
    prompt: initialPrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  });

  let accumulated = first.content;
  let finishReason = first.finishReason;
  let verified = await checkTocTransformationComplete(tocContent, accumulated, options);

  // Single definition of "done", shared by the fast path and the loop
  const done = (): boolean => verified && finishReason === "finished";

  if (done()) {
    const direct = extractJson<{ table_of_contents: TocItem[] }>(accumulated);
    if (direct?.table_of_contents) {
      return convertPageToInt(direct.table_of_contents);
    }
  }

  // Continuation path: work on the bare JSON text from here on
  accumulated = getJsonContent(accumulated);

  for (let round = 0; round < MAX_CONTINUATIONS && !done(); round++) {
    // Cut back to the last closing brace; the +2 also keeps the single
    // character that follows it (presumably the separating comma — confirm)
    const lastBrace = accumulated.lastIndexOf("}");
    if (lastBrace !== -1) {
      accumulated = accumulated.slice(0, lastBrace + 2);
    }

    const continuationPrompt = prompts.tocTransformerContinuePrompt(tocContent, accumulated);
    const next = await chatGPTWithFinishReason({
      model: options.model,
      prompt: continuationPrompt,
      apiKey: options.apiKey,
      baseUrl: options.baseUrl,
    });
    finishReason = next.finishReason;

    // Only fenced replies are unwrapped; anything else is appended verbatim
    const piece = next.content.startsWith("```json")
      ? getJsonContent(next.content)
      : next.content;
    accumulated += piece;

    verified = await checkTocTransformationComplete(tocContent, accumulated, options);
  }

  try {
    const parsed = JSON.parse(accumulated);
    return convertPageToInt(parsed.table_of_contents || parsed);
  } catch {
    console.error("Failed to parse TOC JSON");
    return [];
  }
}
217
+
218
/**
 * Ask the model to attach physical page indices to TOC items.
 *
 * The TOC is serialised with `JSON.stringify` and sent, together with the
 * page content, through `chatGPT` using the index-extractor prompt.
 *
 * @param toc - TOC items to locate.
 * @param content - Document text in which to locate them.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns The JSON extracted from the reply, or `[]` when nothing usable was extracted.
 */
export async function tocIndexExtractor(
  toc: TocItem[],
  content: string,
  options: TocOptions
): Promise<TocItem[]> {
  const serializedToc = JSON.stringify(toc);
  const extractorPrompt = prompts.tocIndexExtractorPrompt(serializedToc, content);

  const reply = await chatGPT({
    model: options.model,
    prompt: extractorPrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  });

  const located = extractJson<TocItem[]>(reply);
  if (!located) {
    return [];
  }
  return located;
}
237
+
238
/**
 * Generate an initial TOC from a document part that has no TOC of its own.
 *
 * @param part - Document text to derive the TOC from.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns The JSON extracted from the reply, or `[]` when nothing usable was extracted.
 * @throws Error with message `Generation incomplete: <finishReason>` when the
 *         reported finish reason is anything other than `"finished"`.
 */
export async function generateTocInit(
  part: string,
  options: TocOptions
): Promise<TocItem[]> {
  const initPrompt = prompts.generateTocInitPrompt(part);
  const { content, finishReason } = await chatGPTWithFinishReason({
    model: options.model,
    prompt: initPrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  });

  // Reject incomplete replies up front instead of parsing partial output
  if (finishReason !== "finished") {
    throw new Error(`Generation incomplete: ${finishReason}`);
  }

  const generated = extractJson<TocItem[]>(content);
  return generated ? generated : [];
}
260
+
261
/**
 * Extend TOC generation to a further document part, given the structure so far.
 *
 * The existing structure is pretty-printed (2-space `JSON.stringify`) into the
 * continuation prompt alongside the new part.
 *
 * @param tocContent - TOC items generated from earlier parts.
 * @param part - Next portion of document text.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns The JSON extracted from the reply, or `[]` when nothing usable was extracted.
 * @throws Error with message `Generation incomplete: <finishReason>` when the
 *         reported finish reason is anything other than `"finished"`.
 */
export async function generateTocContinue(
  tocContent: TocItem[],
  part: string,
  options: TocOptions
): Promise<TocItem[]> {
  const previousStructure = JSON.stringify(tocContent, null, 2);
  const continuePrompt = prompts.generateTocContinuePrompt(part, previousStructure);

  const outcome = await chatGPTWithFinishReason({
    model: options.model,
    prompt: continuePrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  });

  if (outcome.finishReason !== "finished") {
    throw new Error(`Generation incomplete: ${outcome.finishReason}`);
  }

  const generated = extractJson<TocItem[]>(outcome.content);
  return generated ? generated : [];
}
284
+
285
/**
 * Ask the model to fill page numbers into an existing TOC structure.
 *
 * The structure is pretty-printed (2-space `JSON.stringify`) into the prompt
 * together with the document part. Any `start` field on the returned items is
 * removed before they are handed back.
 *
 * The generic argument to `extractJson` is erased at runtime, so the reply is
 * checked here: anything that is not an array is treated like a missing reply,
 * and non-object entries are left alone rather than crashing the clean-up loop.
 *
 * @param part - Document text in which to locate the TOC entries.
 * @param structure - Current TOC items; returned unchanged when the reply is unusable.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns The items from the reply with `start` stripped, or `structure` as a fallback.
 */
export async function addPageNumberToToc(
  part: string,
  structure: TocItem[],
  options: TocOptions
): Promise<TocItem[]> {
  const prompt = prompts.addPageNumberToTocPrompt(part, JSON.stringify(structure, null, 2));
  const response = await chatGPT({
    model: options.model,
    prompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  });

  const parsed: unknown = extractJson<TocItem[]>(response);

  // A truthy non-array (e.g. an object wrapper) would throw in for...of below
  if (!Array.isArray(parsed)) {
    return structure;
  }

  // Remove the 'start' field from items; skip null/primitive entries
  for (const entry of parsed) {
    if (entry !== null && typeof entry === "object") {
      delete (entry as Record<string, unknown>).start;
    }
  }

  return parsed as TocItem[];
}
311
+
312
/**
 * Verify that a TOC item's title appears on the page it points to.
 *
 * Items with a falsy `physicalIndex` are answered `"no"` without any request.
 * Otherwise the page at `physicalIndex - startIndex` is looked up (empty text
 * when absent) and the title-appearance prompt is sent through `chatGPT`.
 *
 * @param item - TOC item to verify.
 * @param pages - Pages to look the item up in.
 * @param startIndex - Physical index that corresponds to `pages[0]`.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns The item's `listIndex` and `title`, the `"yes"`/`"no"` answer, and
 *          the page number checked (`undefined` when no lookup happened).
 */
export async function checkTitleAppearance(
  item: TocItem,
  pages: PdfPage[],
  startIndex: number,
  options: TocOptions
): Promise<{ listIndex: number | undefined; answer: "yes" | "no"; title: string; pageNumber: number | undefined }> {
  const title = item.title;
  const pageNumber = item.physicalIndex;

  // Nothing to check against without a physical index
  if (!pageNumber) {
    return { listIndex: item.listIndex, answer: "no", title, pageNumber: undefined };
  }

  const targetPage = pages[pageNumber - startIndex];
  const pageText = targetPage?.text || "";

  const appearancePrompt = prompts.checkTitleAppearancePrompt(title, pageText);
  const reply = await chatGPT({
    model: options.model,
    prompt: appearancePrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  });

  const parsed = extractJson<{ answer: string }>(reply);
  const answer: "yes" | "no" = parsed?.answer === "yes" ? "yes" : "no";

  return { listIndex: item.listIndex, answer, title, pageNumber };
}
343
+
344
/**
 * Check whether a section title sits at the beginning of a page.
 *
 * Sends the title-at-beginning prompt through `chatGPT` and reads the
 * `start_begin` field of the extracted JSON.
 *
 * @param title - Section title to look for.
 * @param pageText - Text of the page to inspect.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns `"yes"` only when `start_begin === "yes"`; `"no"` in every other case.
 */
export async function checkTitleAppearanceInStart(
  title: string,
  pageText: string,
  options: TocOptions
): Promise<"yes" | "no"> {
  const startPrompt = prompts.checkTitleStartAtBeginningPrompt(title, pageText);
  const request = {
    model: options.model,
    prompt: startPrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  };
  const reply = await chatGPT(request);

  const parsed = extractJson<{ start_begin: string }>(reply);
  if (parsed?.start_begin === "yes") {
    return "yes";
  }
  return "no";
}
363
+
364
/**
 * Run the "title at start of page" check for many TOC items concurrently.
 *
 * All checks are started together and awaited with `Promise.all`, so the
 * result array keeps the order of `structure` regardless of which request
 * finishes first. Items with a falsy `physicalIndex` are marked
 * `appearStart: "no"` without issuing a request.
 *
 * The page for an item is looked up at `physicalIndex - 1`; a missing page is
 * checked as empty text.
 *
 * If any check rejects, the returned promise rejects with that error
 * (requests already in flight are not cancelled).
 *
 * @param structure - TOC items to annotate; the input objects are not mutated.
 * @param pages - Document pages.
 * @param options - Settings forwarded to `checkTitleAppearanceInStart`.
 * @returns Shallow copies of the items, each with an `appearStart` field.
 */
export async function checkTitleAppearanceInStartConcurrent(
  structure: TocItem[],
  pages: PdfPage[],
  options: TocOptions
): Promise<TocItem[]> {
  return Promise.all(
    structure.map(async (item): Promise<TocItem> => {
      if (!item.physicalIndex) {
        return { ...item, appearStart: "no" };
      }

      const pageText = pages[item.physicalIndex - 1]?.text || "";
      const appearStart = await checkTitleAppearanceInStart(item.title, pageText, options);
      return { ...item, appearStart };
    })
  );
}
387
+
388
/**
 * Locate the document's table of contents and report whether it has page numbers.
 *
 * Steps:
 *  1. Find the first run of TOC pages starting at index 0. If there is none,
 *     report no TOC.
 *  2. Extract that run. If it carries page numbers, return it.
 *  3. Otherwise keep searching after it, while the cursor is inside both the
 *     document and `options.tocCheckPageNum`, for a later run that does carry
 *     page numbers, and return that run instead if one is found.
 *  4. Failing that, return the first run flagged as having no page numbers.
 *
 * @param pages - Document pages.
 * @param options - Scan limit and request settings.
 * @returns TOC text (or `null`), the page indices it came from, and the page-number flag.
 */
export async function checkToc(
  pages: PdfPage[],
  options: TocOptions
): Promise<TocCheckResult> {
  const firstRun = await findTocPages(0, pages, options);
  if (firstRun.length === 0) {
    return { tocContent: null, tocPageList: [], pageIndexGivenInToc: "no" };
  }

  const firstExtract = await tocExtractor(pages, firstRun, options);
  if (firstExtract.pageIndexGivenInToc === "yes") {
    return {
      tocContent: firstExtract.tocContent,
      tocPageList: firstRun,
      pageIndexGivenInToc: "yes",
    };
  }

  // Look past the first run for a later TOC that does list page numbers
  const firstRunEnd = firstRun[firstRun.length - 1];
  let cursor = firstRunEnd !== undefined ? firstRunEnd + 1 : 0;

  while (cursor < pages.length && cursor < options.tocCheckPageNum) {
    const laterRun = await findTocPages(cursor, pages, options);
    if (laterRun.length === 0) {
      break;
    }

    const laterExtract = await tocExtractor(pages, laterRun, options);
    if (laterExtract.pageIndexGivenInToc === "yes") {
      return {
        tocContent: laterExtract.tocContent,
        tocPageList: laterRun,
        pageIndexGivenInToc: "yes",
      };
    }

    // Resume right after this run; with no last index, end the search
    const laterRunEnd = laterRun[laterRun.length - 1];
    cursor = laterRunEnd !== undefined ? laterRunEnd + 1 : pages.length;
  }

  return {
    tocContent: firstExtract.tocContent,
    tocPageList: firstRun,
    pageIndexGivenInToc: "no",
  };
}
446
+
447
/**
 * Re-locate the physical index of a single TOC section.
 *
 * Sends the index-fixer prompt through `chatGPT`, reads `physical_index` from
 * the extracted JSON, and converts it with `convertPhysicalIndexToInt`.
 *
 * @param sectionTitle - Title of the section to locate.
 * @param content - Document text to search.
 * @param options - Model name plus optional API key / base URL for the request.
 * @returns The converted index when the conversion yields a number; `null`
 *          when the field is missing/empty or the conversion is not numeric.
 */
export async function singleTocItemIndexFixer(
  sectionTitle: string,
  content: string,
  options: TocOptions
): Promise<number | null> {
  const fixerPrompt = prompts.singleTocItemIndexFixerPrompt(sectionTitle, content);
  const reply = await chatGPT({
    model: options.model,
    prompt: fixerPrompt,
    apiKey: options.apiKey,
    baseUrl: options.baseUrl,
  });

  const rawIndex = extractJson<{ physical_index: string }>(reply)?.physical_index;
  if (!rawIndex) {
    return null;
  }

  const converted = convertPhysicalIndexToInt(rawIndex);
  if (typeof converted === "number") {
    return converted;
  }
  return null;
}