pdf-oxide 0.3.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -0
- package/binding.gyp +35 -0
- package/package.json +78 -0
- package/src/builders/annotation-builder.ts +367 -0
- package/src/builders/conversion-options-builder.ts +257 -0
- package/src/builders/index.ts +12 -0
- package/src/builders/metadata-builder.ts +317 -0
- package/src/builders/pdf-builder.ts +386 -0
- package/src/builders/search-options-builder.ts +151 -0
- package/src/document-editor-manager.ts +318 -0
- package/src/errors.ts +1629 -0
- package/src/form-field-manager.ts +666 -0
- package/src/hybrid-ml-manager.ts +283 -0
- package/src/index.ts +453 -0
- package/src/managers/accessibility-manager.ts +338 -0
- package/src/managers/annotation-manager.ts +439 -0
- package/src/managers/barcode-manager.ts +235 -0
- package/src/managers/batch-manager.ts +533 -0
- package/src/managers/cache-manager.ts +486 -0
- package/src/managers/compliance-manager.ts +375 -0
- package/src/managers/content-manager.ts +339 -0
- package/src/managers/document-utility-manager.ts +922 -0
- package/src/managers/dom-pdf-creator.ts +365 -0
- package/src/managers/editing-manager.ts +514 -0
- package/src/managers/enterprise-manager.ts +478 -0
- package/src/managers/extended-managers.ts +437 -0
- package/src/managers/extraction-manager.ts +583 -0
- package/src/managers/final-utilities.ts +429 -0
- package/src/managers/hybrid-ml-advanced.ts +479 -0
- package/src/managers/index.ts +239 -0
- package/src/managers/layer-manager.ts +500 -0
- package/src/managers/metadata-manager.ts +303 -0
- package/src/managers/ocr-manager.ts +756 -0
- package/src/managers/optimization-manager.ts +262 -0
- package/src/managers/outline-manager.ts +196 -0
- package/src/managers/page-manager.ts +289 -0
- package/src/managers/pattern-detection.ts +440 -0
- package/src/managers/rendering-manager.ts +863 -0
- package/src/managers/search-manager.ts +385 -0
- package/src/managers/security-manager.ts +345 -0
- package/src/managers/signature-manager.ts +1664 -0
- package/src/managers/streams.ts +618 -0
- package/src/managers/xfa-manager.ts +500 -0
- package/src/pdf-creator-manager.ts +494 -0
- package/src/properties.ts +522 -0
- package/src/result-accessors-manager.ts +867 -0
- package/src/tests/advanced-features.test.ts +414 -0
- package/src/tests/advanced.test.ts +266 -0
- package/src/tests/extended-managers.test.ts +316 -0
- package/src/tests/final-utilities.test.ts +455 -0
- package/src/tests/foundation.test.ts +315 -0
- package/src/tests/high-demand.test.ts +257 -0
- package/src/tests/specialized.test.ts +97 -0
- package/src/thumbnail-manager.ts +272 -0
- package/src/types/common.ts +142 -0
- package/src/types/document-types.ts +457 -0
- package/src/types/index.ts +6 -0
- package/src/types/manager-types.ts +284 -0
- package/src/types/native-bindings.ts +517 -0
- package/src/workers/index.ts +7 -0
- package/src/workers/pool.ts +274 -0
- package/src/workers/worker.ts +131 -0
|
@@ -0,0 +1,583 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Manager for content extraction from PDF documents
|
|
3
|
+
*
|
|
4
|
+
* Caching is handled automatically at the Rust FFI layer, eliminating
|
|
5
|
+
* the need for duplicate cache implementations in the binding.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
* import { PdfDocument, ExtractionManager, ConversionOptionsBuilder } from 'pdf_oxide';
|
|
10
|
+
*
|
|
11
|
+
* const doc = PdfDocument.open('document.pdf');
|
|
12
|
+
* const extractionManager = new ExtractionManager(doc);
|
|
13
|
+
*
|
|
14
|
+
* // Extract text from a single page
|
|
15
|
+
* const text = extractionManager.extractText(0);
|
|
16
|
+
* console.log(text);
|
|
17
|
+
*
|
|
18
|
+
* // Extract all text
|
|
19
|
+
* const allText = extractionManager.extractAllText();
|
|
20
|
+
*
|
|
21
|
+
* // Extract with custom options
|
|
22
|
+
* const options = ConversionOptionsBuilder.highQuality().build();
|
|
23
|
+
* const markdown = extractionManager.extractMarkdown(0, options);
|
|
24
|
+
* ```
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
/**
 * Aggregate text statistics for a whole document, as produced by
 * `ExtractionManager.getContentStatistics()`.
 */
export interface ContentStatistics {
  /** Number of pages reported by the document. */
  pageCount: number;
  /** Total number of whitespace-delimited words across all pages. */
  wordCount: number;
  /** Sum of the per-page text lengths, in UTF-16 code units (whitespace included). */
  characterCount: number;
  /** `wordCount / pageCount`, rounded to the nearest integer; 0 when there are no pages. */
  averageWordsPerPage: number;
  /** `characterCount / pageCount`, rounded to the nearest integer; 0 when there are no pages. */
  averageCharactersPerPage: number;
}

/**
 * A single hit produced by `ExtractionManager.searchContent()`.
 */
export interface SearchMatch {
  /** Zero-based index of the page containing the match. */
  pageIndex: number;
  /** One-based page number (`pageIndex + 1`). */
  pageNumber: number;
  /** Offset of the match inside the page text, in UTF-16 code units. */
  matchIndex: number;
  /** The match plus surrounding context, with newlines replaced by spaces. */
  snippet: string;
  /** The matched text exactly as it appears on the page (original casing). */
  matchText: string;
}

/** Extraction formats sent to the worker pool's `extract` operation. */
type WorkerExtractionType = 'text' | 'markdown' | 'html';

/** Returns a printable message for any thrown value, not only `Error` instances. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Counts whitespace-delimited words; empty or whitespace-only text yields 0. */
function countWords(text: string): number {
  return text.split(/\s+/).filter((word) => word.length > 0).length;
}

/** Escapes every regular-expression metacharacter so `literal` matches itself. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Extracts text and Markdown from a PDF document and derives simple
 * statistics and search results from the extracted text.
 *
 * The wrapped document object is used through three members only:
 * `pageCount`, `extractText(pageIndex)` and `toMarkdown(pageIndex, options)`.
 */
export class ExtractionManager {
  // The document comes from the native binding; its type is not declared in this file.
  private _document: any;

  /**
   * Creates a new ExtractionManager for the given document.
   * @param document - The PDF document
   * @throws Error if document is null, undefined or otherwise falsy
   */
  constructor(document: any) {
    if (!document) {
      throw new Error('Document is required');
    }
    this._document = document;
  }

  /**
   * Validates a single zero-based page index against the document.
   * Rejects non-numbers, NaN, fractional and negative values before the
   * range check so that only valid integers reach the native layer.
   */
  private _assertPageIndex(pageIndex: number): void {
    if (!Number.isInteger(pageIndex) || pageIndex < 0) {
      throw new Error('Page index must be a non-negative number');
    }
    if (pageIndex >= this._document.pageCount) {
      throw new Error(`Page index ${pageIndex} out of range`);
    }
  }

  /** Validates an inclusive, zero-based page range against the document. */
  private _assertPageRange(startPageIndex: number, endPageIndex: number): void {
    if (!Number.isInteger(startPageIndex) || startPageIndex < 0) {
      throw new Error('Start page index must be a non-negative number');
    }
    if (!Number.isInteger(endPageIndex) || endPageIndex < startPageIndex) {
      throw new Error('End page index must be >= start page index');
    }
    if (endPageIndex >= this._document.pageCount) {
      throw new Error(`End page index ${endPageIndex} out of range`);
    }
  }

  /** Runs `extractPage` for every index in the inclusive range and collects the results in order. */
  private _collectPages(
    firstPageIndex: number,
    lastPageIndex: number,
    extractPage: (pageIndex: number) => string
  ): string[] {
    const collected: string[] = [];
    for (let i = firstPageIndex; i <= lastPageIndex; i++) {
      collected.push(extractPage(i));
    }
    return collected;
  }

  /** Builds the "## Page N" section used by the multi-page Markdown extractors. */
  private _markdownSection(pageIndex: number, options?: Record<string, any>): string {
    const heading = `\n## Page ${pageIndex + 1}\n`;
    return heading + this.extractMarkdown(pageIndex, options);
  }

  /**
   * Extracts text from a single page.
   *
   * The returned value is a JS `string`, so `text.length` reports UTF-16
   * code units, not bytes. Use `Buffer.byteLength(text, 'utf8')` if you
   * need the UTF-8 byte count.
   *
   * @param pageIndex - Zero-based page index
   * @param options - Accepted for signature parity with the other
   *   extractors; the current implementation does not forward it to the
   *   document's `extractText` call
   * @returns Extracted text
   * @throws Error if the page index is not a non-negative integer, is out
   *   of range, or if the underlying extraction fails
   *
   * @example
   * ```typescript
   * const text = manager.extractText(0);
   * console.log(`Page 1: ${text.length} UTF-16 code units`);
   * console.log(`        ${Buffer.byteLength(text, 'utf8')} UTF-8 bytes`);
   * ```
   */
  extractText(pageIndex: number, options?: Record<string, any>): string {
    this._assertPageIndex(pageIndex);

    try {
      return this._document.extractText(pageIndex);
    } catch (error) {
      throw new Error(`Failed to extract text from page ${pageIndex}: ${describeError(error)}`);
    }
  }

  /**
   * Extracts text from all pages.
   * @param options - Conversion options (passed through to `extractText`)
   * @returns Text of every page, joined with a single newline
   * @throws Error if extraction of any page fails
   *
   * @example
   * ```typescript
   * const allText = manager.extractAllText();
   * console.log(`Total characters: ${allText.length}`);
   * ```
   */
  extractAllText(options?: Record<string, any>): string {
    try {
      const lastPageIndex = this._document.pageCount - 1;
      return this._collectPages(0, lastPageIndex, (i) => this.extractText(i, options)).join('\n');
    } catch (error) {
      throw new Error(`Failed to extract all text: ${describeError(error)}`);
    }
  }

  /**
   * Extracts text from a range of pages.
   * @param startPageIndex - Zero-based start page index
   * @param endPageIndex - Zero-based end page index (inclusive)
   * @param options - Conversion options (passed through to `extractText`)
   * @returns Text of the pages in range, joined with a single newline
   * @throws Error if the range is invalid or extraction fails
   *
   * @example
   * ```typescript
   * const text = manager.extractTextRange(0, 10);
   * console.log(`Text from pages 1-11: ${text}`);
   * ```
   */
  extractTextRange(
    startPageIndex: number,
    endPageIndex: number,
    options?: Record<string, any>
  ): string {
    this._assertPageRange(startPageIndex, endPageIndex);

    try {
      return this._collectPages(startPageIndex, endPageIndex, (i) =>
        this.extractText(i, options)
      ).join('\n');
    } catch (error) {
      throw new Error(`Failed to extract text range: ${describeError(error)}`);
    }
  }

  /**
   * Extracts text from specific page indices (non-contiguous).
   * @param pageIndices - Array of zero-based page indices
   * @param options - Conversion options (passed through to `extractText`)
   * @returns Text of the requested pages, in the given order, joined with newlines;
   *   an empty string when `pageIndices` is empty
   * @throws Error if `pageIndices` is not an array, contains an invalid
   *   index, or extraction fails
   *
   * @example
   * ```typescript
   * const text = manager.extractTextBatch([0, 2, 5]); // Extract pages 1, 3, 6
   * console.log(text);
   * ```
   */
  extractTextBatch(pageIndices: number[], options?: Record<string, any>): string {
    if (!Array.isArray(pageIndices)) {
      throw new Error('Page indices must be an array');
    }

    if (pageIndices.length === 0) {
      return '';
    }

    try {
      const parts: string[] = [];
      for (const pageIndex of pageIndices) {
        // Validated inside the try so that an invalid entry is reported
        // with the "Failed to extract text batch" prefix, as before.
        const isValid =
          Number.isInteger(pageIndex) && pageIndex >= 0 && pageIndex < this._document.pageCount;
        if (!isValid) {
          throw new Error(`Invalid page index: ${pageIndex}`);
        }
        parts.push(this.extractText(pageIndex, options));
      }
      return parts.join('\n');
    } catch (error) {
      throw new Error(`Failed to extract text batch: ${describeError(error)}`);
    }
  }

  /**
   * Extracts text from pages as an array (one entry per page).
   * @param startPageIndex - Zero-based start page index
   * @param endPageIndex - Zero-based end page index (inclusive)
   * @param options - Conversion options (passed through to `extractText`)
   * @returns Array of extracted text, one per page
   * @throws Error if the range is invalid or extraction fails
   *
   * @example
   * ```typescript
   * const pages = manager.extractTextArray(0, 5);
   * pages.forEach((text, i) => console.log(`Page ${i}: ${text.length} chars`));
   * ```
   */
  extractTextArray(
    startPageIndex: number,
    endPageIndex: number,
    options?: Record<string, any>
  ): string[] {
    this._assertPageRange(startPageIndex, endPageIndex);

    try {
      return this._collectPages(startPageIndex, endPageIndex, (i) => this.extractText(i, options));
    } catch (error) {
      throw new Error(`Failed to extract text array: ${describeError(error)}`);
    }
  }

  /**
   * Extracts a page as Markdown by calling the document's `toMarkdown`.
   * @param pageIndex - Zero-based page index
   * @param options - Conversion options, forwarded to `toMarkdown`
   * @returns Page content as Markdown
   * @throws Error if the page index is invalid or conversion fails
   *
   * @example
   * ```typescript
   * const markdown = manager.extractMarkdown(0);
   * console.log(markdown); // Markdown formatted content
   * ```
   */
  extractMarkdown(pageIndex: number, options?: Record<string, any>): string {
    this._assertPageIndex(pageIndex);

    try {
      return this._document.toMarkdown(pageIndex, options);
    } catch (error) {
      throw new Error(`Failed to extract markdown from page ${pageIndex}: ${describeError(error)}`);
    }
  }

  /**
   * Extracts all pages as Markdown. Each page is preceded by a
   * `## Page N` heading (one-based) and sections are joined with newlines.
   * @param options - Conversion options
   * @returns All pages as Markdown
   * @throws Error if conversion of any page fails
   *
   * @example
   * ```typescript
   * const markdown = manager.extractAllMarkdown();
   * // Write to file
   * fs.writeFileSync('output.md', markdown);
   * ```
   */
  extractAllMarkdown(options?: Record<string, any>): string {
    try {
      const lastPageIndex = this._document.pageCount - 1;
      return this._collectPages(0, lastPageIndex, (i) => this._markdownSection(i, options)).join(
        '\n'
      );
    } catch (error) {
      throw new Error(`Failed to extract all markdown: ${describeError(error)}`);
    }
  }

  /**
   * Extracts Markdown from a range of pages, using the same
   * `## Page N` sectioning as `extractAllMarkdown`.
   * @param startPageIndex - Zero-based start page index
   * @param endPageIndex - Zero-based end page index (inclusive)
   * @param options - Conversion options
   * @returns Extracted Markdown from pages in range
   * @throws Error if the range is invalid or conversion fails
   */
  extractMarkdownRange(
    startPageIndex: number,
    endPageIndex: number,
    options?: Record<string, any>
  ): string {
    this._assertPageRange(startPageIndex, endPageIndex);

    try {
      return this._collectPages(startPageIndex, endPageIndex, (i) =>
        this._markdownSection(i, options)
      ).join('\n');
    } catch (error) {
      throw new Error(`Failed to extract markdown range: ${describeError(error)}`);
    }
  }

  /**
   * Gets the word count for a page.
   * @param pageIndex - Zero-based page index
   * @returns Number of whitespace-delimited words; 0 for an empty or
   *   whitespace-only page
   */
  getPageWordCount(pageIndex: number): number {
    return countWords(this.extractText(pageIndex));
  }

  /**
   * Gets the total word count for all pages.
   * @returns Total number of whitespace-delimited words across all pages
   */
  getTotalWordCount(): number {
    return countWords(this.extractAllText());
  }

  /**
   * Gets the character count for a page.
   * @param pageIndex - Zero-based page index
   * @returns Length of the page text in UTF-16 code units (including whitespace)
   */
  getPageCharacterCount(pageIndex: number): number {
    return this.extractText(pageIndex).length;
  }

  /**
   * Gets the total character count for all pages.
   * @returns Sum of the per-page character counts (page separators are not counted)
   */
  getTotalCharacterCount(): number {
    let total = 0;
    for (let i = 0; i < this._document.pageCount; i++) {
      total += this.getPageCharacterCount(i);
    }
    return total;
  }

  /**
   * Gets the line count for a page.
   * @param pageIndex - Zero-based page index
   * @returns Number of `\n`-separated segments in the page text (an empty page counts as 1)
   */
  getPageLineCount(pageIndex: number): number {
    return this.extractText(pageIndex).split('\n').length;
  }

  /**
   * Gets statistics for the extracted content of the whole document.
   * @returns Content statistics object; both averages are 0 for a document with no pages
   * @throws Error if extraction of any page fails
   *
   * @example
   * ```typescript
   * const stats = manager.getContentStatistics();
   * console.log(`Total pages: ${stats.pageCount}`);
   * console.log(`Total words: ${stats.wordCount}`);
   * console.log(`Average page length: ${stats.averageCharactersPerPage}`);
   * ```
   */
  getContentStatistics(): ContentStatistics {
    try {
      const pageCount = this._document.pageCount;
      const totalWords = this.getTotalWordCount();
      const totalCharacters = this.getTotalCharacterCount();
      // Guard the division: 0 / 0 would otherwise surface as NaN.
      const perPage = (total: number): number =>
        pageCount > 0 ? Math.round(total / pageCount) : 0;

      return {
        pageCount,
        wordCount: totalWords,
        characterCount: totalCharacters,
        averageWordsPerPage: perPage(totalWords),
        averageCharactersPerPage: perPage(totalCharacters),
      };
    } catch (error) {
      throw new Error(`Failed to get content statistics: ${describeError(error)}`);
    }
  }

  /**
   * Searches for text across all pages and returns matching snippets.
   *
   * The search is case-insensitive and treats `searchText` literally:
   * regular-expression metacharacters such as `(`, `.` or `+` match
   * themselves. Pages whose text cannot be extracted are skipped.
   *
   * @param searchText - Text to search for (non-empty)
   * @param contextLength - Characters of context to include on each side of the match
   * @returns Array of match objects, ordered by page and then by position
   * @throws Error if `searchText` is empty or not a string
   *
   * @example
   * ```typescript
   * const matches = manager.searchContent('keyword', 50);
   * matches.forEach(match => {
   *   console.log(`Page ${match.pageIndex + 1}: ...${match.snippet}...`);
   * });
   * ```
   */
  searchContent(searchText: string, contextLength: number = 100): SearchMatch[] {
    if (!searchText || typeof searchText !== 'string') {
      throw new Error('Search text must be a non-empty string');
    }

    const results: SearchMatch[] = [];
    // Escaping guarantees a non-empty literal match, so iteration always advances.
    const pattern = new RegExp(escapeRegExp(searchText), 'gi');

    for (let i = 0; i < this._document.pageCount; i++) {
      try {
        const text = this.extractText(i);

        // matchAll works on a private copy of the regex, so no lastIndex
        // bookkeeping is needed between pages.
        for (const match of text.matchAll(pattern)) {
          const matchIndex = match.index ?? 0;
          const start = Math.max(0, matchIndex - contextLength);
          const end = Math.min(text.length, matchIndex + match[0].length + contextLength);

          results.push({
            pageIndex: i,
            pageNumber: i + 1,
            matchIndex,
            snippet: text.substring(start, end).replace(/\n/g, ' '),
            matchText: match[0],
          });
        }
      } catch {
        // Deliberate best-effort: a page that fails extraction is skipped
        // so the remaining pages can still be searched.
      }
    }

    return results;
  }

  /**
   * Sends one `extract` task to the worker pool and unwraps its result.
   * Shared implementation of the public `*InWorker` methods.
   */
  private async _extractInWorker(
    type: WorkerExtractionType,
    documentPath: string,
    pageIndex: number,
    options?: Record<string, any>,
    timeout?: number
  ): Promise<string> {
    // Loaded lazily so the worker module is only imported when a worker method is used.
    const { workerPool } = await import('../workers/index.js');

    const result = await workerPool.runTask(
      {
        operation: 'extract',
        documentPath,
        params: {
          type,
          pageIndex,
          options: options ?? {},
        },
      },
      timeout
    );

    if (!result.success) {
      throw new Error(`Worker extraction failed: ${describeError(result.error)}`);
    }

    return result.data as string;
  }

  /**
   * Extract text from a page in a worker thread (non-blocking).
   * @param documentPath - Path to the PDF document
   * @param pageIndex - Page index to extract from
   * @param options - Optional extraction options
   * @param timeout - Optional timeout in milliseconds
   * @returns Promise resolving to extracted text
   * @throws Error if the worker reports a failure
   */
  async extractTextInWorker(
    documentPath: string,
    pageIndex: number,
    options?: Record<string, any>,
    timeout?: number
  ): Promise<string> {
    return this._extractInWorker('text', documentPath, pageIndex, options, timeout);
  }

  /**
   * Extract markdown from a page in a worker thread (non-blocking).
   * @param documentPath - Path to the PDF document
   * @param pageIndex - Page index to extract from
   * @param options - Optional extraction options
   * @param timeout - Optional timeout in milliseconds
   * @returns Promise resolving to extracted markdown
   * @throws Error if the worker reports a failure
   */
  async extractMarkdownInWorker(
    documentPath: string,
    pageIndex: number,
    options?: Record<string, any>,
    timeout?: number
  ): Promise<string> {
    return this._extractInWorker('markdown', documentPath, pageIndex, options, timeout);
  }

  /**
   * Extract HTML from a page in a worker thread (non-blocking).
   * @param documentPath - Path to the PDF document
   * @param pageIndex - Page index to extract from
   * @param options - Optional extraction options
   * @param timeout - Optional timeout in milliseconds
   * @returns Promise resolving to extracted HTML
   * @throws Error if the worker reports a failure
   */
  async extractHtmlInWorker(
    documentPath: string,
    pageIndex: number,
    options?: Record<string, any>,
    timeout?: number
  ): Promise<string> {
    return this._extractInWorker('html', documentPath, pageIndex, options, timeout);
  }
}
|