pdf-oxide 0.3.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/README.md +218 -0
  2. package/binding.gyp +35 -0
  3. package/package.json +78 -0
  4. package/src/builders/annotation-builder.ts +367 -0
  5. package/src/builders/conversion-options-builder.ts +257 -0
  6. package/src/builders/index.ts +12 -0
  7. package/src/builders/metadata-builder.ts +317 -0
  8. package/src/builders/pdf-builder.ts +386 -0
  9. package/src/builders/search-options-builder.ts +151 -0
  10. package/src/document-editor-manager.ts +318 -0
  11. package/src/errors.ts +1629 -0
  12. package/src/form-field-manager.ts +666 -0
  13. package/src/hybrid-ml-manager.ts +283 -0
  14. package/src/index.ts +453 -0
  15. package/src/managers/accessibility-manager.ts +338 -0
  16. package/src/managers/annotation-manager.ts +439 -0
  17. package/src/managers/barcode-manager.ts +235 -0
  18. package/src/managers/batch-manager.ts +533 -0
  19. package/src/managers/cache-manager.ts +486 -0
  20. package/src/managers/compliance-manager.ts +375 -0
  21. package/src/managers/content-manager.ts +339 -0
  22. package/src/managers/document-utility-manager.ts +922 -0
  23. package/src/managers/dom-pdf-creator.ts +365 -0
  24. package/src/managers/editing-manager.ts +514 -0
  25. package/src/managers/enterprise-manager.ts +478 -0
  26. package/src/managers/extended-managers.ts +437 -0
  27. package/src/managers/extraction-manager.ts +583 -0
  28. package/src/managers/final-utilities.ts +429 -0
  29. package/src/managers/hybrid-ml-advanced.ts +479 -0
  30. package/src/managers/index.ts +239 -0
  31. package/src/managers/layer-manager.ts +500 -0
  32. package/src/managers/metadata-manager.ts +303 -0
  33. package/src/managers/ocr-manager.ts +756 -0
  34. package/src/managers/optimization-manager.ts +262 -0
  35. package/src/managers/outline-manager.ts +196 -0
  36. package/src/managers/page-manager.ts +289 -0
  37. package/src/managers/pattern-detection.ts +440 -0
  38. package/src/managers/rendering-manager.ts +863 -0
  39. package/src/managers/search-manager.ts +385 -0
  40. package/src/managers/security-manager.ts +345 -0
  41. package/src/managers/signature-manager.ts +1664 -0
  42. package/src/managers/streams.ts +618 -0
  43. package/src/managers/xfa-manager.ts +500 -0
  44. package/src/pdf-creator-manager.ts +494 -0
  45. package/src/properties.ts +522 -0
  46. package/src/result-accessors-manager.ts +867 -0
  47. package/src/tests/advanced-features.test.ts +414 -0
  48. package/src/tests/advanced.test.ts +266 -0
  49. package/src/tests/extended-managers.test.ts +316 -0
  50. package/src/tests/final-utilities.test.ts +455 -0
  51. package/src/tests/foundation.test.ts +315 -0
  52. package/src/tests/high-demand.test.ts +257 -0
  53. package/src/tests/specialized.test.ts +97 -0
  54. package/src/thumbnail-manager.ts +272 -0
  55. package/src/types/common.ts +142 -0
  56. package/src/types/document-types.ts +457 -0
  57. package/src/types/index.ts +6 -0
  58. package/src/types/manager-types.ts +284 -0
  59. package/src/types/native-bindings.ts +517 -0
  60. package/src/workers/index.ts +7 -0
  61. package/src/workers/pool.ts +274 -0
  62. package/src/workers/worker.ts +131 -0
@@ -0,0 +1,583 @@
1
+ /**
2
+ * Manager for content extraction from PDF documents
3
+ *
4
+ * Caching is handled automatically at the Rust FFI layer, eliminating
5
+ * the need for duplicate cache implementations in the binding.
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * import { PdfDocument, ExtractionManager, ConversionOptionsBuilder } from 'pdf_oxide';
10
+ *
11
+ * const doc = PdfDocument.open('document.pdf');
12
+ * const extractionManager = new ExtractionManager(doc);
13
+ *
14
+ * // Extract text from a single page
15
+ * const text = extractionManager.extractText(0);
16
+ * console.log(text);
17
+ *
18
+ * // Extract all text
19
+ * const allText = extractionManager.extractAllText();
20
+ *
21
+ * // Extract with custom options
22
+ * const options = ConversionOptionsBuilder.highQuality().build();
23
+ * const markdown = extractionManager.extractMarkdown(0, options);
24
+ * ```
25
+ */
26
+
27
+ export interface ContentStatistics { // Aggregate text statistics for a whole document, as returned by ExtractionManager.getContentStatistics().
28
+ pageCount: number; // Number of pages, read from the document's pageCount.
29
+ wordCount: number; // Total whitespace-separated words across all pages.
30
+ characterCount: number; // Sum of the per-page extracted text lengths (JS string length, i.e. UTF-16 code units, whitespace included).
31
+ averageWordsPerPage: number; // wordCount divided by pageCount, rounded to the nearest integer.
32
+ averageCharactersPerPage: number; // characterCount divided by pageCount, rounded to the nearest integer.
33
+ }
34
+
35
+ export interface SearchMatch { // One hit produced by ExtractionManager.searchContent().
36
+ pageIndex: number; // Zero-based index of the page containing the match.
37
+ pageNumber: number; // One-based page number (pageIndex + 1), convenient for display.
38
+ matchIndex: number; // Offset of the match within that page's extracted text.
39
+ snippet: string; // The match plus surrounding context, with newlines replaced by spaces.
40
+ matchText: string; // The text that actually matched, as it appears in the page.
41
+ }
42
+
43
/**
 * Extracts text and Markdown from a PDF document and derives simple
 * statistics (word / character / line counts) and literal text search from
 * the extracted text.
 *
 * The manager keeps no cache of its own; every call goes to the wrapped
 * document object. (The package documentation states that caching happens
 * in the native layer — that is not visible from this class.)
 */
export class ExtractionManager {
  /**
   * Wrapped document handle. This class only relies on `pageCount`,
   * `extractText(pageIndex)` and `toMarkdown(pageIndex, options)`.
   */
  private readonly _document: any;

  /**
   * Creates a new ExtractionManager for the given document.
   * @param document - The PDF document
   * @throws Error if document is null, undefined or otherwise falsy
   */
  constructor(document: any) {
    if (!document) {
      throw new Error('Document is required');
    }
    this._document = document;
  }

  /**
   * Extracts text from a single page.
   *
   * The returned value is a JS string, so `text.length` counts UTF-16 code
   * units, not bytes. Use `Buffer.byteLength(text, 'utf8')` when a UTF-8
   * byte count is needed (e.g. to compare with Go's `len(string)` or Rust's
   * `String::len()`).
   *
   * @param pageIndex - Zero-based page index (must be an integer)
   * @param options - Conversion options. NOTE: accepted for signature
   *   symmetry with the Markdown methods, but currently NOT forwarded to the
   *   underlying `extractText` call.
   * @returns Extracted text
   * @throws Error if the page index is invalid or the extraction fails
   *
   * @example
   * ```typescript
   * const text = manager.extractText(0);
   * console.log(`Page 1: ${text.length} UTF-16 code units`);
   * console.log(`        ${Buffer.byteLength(text, 'utf8')} UTF-8 bytes`);
   * ```
   */
  extractText(pageIndex: number, options?: Record<string, any>): string {
    this._assertPageIndex(pageIndex);

    try {
      return this._document.extractText(pageIndex);
    } catch (error) {
      throw new Error(
        `Failed to extract text from page ${pageIndex}: ${ExtractionManager._errorMessage(error)}`
      );
    }
  }

  /**
   * Extracts text from all pages.
   * @param options - Conversion options (passed through to `extractText`)
   * @returns Text of every page, joined with a single newline
   * @throws Error if any page fails to extract
   *
   * @example
   * ```typescript
   * const allText = manager.extractAllText();
   * console.log(`Total characters: ${allText.length}`);
   * ```
   */
  extractAllText(options?: Record<string, any>): string {
    try {
      const parts: string[] = [];
      for (let i = 0; i < this._document.pageCount; i++) {
        parts.push(this.extractText(i, options));
      }
      return parts.join('\n');
    } catch (error) {
      throw new Error(`Failed to extract all text: ${ExtractionManager._errorMessage(error)}`);
    }
  }

  /**
   * Extracts text from a contiguous range of pages.
   * @param startPageIndex - Zero-based start page index
   * @param endPageIndex - Zero-based end page index (inclusive)
   * @param options - Conversion options (passed through to `extractText`)
   * @returns Text of the pages in range, joined with a single newline
   * @throws Error if the range is invalid or any page fails to extract
   *
   * @example
   * ```typescript
   * const text = manager.extractTextRange(0, 10);
   * console.log(`Text from pages 1-11: ${text}`);
   * ```
   */
  extractTextRange(
    startPageIndex: number,
    endPageIndex: number,
    options?: Record<string, any>
  ): string {
    this._assertPageRange(startPageIndex, endPageIndex);

    try {
      const parts: string[] = [];
      for (let i = startPageIndex; i <= endPageIndex; i++) {
        parts.push(this.extractText(i, options));
      }
      return parts.join('\n');
    } catch (error) {
      throw new Error(`Failed to extract text range: ${ExtractionManager._errorMessage(error)}`);
    }
  }

  /**
   * Extracts text from specific (possibly non-contiguous) pages, in the
   * order given.
   * @param pageIndices - Array of zero-based page indices
   * @param options - Conversion options (passed through to `extractText`)
   * @returns Text of the requested pages joined with newlines; `''` for an
   *   empty array
   * @throws Error if `pageIndices` is not an array, contains an invalid
   *   index, or a page fails to extract
   *
   * @example
   * ```typescript
   * const text = manager.extractTextBatch([0, 2, 5]); // Extract pages 1, 3, 6
   * console.log(text);
   * ```
   */
  extractTextBatch(pageIndices: number[], options?: Record<string, any>): string {
    if (!Array.isArray(pageIndices)) {
      throw new Error('Page indices must be an array');
    }

    if (pageIndices.length === 0) {
      return '';
    }

    try {
      const parts: string[] = [];
      for (const pageIndex of pageIndices) {
        // Number.isInteger also rejects NaN and non-number values.
        if (
          !Number.isInteger(pageIndex) ||
          pageIndex < 0 ||
          pageIndex >= this._document.pageCount
        ) {
          throw new Error(`Invalid page index: ${pageIndex}`);
        }
        parts.push(this.extractText(pageIndex, options));
      }
      return parts.join('\n');
    } catch (error) {
      throw new Error(`Failed to extract text batch: ${ExtractionManager._errorMessage(error)}`);
    }
  }

  /**
   * Extracts text from a contiguous range of pages as an array.
   * @param startPageIndex - Zero-based start page index
   * @param endPageIndex - Zero-based end page index (inclusive)
   * @param options - Conversion options (passed through to `extractText`)
   * @returns One string per page, in page order
   * @throws Error if the range is invalid or any page fails to extract
   *
   * @example
   * ```typescript
   * const pages = manager.extractTextArray(0, 5);
   * pages.forEach((text, i) => console.log(`Page ${i}: ${text.length} chars`));
   * ```
   */
  extractTextArray(
    startPageIndex: number,
    endPageIndex: number,
    options?: Record<string, any>
  ): string[] {
    this._assertPageRange(startPageIndex, endPageIndex);

    try {
      const results: string[] = [];
      for (let i = startPageIndex; i <= endPageIndex; i++) {
        results.push(this.extractText(i, options));
      }
      return results;
    } catch (error) {
      throw new Error(`Failed to extract text array: ${ExtractionManager._errorMessage(error)}`);
    }
  }

  /**
   * Extracts a page as Markdown via the document's `toMarkdown`.
   * @param pageIndex - Zero-based page index (must be an integer)
   * @param options - Conversion options, forwarded to `toMarkdown`
   * @returns Page content as Markdown
   * @throws Error if the page index is invalid or the conversion fails
   *
   * @example
   * ```typescript
   * const markdown = manager.extractMarkdown(0);
   * console.log(markdown); // Markdown formatted content
   * ```
   */
  extractMarkdown(pageIndex: number, options?: Record<string, any>): string {
    this._assertPageIndex(pageIndex);

    try {
      return this._document.toMarkdown(pageIndex, options);
    } catch (error) {
      throw new Error(
        `Failed to extract markdown from page ${pageIndex}: ${ExtractionManager._errorMessage(error)}`
      );
    }
  }

  /**
   * Extracts all pages as Markdown. Each page is preceded by a
   * `## Page N` heading (N is one-based).
   * @param options - Conversion options, forwarded to `toMarkdown`
   * @returns All pages as a single Markdown string
   * @throws Error if any page fails to convert
   *
   * @example
   * ```typescript
   * const markdown = manager.extractAllMarkdown();
   * // Write to file
   * fs.writeFileSync('output.md', markdown);
   * ```
   */
  extractAllMarkdown(options?: Record<string, any>): string {
    try {
      const parts: string[] = [];
      for (let i = 0; i < this._document.pageCount; i++) {
        parts.push(this._markdownSection(i, options));
      }
      return parts.join('\n');
    } catch (error) {
      throw new Error(`Failed to extract all markdown: ${ExtractionManager._errorMessage(error)}`);
    }
  }

  /**
   * Extracts Markdown from a contiguous range of pages. Each page is
   * preceded by a `## Page N` heading (N is one-based).
   * @param startPageIndex - Zero-based start page index
   * @param endPageIndex - Zero-based end page index (inclusive)
   * @param options - Conversion options, forwarded to `toMarkdown`
   * @returns Markdown of the pages in range
   * @throws Error if the range is invalid or any page fails to convert
   */
  extractMarkdownRange(
    startPageIndex: number,
    endPageIndex: number,
    options?: Record<string, any>
  ): string {
    this._assertPageRange(startPageIndex, endPageIndex);

    try {
      const parts: string[] = [];
      for (let i = startPageIndex; i <= endPageIndex; i++) {
        parts.push(this._markdownSection(i, options));
      }
      return parts.join('\n');
    } catch (error) {
      throw new Error(`Failed to extract markdown range: ${ExtractionManager._errorMessage(error)}`);
    }
  }

  /**
   * Gets the word count for a page.
   * @param pageIndex - Zero-based page index
   * @returns Number of whitespace-separated words; 0 for an empty or
   *   whitespace-only page
   * @throws Error if the page index is invalid or extraction fails
   */
  getPageWordCount(pageIndex: number): number {
    return ExtractionManager._countWords(this.extractText(pageIndex));
  }

  /**
   * Gets the total word count for all pages.
   * @returns Number of whitespace-separated words across all pages
   * @throws Error if any page fails to extract
   */
  getTotalWordCount(): number {
    return ExtractionManager._countWords(this.extractAllText());
  }

  /**
   * Gets the character count for a page.
   * @param pageIndex - Zero-based page index
   * @returns Length of the extracted text in UTF-16 code units (whitespace
   *   included)
   * @throws Error if the page index is invalid or extraction fails
   */
  getPageCharacterCount(pageIndex: number): number {
    return this.extractText(pageIndex).length;
  }

  /**
   * Gets the total character count for all pages.
   * @returns Sum of the per-page character counts (the newlines that
   *   `extractAllText` inserts between pages are NOT counted)
   * @throws Error if any page fails to extract
   */
  getTotalCharacterCount(): number {
    let total = 0;
    for (let i = 0; i < this._document.pageCount; i++) {
      total += this.getPageCharacterCount(i);
    }
    return total;
  }

  /**
   * Gets the line count for a page.
   * @param pageIndex - Zero-based page index
   * @returns Number of `\n`-separated segments in the extracted text; note
   *   that empty text therefore counts as one line
   * @throws Error if the page index is invalid or extraction fails
   */
  getPageLineCount(pageIndex: number): number {
    return this.extractText(pageIndex).split('\n').length;
  }

  /**
   * Gets statistics for the extracted content of the whole document.
   * @returns Content statistics object; the averages are 0 for a document
   *   without pages
   * @throws Error if any page fails to extract
   *
   * @example
   * ```typescript
   * const stats = manager.getContentStatistics();
   * console.log(`Total pages: ${stats.pageCount}`);
   * console.log(`Total words: ${stats.wordCount}`);
   * console.log(`Average page length: ${stats.averageCharactersPerPage}`);
   * ```
   */
  getContentStatistics(): ContentStatistics {
    try {
      const pageCount = this._document.pageCount;
      const totalWords = this.getTotalWordCount();
      const totalCharacters = this.getTotalCharacterCount();
      // Avoid 0 / 0 = NaN for a document without pages.
      const perPage = (total: number): number =>
        pageCount > 0 ? Math.round(total / pageCount) : 0;

      return {
        pageCount,
        wordCount: totalWords,
        characterCount: totalCharacters,
        averageWordsPerPage: perPage(totalWords),
        averageCharactersPerPage: perPage(totalCharacters),
      };
    } catch (error) {
      throw new Error(`Failed to get content statistics: ${ExtractionManager._errorMessage(error)}`);
    }
  }

  /**
   * Searches every page for a literal piece of text (case-insensitive) and
   * returns each occurrence with some surrounding context.
   *
   * `searchText` is matched literally: regular-expression metacharacters
   * such as `.`, `(` or `$` have no special meaning. Pages whose extraction
   * fails are skipped rather than aborting the whole search.
   *
   * @param searchText - Text to search for (non-empty)
   * @param contextLength - Characters of context to include on each side of
   *   the match; negative or NaN values are treated as 0
   * @returns Array of match objects with page and snippet
   * @throws Error if `searchText` is empty or not a string
   *
   * @example
   * ```typescript
   * const matches = manager.searchContent('keyword', 50);
   * matches.forEach(match => {
   *   console.log(`Page ${match.pageIndex + 1}: ...${match.snippet}...`);
   * });
   * ```
   */
  searchContent(searchText: string, contextLength: number = 100): SearchMatch[] {
    if (!searchText || typeof searchText !== 'string') {
      throw new Error('Search text must be a non-empty string');
    }

    // Escape regex metacharacters so the input is matched as plain text.
    const escaped = searchText.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(escaped, 'gi');
    const context = contextLength > 0 ? contextLength : 0;
    const results: SearchMatch[] = [];

    for (let i = 0; i < this._document.pageCount; i++) {
      try {
        const text = this.extractText(i);

        // matchAll works on a private copy of the regex, so no lastIndex
        // state leaks from one page to the next.
        for (const match of text.matchAll(pattern)) {
          const matchIndex = match.index ?? 0;
          const start = Math.max(0, matchIndex - context);
          const end = Math.min(text.length, matchIndex + match[0].length + context);

          results.push({
            pageIndex: i,
            pageNumber: i + 1,
            matchIndex,
            snippet: text.substring(start, end).replace(/\n/g, ' '),
            matchText: match[0],
          });
        }
      } catch {
        // Deliberate best-effort: skip pages that fail extraction.
      }
    }

    return results;
  }

  /**
   * Extracts text from a page in a worker thread.
   * @param documentPath - Path to the PDF document
   * @param pageIndex - Page index to extract from
   * @param options - Optional extraction options (defaults to `{}`)
   * @param timeout - Optional timeout, forwarded to the worker pool
   * @returns Promise resolving to the extracted text
   * @throws Error if the worker reports a failure
   */
  async extractTextInWorker(
    documentPath: string,
    pageIndex: number,
    options?: Record<string, any>,
    timeout?: number
  ): Promise<string> {
    return this._extractInWorker('text', documentPath, pageIndex, options, timeout);
  }

  /**
   * Extracts Markdown from a page in a worker thread.
   * @param documentPath - Path to the PDF document
   * @param pageIndex - Page index to extract from
   * @param options - Optional extraction options (defaults to `{}`)
   * @param timeout - Optional timeout, forwarded to the worker pool
   * @returns Promise resolving to the extracted Markdown
   * @throws Error if the worker reports a failure
   */
  async extractMarkdownInWorker(
    documentPath: string,
    pageIndex: number,
    options?: Record<string, any>,
    timeout?: number
  ): Promise<string> {
    return this._extractInWorker('markdown', documentPath, pageIndex, options, timeout);
  }

  /**
   * Extracts HTML from a page in a worker thread.
   * @param documentPath - Path to the PDF document
   * @param pageIndex - Page index to extract from
   * @param options - Optional extraction options (defaults to `{}`)
   * @param timeout - Optional timeout, forwarded to the worker pool
   * @returns Promise resolving to the extracted HTML
   * @throws Error if the worker reports a failure
   */
  async extractHtmlInWorker(
    documentPath: string,
    pageIndex: number,
    options?: Record<string, any>,
    timeout?: number
  ): Promise<string> {
    return this._extractInWorker('html', documentPath, pageIndex, options, timeout);
  }

  /** Throws unless `pageIndex` is an integer within `[0, pageCount)`. */
  private _assertPageIndex(pageIndex: number): void {
    // Number.isInteger also rejects NaN, which would otherwise slip past
    // both the `< 0` and the `>= pageCount` comparisons.
    if (!Number.isInteger(pageIndex) || pageIndex < 0) {
      throw new Error('Page index must be a non-negative number');
    }

    if (pageIndex >= this._document.pageCount) {
      throw new Error(`Page index ${pageIndex} out of range`);
    }
  }

  /** Throws unless `start..end` is a valid inclusive page range of the document. */
  private _assertPageRange(startPageIndex: number, endPageIndex: number): void {
    if (!Number.isInteger(startPageIndex) || startPageIndex < 0) {
      throw new Error('Start page index must be a non-negative number');
    }

    if (!Number.isInteger(endPageIndex) || endPageIndex < startPageIndex) {
      throw new Error('End page index must be >= start page index');
    }

    if (endPageIndex >= this._document.pageCount) {
      throw new Error(`End page index ${endPageIndex} out of range`);
    }
  }

  /** Builds the `## Page N` heading followed by the page's Markdown. */
  private _markdownSection(pageIndex: number, options?: Record<string, any>): string {
    const heading = `\n## Page ${pageIndex + 1}\n`;
    return heading + this.extractMarkdown(pageIndex, options);
  }

  /** Runs one `extract` task of the given type on the shared worker pool. */
  private async _extractInWorker(
    type: 'text' | 'markdown' | 'html',
    documentPath: string,
    pageIndex: number,
    options?: Record<string, any>,
    timeout?: number
  ): Promise<string> {
    // Loaded lazily so the worker module is only pulled in when used.
    const { workerPool } = await import('../workers/index.js');

    const result = await workerPool.runTask(
      {
        operation: 'extract',
        documentPath,
        params: {
          type,
          pageIndex,
          options: options ?? {},
        },
      },
      timeout
    );

    if (!result.success) {
      throw new Error(
        `Worker extraction failed: ${
          result.error instanceof Error ? result.error.message : String(result.error)
        }`
      );
    }

    return result.data as string;
  }

  /** Counts whitespace-separated words; empty or blank text yields 0. */
  private static _countWords(text: string): number {
    return text
      .trim()
      .split(/\s+/)
      .filter(word => word.length > 0).length;
  }

  /** Best-effort human-readable message for anything caught in a `catch`. */
  private static _errorMessage(error: unknown): string {
    if (error instanceof Error) {
      return error.message;
    }
    if (typeof error === 'object' && error !== null) {
      const message = (error as { message?: unknown }).message;
      if (typeof message === 'string') {
        return message;
      }
    }
    return String(error);
  }
}