pdf-oxide 0.3.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/README.md +218 -0
  2. package/binding.gyp +35 -0
  3. package/package.json +78 -0
  4. package/src/builders/annotation-builder.ts +367 -0
  5. package/src/builders/conversion-options-builder.ts +257 -0
  6. package/src/builders/index.ts +12 -0
  7. package/src/builders/metadata-builder.ts +317 -0
  8. package/src/builders/pdf-builder.ts +386 -0
  9. package/src/builders/search-options-builder.ts +151 -0
  10. package/src/document-editor-manager.ts +318 -0
  11. package/src/errors.ts +1629 -0
  12. package/src/form-field-manager.ts +666 -0
  13. package/src/hybrid-ml-manager.ts +283 -0
  14. package/src/index.ts +453 -0
  15. package/src/managers/accessibility-manager.ts +338 -0
  16. package/src/managers/annotation-manager.ts +439 -0
  17. package/src/managers/barcode-manager.ts +235 -0
  18. package/src/managers/batch-manager.ts +533 -0
  19. package/src/managers/cache-manager.ts +486 -0
  20. package/src/managers/compliance-manager.ts +375 -0
  21. package/src/managers/content-manager.ts +339 -0
  22. package/src/managers/document-utility-manager.ts +922 -0
  23. package/src/managers/dom-pdf-creator.ts +365 -0
  24. package/src/managers/editing-manager.ts +514 -0
  25. package/src/managers/enterprise-manager.ts +478 -0
  26. package/src/managers/extended-managers.ts +437 -0
  27. package/src/managers/extraction-manager.ts +583 -0
  28. package/src/managers/final-utilities.ts +429 -0
  29. package/src/managers/hybrid-ml-advanced.ts +479 -0
  30. package/src/managers/index.ts +239 -0
  31. package/src/managers/layer-manager.ts +500 -0
  32. package/src/managers/metadata-manager.ts +303 -0
  33. package/src/managers/ocr-manager.ts +756 -0
  34. package/src/managers/optimization-manager.ts +262 -0
  35. package/src/managers/outline-manager.ts +196 -0
  36. package/src/managers/page-manager.ts +289 -0
  37. package/src/managers/pattern-detection.ts +440 -0
  38. package/src/managers/rendering-manager.ts +863 -0
  39. package/src/managers/search-manager.ts +385 -0
  40. package/src/managers/security-manager.ts +345 -0
  41. package/src/managers/signature-manager.ts +1664 -0
  42. package/src/managers/streams.ts +618 -0
  43. package/src/managers/xfa-manager.ts +500 -0
  44. package/src/pdf-creator-manager.ts +494 -0
  45. package/src/properties.ts +522 -0
  46. package/src/result-accessors-manager.ts +867 -0
  47. package/src/tests/advanced-features.test.ts +414 -0
  48. package/src/tests/advanced.test.ts +266 -0
  49. package/src/tests/extended-managers.test.ts +316 -0
  50. package/src/tests/final-utilities.test.ts +455 -0
  51. package/src/tests/foundation.test.ts +315 -0
  52. package/src/tests/high-demand.test.ts +257 -0
  53. package/src/tests/specialized.test.ts +97 -0
  54. package/src/thumbnail-manager.ts +272 -0
  55. package/src/types/common.ts +142 -0
  56. package/src/types/document-types.ts +457 -0
  57. package/src/types/index.ts +6 -0
  58. package/src/types/manager-types.ts +284 -0
  59. package/src/types/native-bindings.ts +517 -0
  60. package/src/workers/index.ts +7 -0
  61. package/src/workers/pool.ts +274 -0
  62. package/src/workers/worker.ts +131 -0
@@ -0,0 +1,618 @@
1
+ /**
2
+ * Stream API support for PDF Oxide Node.js
3
+ *
4
+ * Provides Readable streams for search results, text extraction, and page metadata.
5
+ * Supports backpressure handling and proper Node.js stream semantics.
6
+ *
7
+ * Phase 2.4 implementation for idiomatic Node.js patterns with Stream API.
8
+ */
9
+
10
+ import { Readable } from 'node:stream';
11
+
/**
 * Shape of each object emitted by `SearchStream`.
 *
 * Every field is optional at the type level because the values are copied from
 * whatever the underlying search manager returns.
 */
export interface SearchResultData {
  /** Matched text, when the underlying result provides it. */
  text?: string;
  /** Index of the page the match was reported on. */
  pageIndex?: number;
  /** Position of the match as reported by the search manager. */
  position?: number;
  /** Bounding box of the match, passed through from the search manager unchanged. */
  boundingBox?: Record<string, number>;
}
21
+
/**
 * Shape of each object emitted by `ExtractionStream`: the extracted content of
 * one page together with how far through the requested range the stream is.
 */
export interface ExtractionProgressData {
  /** Index of the page this chunk was extracted from. */
  pageIndex: number;
  /** Number of pages in the requested range (`endPage - startPage`), not in the whole document. */
  totalPages: number;
  /** Extracted content of the page; an empty string when the manager returned nothing. */
  extractedText: string;
  /** Format that was requested for the extraction. */
  extractionType: 'text' | 'markdown' | 'html';
  /** Fraction of the range processed so far, including this page; equals 1 on the last page. */
  progress: number;
}
32
+
/**
 * Shape of each object emitted by `MetadataStream`: basic facts about one page.
 */
export interface PageMetadataData {
  /** Index of the page the metadata describes. */
  pageIndex: number;
  /** Page width as reported by the rendering manager; 0 when unavailable. */
  width: number;
  /** Page height as reported by the rendering manager; 0 when unavailable. */
  height: number;
  /** Number of embedded fonts reported for the page. */
  fontCount: number;
  /** Number of embedded images reported for the page. */
  imageCount: number;
  /** Page rotation as reported by the rendering manager; 0 when unavailable. */
  rotation: number;
}
44
+
/**
 * Object-mode Readable that emits the matches of one search, one object per chunk.
 *
 * The search runs lazily on the first read. `searchManager.search()` is called
 * when `options.pageIndex` is set, `searchManager.searchAll()` otherwise; the call
 * is synchronous and its return value is treated as an array of raw results.
 * If the search throws, the stream is destroyed with that error.
 *
 * Both consumption styles go through the standard Readable machinery, so errors
 * reach `for await` consumers as a rejection and leaving the loop early destroys
 * the stream.
 *
 * @example
 * ```typescript
 * // Traditional stream API
 * const stream = new SearchStream(searchManager, 'keyword');
 * stream.on('data', (result) => {
 *   console.log(`Found on page ${result.pageIndex}: ${result.text}`);
 * });
 *
 * // Async iteration
 * for await (const result of new SearchStream(searchManager, 'keyword')) {
 *   console.log(`Found on page ${result.pageIndex}: ${result.text}`);
 * }
 * ```
 */
export class SearchStream extends Readable {
  private searchManager: any;
  private searchTerm: string;
  private options: Record<string, any>;
  private pageIndex: number | undefined;
  private caseSensitive: boolean;
  private wholeWords: boolean;
  private maxResults: number;
  private _results: any[] | null;
  private _currentIndex: number;
  private _resultCount: number;
  private _initialized: boolean;
  private _ended: boolean;

  /**
   * Creates a new SearchStream.
   *
   * @param searchManager - Object providing `search(term, pageIndex, flags)` and `searchAll(term, flags)`
   * @param searchTerm - Text to search for
   * @param options - `pageIndex` (restrict to one page), `caseSensitive` (default false),
   *   `wholeWords` (default false), `maxResults` (default unlimited)
   * @throws Error if `searchManager` is missing or `searchTerm` is not a non-empty string
   */
  constructor(searchManager: any, searchTerm: string, options: Record<string, any> = {}) {
    super({ objectMode: true });

    if (!searchManager) {
      throw new Error('SearchManager is required');
    }
    if (!searchTerm || typeof searchTerm !== 'string') {
      throw new Error('Search term must be a non-empty string');
    }

    this.searchManager = searchManager;
    this.searchTerm = searchTerm;
    this.options = options;
    this.pageIndex = options.pageIndex;
    this.caseSensitive = options.caseSensitive ?? false;
    this.wholeWords = options.wholeWords ?? false;

    // Normalize the limit once: a negative value would otherwise reach
    // slice(0, n) and trim items from the END of the result list.
    const requestedLimit = Number(options.maxResults ?? Infinity);
    this.maxResults = Number.isNaN(requestedLimit)
      ? Infinity
      : Math.max(0, Math.floor(requestedLimit));

    this._results = null;
    this._currentIndex = 0;
    this._resultCount = 0;
    this._initialized = false;
    this._ended = false;
  }

  /**
   * Runs the search once and caches the (possibly truncated) result list.
   * On failure the stream is destroyed and `_results` stays `null`.
   * @private
   */
  private _initialize(): void {
    if (this._initialized) return;
    this._initialized = true;

    try {
      const flags = { caseSensitive: this.caseSensitive, wholeWords: this.wholeWords };
      const found =
        this.pageIndex !== undefined
          ? this.searchManager.search(this.searchTerm, this.pageIndex, flags)
          : this.searchManager.searchAll(this.searchTerm, flags);

      const matches = (found || []) as any[];
      this._results =
        matches.length > this.maxResults ? matches.slice(0, this.maxResults) : matches;
    } catch (error) {
      this.destroy(error as Error);
    }
  }

  /**
   * Converts one raw manager result into the emitted shape.
   * Uses `??` so that a legitimate 0 (first page, offset 0) is kept as-is.
   * @private
   */
  private _formatResult(result: any): SearchResultData {
    return {
      text: result.text ?? result.getText?.(),
      pageIndex: result.pageIndex ?? result.page ?? 0,
      position: result.position ?? 0,
      boundingBox: result.boundingBox,
    };
  }

  /**
   * Readable implementation: pushes one result per call, then end-of-stream.
   * @private
   */
  _read(): void {
    this._initialize();

    const results = this._results;
    if (results === null) {
      // The search failed and the stream was destroyed with the error;
      // signalling a normal end here would be wrong.
      return;
    }

    if (this._currentIndex >= results.length) {
      if (!this._ended) {
        this._ended = true;
        this.push(null);
      }
      return;
    }

    let data: SearchResultData;
    try {
      data = this._formatResult(results[this._currentIndex]);
    } catch (error) {
      this.destroy(error as Error);
      return;
    }

    this._currentIndex++;
    this._resultCount++;
    this.push(data);
  }

  /**
   * Typed async iteration for `for await...of`.
   *
   * Delegates to the built-in Readable iterator so iteration shares the normal
   * stream lifecycle instead of walking the result list separately.
   *
   * @returns AsyncGenerator yielding one formatted search result at a time
   */
  async *[Symbol.asyncIterator](): AsyncGenerator<SearchResultData, void, unknown> {
    yield* Readable.prototype[Symbol.asyncIterator].call(this);
  }
}
213
+
/**
 * Object-mode Readable that extracts a range of pages one at a time and emits
 * an `ExtractionProgressData` object per page.
 *
 * Pages `startPage` (inclusive) to `endPage` (exclusive) are extracted lazily,
 * one per read, by calling `extractText`, `extractMarkdown` or `extractHtml` on
 * the manager with `(pageIndex, options)`. If the manager throws, the stream is
 * destroyed with that error.
 *
 * Both consumption styles go through the standard Readable machinery, so errors
 * reach `for await` consumers as a rejection and leaving the loop early destroys
 * the stream.
 *
 * @example
 * ```typescript
 * // Traditional stream API
 * const stream = new ExtractionStream(extractionManager, 0, 10, 'markdown');
 * stream.on('data', (progress) => {
 *   console.log(`Progress: ${Math.round(progress.progress * 100)}%`);
 *   console.log(`Page ${progress.pageIndex + 1}: ${progress.extractedText.length} chars`);
 * });
 *
 * // Async iteration
 * for await (const progress of new ExtractionStream(extractionManager, 0, 10, 'markdown')) {
 *   console.log(`Progress: ${Math.round(progress.progress * 100)}%`);
 * }
 * ```
 */
export class ExtractionStream extends Readable {
  private extractionManager: any;
  private startPage: number;
  private endPage: number;
  private extractionType: 'text' | 'markdown' | 'html';
  private options: Record<string, any>;
  private _currentPage: number;
  private _totalPages: number;
  private _ended: boolean;

  /**
   * Creates a new ExtractionStream.
   *
   * @param extractionManager - Object providing `extractText`, `extractMarkdown` and `extractHtml`
   * @param startPage - Starting page index (inclusive); a non-negative integer
   * @param endPage - Ending page index (exclusive); an integer greater than `startPage`
   * @param extractionType - 'text', 'markdown', or 'html'
   * @param options - Additional options forwarded to the manager on every call
   * @throws Error if any parameter is invalid
   */
  constructor(
    extractionManager: any,
    startPage: number,
    endPage: number,
    extractionType: 'text' | 'markdown' | 'html' = 'text',
    options: Record<string, any> = {}
  ) {
    super({ objectMode: true });

    if (!extractionManager) {
      throw new Error('ExtractionManager is required');
    }
    // Number.isInteger also rejects NaN and Infinity. A NaN or infinite end
    // page would make the "past the end" comparison in _read never true.
    if (!Number.isInteger(startPage) || startPage < 0) {
      throw new Error('Start page must be a non-negative number');
    }
    if (!Number.isInteger(endPage) || endPage <= startPage) {
      throw new Error('End page must be greater than start page');
    }
    if (!['text', 'markdown', 'html'].includes(extractionType)) {
      throw new Error("Extraction type must be 'text', 'markdown', or 'html'");
    }

    this.extractionManager = extractionManager;
    this.startPage = startPage;
    this.endPage = endPage;
    this.extractionType = extractionType;
    this.options = options;

    this._currentPage = startPage;
    this._totalPages = endPage - startPage;
    this._ended = false;
  }

  /**
   * Extracts a single page in the configured format and wraps it in a progress record.
   * @private
   */
  private _extractPage(pageIndex: number): ExtractionProgressData {
    const manager = this.extractionManager;
    let extractedText: string;

    switch (this.extractionType) {
      case 'markdown':
        extractedText = manager.extractMarkdown(pageIndex, this.options);
        break;
      case 'html':
        extractedText = manager.extractHtml(pageIndex, this.options);
        break;
      default:
        extractedText = manager.extractText(pageIndex, this.options);
        break;
    }

    return {
      pageIndex,
      totalPages: this._totalPages,
      extractedText: extractedText || '',
      extractionType: this.extractionType,
      // +1 so the record for the last page reports exactly 1.
      progress: (pageIndex - this.startPage + 1) / this._totalPages,
    };
  }

  /**
   * Readable implementation: extracts and pushes one page per call, then end-of-stream.
   * @private
   */
  _read(): void {
    if (this._currentPage >= this.endPage) {
      if (!this._ended) {
        this._ended = true;
        this.push(null);
      }
      return;
    }

    let progress: ExtractionProgressData;
    try {
      progress = this._extractPage(this._currentPage);
    } catch (error) {
      this.destroy(error as Error);
      return;
    }

    this._currentPage++;
    this.push(progress);
  }

  /**
   * Typed async iteration for `for await...of`.
   *
   * Delegates to the built-in Readable iterator so iteration shares the normal
   * stream lifecycle instead of running a separate extraction loop.
   *
   * @returns AsyncGenerator yielding one progress record per page
   */
  async *[Symbol.asyncIterator](): AsyncGenerator<ExtractionProgressData, void, unknown> {
    yield* Readable.prototype[Symbol.asyncIterator].call(this);
  }
}
390
+
/**
 * Object-mode Readable that emits a `PageMetadataData` object for each page in
 * a range.
 *
 * Pages `startPage` (inclusive) to `endPage` (exclusive) are inspected lazily,
 * one per read. Dimensions and rotation come from
 * `renderingManager.getPageDimensions(page)`; font and image counts come from
 * `getEmbeddedFonts(page)` / `getEmbeddedImages(page)` when the manager provides
 * those methods. If the manager throws, the stream is destroyed with that error.
 *
 * Both consumption styles go through the standard Readable machinery, so errors
 * reach `for await` consumers as a rejection and leaving the loop early destroys
 * the stream.
 *
 * @example
 * ```typescript
 * // Traditional stream API
 * const stream = new MetadataStream(renderingManager, 0, 10);
 * stream.on('data', (metadata) => {
 *   console.log(`Page ${metadata.pageIndex + 1}: ${metadata.width}x${metadata.height}`);
 *   console.log(`  Fonts: ${metadata.fontCount}, Images: ${metadata.imageCount}`);
 * });
 *
 * // Async iteration
 * for await (const metadata of new MetadataStream(renderingManager, 0, 10)) {
 *   console.log(`Page ${metadata.pageIndex + 1}: ${metadata.width}x${metadata.height}`);
 * }
 * ```
 */
export class MetadataStream extends Readable {
  private renderingManager: any;
  private startPage: number;
  private endPage: number;
  private _currentPage: number;
  private _ended: boolean;

  /**
   * Creates a new MetadataStream.
   *
   * @param renderingManager - Object providing `getPageDimensions` and, optionally,
   *   `getEmbeddedFonts` / `getEmbeddedImages`
   * @param startPage - Starting page index (inclusive); a non-negative integer
   * @param endPage - Ending page index (exclusive); an integer greater than `startPage`
   * @throws Error if any parameter is invalid
   */
  constructor(renderingManager: any, startPage: number, endPage: number) {
    super({ objectMode: true });

    if (!renderingManager) {
      throw new Error('RenderingManager is required');
    }
    // Number.isInteger also rejects NaN and Infinity. A NaN or infinite end
    // page would make the "past the end" comparison in _read never true.
    if (!Number.isInteger(startPage) || startPage < 0) {
      throw new Error('Start page must be a non-negative number');
    }
    if (!Number.isInteger(endPage) || endPage <= startPage) {
      throw new Error('End page must be greater than start page');
    }

    this.renderingManager = renderingManager;
    this.startPage = startPage;
    this.endPage = endPage;

    this._currentPage = startPage;
    this._ended = false;
  }

  /**
   * Collects the metadata record for a single page.
   * Missing dimensions fall back to 0; non-array font/image results count as 0.
   * @private
   */
  private _collectPageMetadata(pageIndex: number): PageMetadataData {
    const dimensions = this.renderingManager.getPageDimensions(pageIndex);
    // The resource getters are optional on the manager, hence the optional calls.
    const fonts = this.renderingManager.getEmbeddedFonts?.(pageIndex) || [];
    const images = this.renderingManager.getEmbeddedImages?.(pageIndex) || [];

    return {
      pageIndex,
      width: dimensions?.width || 0,
      height: dimensions?.height || 0,
      fontCount: Array.isArray(fonts) ? fonts.length : 0,
      imageCount: Array.isArray(images) ? images.length : 0,
      rotation: dimensions?.rotation || 0,
    };
  }

  /**
   * Readable implementation: pushes the metadata of one page per call, then end-of-stream.
   * @private
   */
  _read(): void {
    if (this._currentPage >= this.endPage) {
      if (!this._ended) {
        this._ended = true;
        this.push(null);
      }
      return;
    }

    let metadata: PageMetadataData;
    try {
      metadata = this._collectPageMetadata(this._currentPage);
    } catch (error) {
      this.destroy(error as Error);
      return;
    }

    this._currentPage++;
    this.push(metadata);
  }

  /**
   * Typed async iteration for `for await...of`.
   *
   * Delegates to the built-in Readable iterator so iteration shares the normal
   * stream lifecycle instead of running a separate metadata loop.
   *
   * @returns AsyncGenerator yielding one metadata record per page
   */
  async *[Symbol.asyncIterator](): AsyncGenerator<PageMetadataData, void, unknown> {
    yield* Readable.prototype[Symbol.asyncIterator].call(this);
  }
}
534
+
/**
 * Factory for `SearchStream`.
 *
 * Equivalent to calling `new SearchStream(searchManager, searchTerm, options)`;
 * the arguments are forwarded unchanged.
 *
 * @param searchManager - The search manager to query
 * @param searchTerm - Text to search for
 * @param options - Search options, passed through to the stream
 * @returns A readable stream of search results
 *
 * @example
 * ```typescript
 * createSearchStream(manager, 'error')
 *   .on('data', (result) => {
 *     console.log(`Found: ${result.text}`);
 *   });
 * ```
 */
export function createSearchStream(
  searchManager: any,
  searchTerm: string,
  options: Record<string, any> = {}
): SearchStream {
  const stream = new SearchStream(searchManager, searchTerm, options);
  return stream;
}
561
+
/**
 * Factory for `ExtractionStream`.
 *
 * Equivalent to calling
 * `new ExtractionStream(extractionManager, startPage, endPage, extractionType, options)`;
 * the arguments are forwarded unchanged.
 *
 * @param extractionManager - The extraction manager to read pages from
 * @param startPage - Starting page index
 * @param endPage - Ending page index
 * @param extractionType - Extraction format; defaults to 'text'
 * @param options - Additional options, passed through to the stream
 * @returns A readable stream of extraction progress
 *
 * @example
 * ```typescript
 * createExtractionStream(manager, 0, 10, 'markdown')
 *   .on('data', (progress) => {
 *     console.log(`${Math.round(progress.progress * 100)}% complete`);
 *   });
 * ```
 */
export function createExtractionStream(
  extractionManager: any,
  startPage: number,
  endPage: number,
  extractionType: 'text' | 'markdown' | 'html' = 'text',
  options: Record<string, any> = {}
): ExtractionStream {
  const stream = new ExtractionStream(
    extractionManager,
    startPage,
    endPage,
    extractionType,
    options
  );
  return stream;
}
592
+
/**
 * Factory for `MetadataStream`.
 *
 * Equivalent to calling `new MetadataStream(renderingManager, startPage, endPage)`;
 * the arguments are forwarded unchanged.
 *
 * @param renderingManager - The rendering manager to query
 * @param startPage - Starting page index
 * @param endPage - Ending page index
 * @returns A readable stream of page metadata
 *
 * @example
 * ```typescript
 * createMetadataStream(manager, 0, 10)
 *   .on('data', (metadata) => {
 *     console.log(`Page ${metadata.pageIndex}: ${metadata.width}x${metadata.height}`);
 *   });
 * ```
 */
export function createMetadataStream(
  renderingManager: any,
  startPage: number,
  endPage: number
): MetadataStream {
  const stream = new MetadataStream(renderingManager, startPage, endPage);
  return stream;
}