pdf-oxide 0.3.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/README.md +218 -0
  2. package/binding.gyp +35 -0
  3. package/package.json +78 -0
  4. package/src/builders/annotation-builder.ts +367 -0
  5. package/src/builders/conversion-options-builder.ts +257 -0
  6. package/src/builders/index.ts +12 -0
  7. package/src/builders/metadata-builder.ts +317 -0
  8. package/src/builders/pdf-builder.ts +386 -0
  9. package/src/builders/search-options-builder.ts +151 -0
  10. package/src/document-editor-manager.ts +318 -0
  11. package/src/errors.ts +1629 -0
  12. package/src/form-field-manager.ts +666 -0
  13. package/src/hybrid-ml-manager.ts +283 -0
  14. package/src/index.ts +453 -0
  15. package/src/managers/accessibility-manager.ts +338 -0
  16. package/src/managers/annotation-manager.ts +439 -0
  17. package/src/managers/barcode-manager.ts +235 -0
  18. package/src/managers/batch-manager.ts +533 -0
  19. package/src/managers/cache-manager.ts +486 -0
  20. package/src/managers/compliance-manager.ts +375 -0
  21. package/src/managers/content-manager.ts +339 -0
  22. package/src/managers/document-utility-manager.ts +922 -0
  23. package/src/managers/dom-pdf-creator.ts +365 -0
  24. package/src/managers/editing-manager.ts +514 -0
  25. package/src/managers/enterprise-manager.ts +478 -0
  26. package/src/managers/extended-managers.ts +437 -0
  27. package/src/managers/extraction-manager.ts +583 -0
  28. package/src/managers/final-utilities.ts +429 -0
  29. package/src/managers/hybrid-ml-advanced.ts +479 -0
  30. package/src/managers/index.ts +239 -0
  31. package/src/managers/layer-manager.ts +500 -0
  32. package/src/managers/metadata-manager.ts +303 -0
  33. package/src/managers/ocr-manager.ts +756 -0
  34. package/src/managers/optimization-manager.ts +262 -0
  35. package/src/managers/outline-manager.ts +196 -0
  36. package/src/managers/page-manager.ts +289 -0
  37. package/src/managers/pattern-detection.ts +440 -0
  38. package/src/managers/rendering-manager.ts +863 -0
  39. package/src/managers/search-manager.ts +385 -0
  40. package/src/managers/security-manager.ts +345 -0
  41. package/src/managers/signature-manager.ts +1664 -0
  42. package/src/managers/streams.ts +618 -0
  43. package/src/managers/xfa-manager.ts +500 -0
  44. package/src/pdf-creator-manager.ts +494 -0
  45. package/src/properties.ts +522 -0
  46. package/src/result-accessors-manager.ts +867 -0
  47. package/src/tests/advanced-features.test.ts +414 -0
  48. package/src/tests/advanced.test.ts +266 -0
  49. package/src/tests/extended-managers.test.ts +316 -0
  50. package/src/tests/final-utilities.test.ts +455 -0
  51. package/src/tests/foundation.test.ts +315 -0
  52. package/src/tests/high-demand.test.ts +257 -0
  53. package/src/tests/specialized.test.ts +97 -0
  54. package/src/thumbnail-manager.ts +272 -0
  55. package/src/types/common.ts +142 -0
  56. package/src/types/document-types.ts +457 -0
  57. package/src/types/index.ts +6 -0
  58. package/src/types/manager-types.ts +284 -0
  59. package/src/types/native-bindings.ts +517 -0
  60. package/src/workers/index.ts +7 -0
  61. package/src/workers/pool.ts +274 -0
  62. package/src/workers/worker.ts +131 -0
@@ -0,0 +1,533 @@
1
+ /**
2
+ * Batch Processing Manager for Parallel Document Operations
3
+ *
4
+ * Enables efficient parallel processing of multiple PDF documents with:
5
+ * - Configurable concurrency control
6
+ * - Real-time progress tracking with ETA calculation
7
+ * - Memory-aware backpressure handling
8
+ * - Per-document timeout support
9
+ * - Error resilience and detailed reporting
10
+ *
11
+ * @example
12
+ * ```typescript
13
+ * import { BatchManager } from 'pdf-oxide';
14
+ * import { PdfDocument } from 'pdf-oxide';
15
+ *
16
+ * const batch = new BatchManager([
17
+ * { path: 'doc1.pdf' },
18
+ * { path: 'doc2.pdf' },
19
+ * { path: 'doc3.pdf' }
20
+ * ]);
21
+ *
22
+ * // Extract text from multiple documents in parallel
23
+ * const results = await batch.extractTextBatch({
24
+ * maxParallel: 4,
25
+ * timeout: 30000,
26
+ * onProgress: (progress) => {
27
+ * console.log('Progress: ' + Math.round(progress.progress * 100) + '%');
28
+ * console.log('ETA: ' + progress.eta + 'ms');
29
+ * }
30
+ * });
31
+ *
32
+ * results.forEach(result => {
33
+ * if (result.success) {
34
+ * console.log(result.document.path + ': ' + result.data.length + ' chars');
35
+ * } else {
36
+ * console.error(result.document.path + ': ' + result.error.message);
37
+ * }
38
+ * });
39
+ * ```
40
+ */
41
+
42
+ import os from 'os';
43
+ import type { PdfErrorDetails } from '../types/common';
44
+
/**
 * One PDF queued for batch processing.
 */
export interface BatchDocument {
  /** Path of the PDF file to open */
  path: string;
  /** Caller-chosen unique identifier (optional) */
  id?: string;
  /** Priority in the range 1-10; treated as 5 when omitted */
  priority?: number;
  /** Free-form caller data carried along with the document (optional) */
  metadata?: { [key: string]: any };
}
58
+
/**
 * Snapshot of a running batch, handed to the progress callback.
 */
export interface BatchProgress {
  /** How many documents the batch contains */
  total: number;
  /** Documents that finished successfully so far */
  completed: number;
  /** Documents that finished with a failure so far */
  failed: number;
  /** Index of the document currently being processed */
  current: number;
  /** Fraction of the batch that is done, from 0.0 to 1.0 */
  progress: number;
  /** Estimated remaining time, in milliseconds */
  eta: number;
  /** Operations that are in flight right now */
  activeOperations: number;
  /** When the batch started (milliseconds since epoch) */
  startTime: number;
  /** Milliseconds elapsed since the batch started */
  elapsedTime: number;
}
82
+
/**
 * Outcome of processing one document of a batch.
 */
export interface BatchResult<T = any> {
  /** Document this outcome belongs to */
  document: BatchDocument;
  /** True when the operation succeeded */
  success: boolean;
  /** Payload produced by a successful operation */
  data?: T;
  /** Failure reason when the operation did not succeed */
  error?: Error;
  /** Processing time for this document, in milliseconds */
  duration: number;
}
98
+
/**
 * Tuning knobs accepted by the batch operations.
 */
export interface BatchOptions {
  /** Upper bound on simultaneously running operations (default: CPU count) */
  maxParallel?: number;
  /** Per-document time limit in milliseconds (default: 30000) */
  timeout?: number;
  /** Called with a progress snapshot every time a document completes */
  onProgress?: (progress: BatchProgress) => void;
  /** Memory-management (backpressure) settings */
  backpressure?: {
    /** Memory usage ceiling in MB (default: 500) */
    maxMemoryMB?: number;
    /** How often memory is checked, in ms (default: 1000) */
    checkInterval?: number;
  };
}
117
+
/**
 * Aggregate figures describing a finished batch run.
 */
export interface BatchStatistics {
  /** Number of documents in the batch */
  total: number;
  /** Documents that completed successfully */
  completed: number;
  /** Documents that failed */
  failed: number;
  /** Wall-clock time for the whole run (milliseconds) */
  totalTime: number;
  /** Mean time per document (milliseconds) */
  averageTime: number;
  /** Throughput in documents per second */
  throughput: number;
  /** Highest memory usage observed (MB) */
  peakMemory: number;
}
137
+
/**
 * Batch processor for parallel document operations.
 *
 * Holds a fixed list of documents and runs one operation per document with a
 * bounded number of operations in flight. Per-document failures (including
 * timeouts) are reported as `BatchResult` entries with `success: false`
 * rather than rejecting the whole batch.
 */
export class BatchManager {
  private documents: BatchDocument[];
  private stats: BatchStatistics = {
    total: 0,
    completed: 0,
    failed: 0,
    totalTime: 0,
    averageTime: 0,
    throughput: 0,
    peakMemory: 0,
  };

  /**
   * Creates a new BatchManager
   * @param documents - Array of documents to process
   * @throws Error if documents array is empty or invalid
   */
  constructor(documents: BatchDocument[]) {
    if (!Array.isArray(documents) || documents.length === 0) {
      throw new Error('Documents array must not be empty');
    }

    for (const doc of documents) {
      // `!doc` turns a null/undefined entry into the documented validation
      // error instead of a raw TypeError on property access.
      if (!doc || !doc.path || typeof doc.path !== 'string') {
        throw new Error('Each document must have a valid path property');
      }
    }

    // Copy so later mutation of the caller's array cannot desynchronise the
    // document list from `stats.total`.
    this.documents = [...documents];
    this.stats.total = documents.length;
  }

  /**
   * Get current statistics
   * @returns A shallow copy of the statistics recorded by the latest run
   */
  getStatistics(): BatchStatistics {
    return { ...this.stats };
  }

  /**
   * Normalises any thrown value into an `Error` instance.
   * @private
   */
  private static toError(error: unknown): Error {
    return error instanceof Error ? error : new Error(String(error));
  }

  /**
   * Obtains the extraction manager of an opened document.
   * @throws Error if the document cannot provide one
   * @private
   */
  private static extractionManagerFor(pdfDoc: any): any {
    const extractionMgr = pdfDoc.createExtractionManager?.();
    if (!extractionMgr) {
      throw new Error('Failed to create extraction manager');
    }
    return extractionMgr;
  }

  /**
   * Closes a document if it exposes a `close` method.
   * @private
   */
  private static closeDocument(pdfDoc: any): void {
    if (typeof pdfDoc.close === 'function') {
      pdfDoc.close();
    }
  }

  /**
   * Process documents with concurrency control.
   *
   * Up to `maxParallel` workers pull documents from a shared cursor, so every
   * document is awaited before this method returns and results stay in input
   * order. Each document is raced against a timeout; the timer is cleared as
   * soon as the document settles. A timed-out processor is not cancelled, its
   * eventual result is simply ignored.
   *
   * Statistics: `averageTime` and `throughput` are computed from successfully
   * completed documents only. `backpressure.checkInterval` is not consulted
   * by this implementation; when heap usage exceeds `maxMemoryMB` the worker
   * pauses 100 ms before starting its next document.
   *
   * @param processor - Produces the result for one document
   * @param options - Batch processing options
   * @returns One result per document, in input order
   * @throws Whatever `options.onProgress` throws, after in-flight documents
   *         have settled (no further documents are started)
   * @private
   */
  private async processQueue<T>(
    processor: (doc: BatchDocument, index: number) => Promise<BatchResult<T>>,
    options: BatchOptions = {}
  ): Promise<BatchResult<T>[]> {
    const documents = this.documents;
    const total = documents.length;

    // Clamp to >= 1: a zero/negative limit (or an empty os.cpus() list) would
    // never start a task and the batch would never finish.
    const maxParallel = Math.max(
      1,
      Math.floor(options.maxParallel || os.cpus().length || 1)
    );
    const timeout = options.timeout || 30000;
    const maxMemoryMB = options.backpressure?.maxMemoryMB || 500;

    const results: BatchResult<T>[] = [];
    const startTime = Date.now();
    let completed = 0;
    let failed = 0;
    let active = 0;

    // Progress tracking helper
    const reportProgress = (): void => {
      if (!options.onProgress) {
        return;
      }
      const elapsedTime = Date.now() - startTime;
      const finishedDocs = completed + failed;
      const avgTimePerDoc = finishedDocs > 0 ? elapsedTime / finishedDocs : 0;
      const eta = finishedDocs > 0 ? (total - finishedDocs) * avgTimePerDoc : 0;

      options.onProgress({
        total,
        completed,
        failed,
        current: finishedDocs,
        progress: total > 0 ? finishedDocs / total : 0,
        eta: Math.max(0, eta),
        activeOperations: active,
        startTime,
        elapsedTime,
      });
    };

    // Memory monitoring helper
    const checkMemory = async (): Promise<void> => {
      const memUsageMB = process.memoryUsage().heapUsed / 1024 / 1024;
      this.stats.peakMemory = Math.max(this.stats.peakMemory, memUsageMB);

      if (memUsageMB > maxMemoryMB) {
        // Wait a bit for garbage collection
        await new Promise<void>((resolve) => setTimeout(resolve, 100));
      }
    };

    // Runs one document against the timeout and records its outcome.
    const runOne = async (doc: BatchDocument, docIndex: number): Promise<void> => {
      const docStartTime = Date.now();
      let timer: ReturnType<typeof setTimeout> | undefined;
      try {
        const result = await Promise.race([
          processor(doc, docIndex),
          new Promise<BatchResult<T>>((_, reject) => {
            timer = setTimeout(
              () => reject(new Error('Timeout after ' + timeout + 'ms')),
              timeout
            );
          }),
        ]);

        result.duration = Date.now() - docStartTime;
        results[docIndex] = result;

        if (result.success) {
          completed++;
        } else {
          failed++;
        }
      } catch (error) {
        results[docIndex] = {
          document: doc,
          success: false,
          error: BatchManager.toError(error),
          duration: Date.now() - docStartTime,
        };
        failed++;
      } finally {
        // Without this the pending timer keeps the event loop alive for up to
        // `timeout` ms after the batch has finished.
        clearTimeout(timer);
        active--;
        reportProgress();
      }
    };

    let nextIndex = 0;
    let aborted = false;

    // Each worker repeatedly claims the next unprocessed document.
    const worker = async (): Promise<void> => {
      try {
        while (!aborted && nextIndex < total) {
          // Check memory before starting a new operation
          await checkMemory();
          // Re-check: other workers may have claimed the rest while we waited.
          if (aborted || nextIndex >= total) {
            break;
          }
          const docIndex = nextIndex++;
          const doc = documents[docIndex];
          if (doc === undefined) {
            continue;
          }
          active++;
          await runOne(doc, docIndex);
        }
      } catch (error) {
        // Only a throwing progress callback gets here; stop handing out work.
        aborted = true;
        throw error;
      }
    };

    const workerCount = Math.min(maxParallel, total);
    const outcomes = await Promise.allSettled(
      Array.from({ length: workerCount }, () => worker())
    );

    // Update final statistics
    const totalTime = Date.now() - startTime;
    this.stats.totalTime = totalTime;
    this.stats.completed = completed;
    this.stats.failed = failed;
    this.stats.averageTime = completed > 0 ? totalTime / completed : 0;
    this.stats.throughput = totalTime > 0 ? (completed / totalTime) * 1000 : 0;

    for (const outcome of outcomes) {
      if (outcome.status === 'rejected') {
        throw outcome.reason;
      }
    }

    return results;
  }

  /**
   * Opens `doc`, runs `work` on it and wraps the outcome in a `BatchResult`.
   *
   * The document is closed on both the success and the failure path. Any
   * error (import, open, work, close after success) is converted into a
   * failed result; if `work` fails, a secondary error from `close` is
   * suppressed so the original failure is the one reported.
   *
   * @param doc - Document to open
   * @param work - Operation to run on the opened document
   * @returns Result with `duration: 0`; the queue fills in the real duration
   * @private
   */
  private async withDocument<T>(
    doc: BatchDocument,
    work: (pdfDoc: any) => T | Promise<T>
  ): Promise<BatchResult<T>> {
    try {
      // Dynamic import to avoid circular dependencies
      const { PdfDocument } = await import('../index.js');
      const pdfDoc: any = PdfDocument.open(doc.path);

      let data: T;
      try {
        data = await work(pdfDoc);
      } catch (workError) {
        try {
          BatchManager.closeDocument(pdfDoc);
        } catch {
          // Keep the original failure; the close error is secondary.
        }
        throw workError;
      }
      BatchManager.closeDocument(pdfDoc);

      return {
        document: doc,
        success: true,
        data,
        duration: 0,
      };
    } catch (error) {
      return {
        document: doc,
        success: false,
        error: BatchManager.toError(error),
        duration: 0,
      };
    }
  }

  /**
   * Extract text from multiple documents in parallel
   * @param options - Batch processing options
   * @returns Array of extraction results
   */
  async extractTextBatch(
    options: BatchOptions = {}
  ): Promise<BatchResult<string>[]> {
    return this.processQueue(
      (doc) =>
        this.withDocument<string>(doc, (pdfDoc) =>
          BatchManager.extractionManagerFor(pdfDoc).extractAllText()
        ),
      options
    );
  }

  /**
   * Extract markdown from multiple documents in parallel.
   * Pages are joined with a `---` separator surrounded by blank lines.
   * @param options - Batch processing options
   * @returns Array of extraction results
   */
  async extractMarkdownBatch(
    options: BatchOptions = {}
  ): Promise<BatchResult<string>[]> {
    return this.processQueue(
      (doc) =>
        this.withDocument<string>(doc, (pdfDoc) => {
          const extractionMgr = BatchManager.extractionManagerFor(pdfDoc);

          // Extract markdown from all pages
          const pageCount = pdfDoc.pageCount || 0;
          const pages: string[] = [];
          for (let i = 0; i < pageCount; i++) {
            pages.push(extractionMgr.extractMarkdown(i) || '');
          }
          return pages.join('\n\n---\n\n');
        }),
      options
    );
  }

  /**
   * Extract HTML from multiple documents in parallel.
   * Each page is wrapped in `<div class="page page-N">` (N is 1-based).
   * @param options - Batch processing options
   * @returns Array of extraction results
   */
  async extractHtmlBatch(
    options: BatchOptions = {}
  ): Promise<BatchResult<string>[]> {
    return this.processQueue(
      (doc) =>
        this.withDocument<string>(doc, (pdfDoc) => {
          const extractionMgr = BatchManager.extractionManagerFor(pdfDoc);

          // Extract HTML from all pages
          let html = '<html><body>';
          const pageCount = pdfDoc.pageCount || 0;
          for (let i = 0; i < pageCount; i++) {
            html += '<div class="page page-' + (i + 1) + '">';
            html += extractionMgr.extractHtml(i) || '';
            html += '</div>';
          }
          html += '</body></html>';
          return html;
        }),
      options
    );
  }

  /**
   * Search for a term in multiple documents in parallel
   * @param searchTerm - Term to search for
   * @param options - Batch processing options
   * @returns Array of search results; each lists the pages (0-based index as
   *          passed to the search manager) that had at least one match
   * @throws Error if `searchTerm` is empty or not a string
   */
  async searchBatch(
    searchTerm: string,
    options: BatchOptions = {}
  ): Promise<BatchResult<Array<{ page: number; count: number }>>[]> {
    if (!searchTerm || typeof searchTerm !== 'string') {
      throw new Error('Search term must be a non-empty string');
    }

    return this.processQueue(
      (doc) =>
        this.withDocument<Array<{ page: number; count: number }>>(doc, (pdfDoc) => {
          const searchMgr = pdfDoc.createSearchManager?.();
          if (!searchMgr) {
            throw new Error('Failed to create search manager');
          }

          const hits: Array<{ page: number; count: number }> = [];
          const pageCount = pdfDoc.pageCount || 0;
          for (let i = 0; i < pageCount; i++) {
            const matches = searchMgr.search(searchTerm, i) || [];
            if (matches.length > 0) {
              hits.push({ page: i, count: matches.length });
            }
          }
          return hits;
        }),
      options
    );
  }

  /**
   * Generic batch processor for custom operations
   * @param processor - Function to process each document; receives the batch
   *                    entry and the opened document
   * @param options - Batch processing options
   * @returns Array of results
   */
  async processBatch<T>(
    processor: (doc: BatchDocument, pdfDoc: any) => Promise<T>,
    options: BatchOptions = {}
  ): Promise<BatchResult<T>[]> {
    return this.processQueue(
      (doc) => this.withDocument<T>(doc, (pdfDoc) => processor(doc, pdfDoc)),
      options
    );
  }
}