pdf-oxide 0.3.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -0
- package/binding.gyp +35 -0
- package/package.json +78 -0
- package/src/builders/annotation-builder.ts +367 -0
- package/src/builders/conversion-options-builder.ts +257 -0
- package/src/builders/index.ts +12 -0
- package/src/builders/metadata-builder.ts +317 -0
- package/src/builders/pdf-builder.ts +386 -0
- package/src/builders/search-options-builder.ts +151 -0
- package/src/document-editor-manager.ts +318 -0
- package/src/errors.ts +1629 -0
- package/src/form-field-manager.ts +666 -0
- package/src/hybrid-ml-manager.ts +283 -0
- package/src/index.ts +453 -0
- package/src/managers/accessibility-manager.ts +338 -0
- package/src/managers/annotation-manager.ts +439 -0
- package/src/managers/barcode-manager.ts +235 -0
- package/src/managers/batch-manager.ts +533 -0
- package/src/managers/cache-manager.ts +486 -0
- package/src/managers/compliance-manager.ts +375 -0
- package/src/managers/content-manager.ts +339 -0
- package/src/managers/document-utility-manager.ts +922 -0
- package/src/managers/dom-pdf-creator.ts +365 -0
- package/src/managers/editing-manager.ts +514 -0
- package/src/managers/enterprise-manager.ts +478 -0
- package/src/managers/extended-managers.ts +437 -0
- package/src/managers/extraction-manager.ts +583 -0
- package/src/managers/final-utilities.ts +429 -0
- package/src/managers/hybrid-ml-advanced.ts +479 -0
- package/src/managers/index.ts +239 -0
- package/src/managers/layer-manager.ts +500 -0
- package/src/managers/metadata-manager.ts +303 -0
- package/src/managers/ocr-manager.ts +756 -0
- package/src/managers/optimization-manager.ts +262 -0
- package/src/managers/outline-manager.ts +196 -0
- package/src/managers/page-manager.ts +289 -0
- package/src/managers/pattern-detection.ts +440 -0
- package/src/managers/rendering-manager.ts +863 -0
- package/src/managers/search-manager.ts +385 -0
- package/src/managers/security-manager.ts +345 -0
- package/src/managers/signature-manager.ts +1664 -0
- package/src/managers/streams.ts +618 -0
- package/src/managers/xfa-manager.ts +500 -0
- package/src/pdf-creator-manager.ts +494 -0
- package/src/properties.ts +522 -0
- package/src/result-accessors-manager.ts +867 -0
- package/src/tests/advanced-features.test.ts +414 -0
- package/src/tests/advanced.test.ts +266 -0
- package/src/tests/extended-managers.test.ts +316 -0
- package/src/tests/final-utilities.test.ts +455 -0
- package/src/tests/foundation.test.ts +315 -0
- package/src/tests/high-demand.test.ts +257 -0
- package/src/tests/specialized.test.ts +97 -0
- package/src/thumbnail-manager.ts +272 -0
- package/src/types/common.ts +142 -0
- package/src/types/document-types.ts +457 -0
- package/src/types/index.ts +6 -0
- package/src/types/manager-types.ts +284 -0
- package/src/types/native-bindings.ts +517 -0
- package/src/workers/index.ts +7 -0
- package/src/workers/pool.ts +274 -0
- package/src/workers/worker.ts +131 -0
|
@@ -0,0 +1,533 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Batch Processing Manager for Parallel Document Operations
|
|
3
|
+
*
|
|
4
|
+
* Enables efficient parallel processing of multiple PDF documents with:
|
|
5
|
+
* - Configurable concurrency control
|
|
6
|
+
* - Real-time progress tracking with ETA calculation
|
|
7
|
+
* - Memory-aware backpressure handling
|
|
8
|
+
* - Per-document timeout support
|
|
9
|
+
* - Error resilience and detailed reporting
|
|
10
|
+
*
|
|
11
|
+
* @example
|
|
12
|
+
* ```typescript
|
|
13
|
+
* import { BatchManager } from 'pdf-oxide';
|
|
14
|
+
* import { PdfDocument } from 'pdf-oxide';
|
|
15
|
+
*
|
|
16
|
+
* const batch = new BatchManager([
|
|
17
|
+
* { path: 'doc1.pdf' },
|
|
18
|
+
* { path: 'doc2.pdf' },
|
|
19
|
+
* { path: 'doc3.pdf' }
|
|
20
|
+
* ]);
|
|
21
|
+
*
|
|
22
|
+
* // Extract text from multiple documents in parallel
|
|
23
|
+
* const results = await batch.extractTextBatch({
|
|
24
|
+
* maxParallel: 4,
|
|
25
|
+
* timeout: 30000,
|
|
26
|
+
* onProgress: (progress) => {
|
|
27
|
+
* console.log('Progress: ' + Math.round(progress.progress * 100) + '%');
|
|
28
|
+
* console.log('ETA: ' + progress.eta + 'ms');
|
|
29
|
+
* }
|
|
30
|
+
* });
|
|
31
|
+
*
|
|
32
|
+
* results.forEach(result => {
|
|
33
|
+
* if (result.success) {
|
|
34
|
+
* console.log(result.document.path + ': ' + result.data.length + ' chars');
|
|
35
|
+
* } else {
|
|
36
|
+
* console.error(result.document.path + ': ' + result.error.message);
|
|
37
|
+
* }
|
|
38
|
+
* });
|
|
39
|
+
* ```
|
|
40
|
+
*/
|
|
41
|
+
|
|
42
|
+
import os from 'os';
|
|
43
|
+
import type { PdfErrorDetails } from '../types/common';
|
|
44
|
+
|
|
45
|
+
/**
 * A single input of a batch run: one PDF file plus optional bookkeeping
 * fields that travel with it into the corresponding result.
 */
export interface BatchDocument {
  /**
   * File path to the PDF document.
   * Must be a non-empty string; the BatchManager constructor rejects anything else.
   */
  path: string;

  /**
   * Optional unique identifier for the document.
   */
  id?: string;

  /**
   * Priority hint (documented range 1-10, default 5).
   */
  priority?: number;

  /**
   * Optional free-form metadata attached to the document.
   */
  metadata?: Record<string, any>;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Snapshot handed to the `onProgress` callback each time a document
 * finishes (successfully or not).
 */
export interface BatchProgress {
  /**
   * Total number of documents in the batch.
   */
  total: number;

  /**
   * Number of documents that finished successfully so far.
   */
  completed: number;

  /**
   * Number of documents that failed so far.
   */
  failed: number;

  /**
   * Number of documents finished so far (`completed + failed`).
   */
  current: number;

  /**
   * Fraction of the batch that is finished, from 0.0 to 1.0.
   */
  progress: number;

  /**
   * Estimated time remaining in milliseconds, extrapolated from the average
   * elapsed time per finished document.
   */
  eta: number;

  /**
   * Number of operations that are currently running.
   */
  activeOperations: number;

  /**
   * Batch start time (milliseconds since epoch).
   */
  startTime: number;

  /**
   * Milliseconds elapsed since `startTime`.
   */
  elapsedTime: number;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Outcome of processing one document of a batch.
 *
 * `data` is populated on success and `error` on failure; `success` tells
 * which of the two to read.
 */
export interface BatchResult<T = any> {
  /**
   * The document this result belongs to.
   */
  document: BatchDocument;

  /**
   * `true` when the operation succeeded, `false` otherwise.
   */
  success: boolean;

  /**
   * Payload produced by the operation, present when it succeeded.
   */
  data?: T;

  /**
   * The failure, present when the operation did not succeed.
   */
  error?: Error;

  /**
   * Time spent processing this document, in milliseconds.
   */
  duration: number;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Tuning knobs accepted by every batch operation. All fields are optional.
 */
export interface BatchOptions {
  /**
   * Maximum number of operations running at the same time
   * (default: number of CPUs).
   */
  maxParallel?: number;

  /**
   * Per-document timeout in milliseconds (default: 30000).
   */
  timeout?: number;

  /**
   * Callback invoked each time a document finishes, whether it succeeded
   * or failed.
   */
  onProgress?: (progress: BatchProgress) => void;

  /**
   * Backpressure configuration for memory management.
   */
  backpressure?: {
    /**
     * Memory usage ceiling in MB (default: 500).
     */
    maxMemoryMB?: number;

    /**
     * Interval between memory checks in ms (default: 1000).
     */
    checkInterval?: number;
  };
}
|
|
117
|
+
|
|
118
|
+
/**
 * Aggregate figures describing a batch run, as returned by
 * `BatchManager.getStatistics()`.
 */
export interface BatchStatistics {
  /**
   * Number of documents in the batch.
   */
  total: number;

  /**
   * Number of documents processed successfully.
   */
  completed: number;

  /**
   * Number of documents that failed.
   */
  failed: number;

  /**
   * Wall-clock time of the whole run, in milliseconds.
   */
  totalTime: number;

  /**
   * Average time per document in milliseconds
   * (`totalTime` divided by the number of successful documents).
   */
  averageTime: number;

  /**
   * Successfully processed documents per second.
   */
  throughput: number;

  /**
   * Highest heap usage observed during the run, in MB.
   */
  peakMemory: number;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Batch processor for parallel document operations.
 *
 * Holds a fixed list of documents and runs one operation over each of them
 * with bounded concurrency, a per-document timeout, progress callbacks and
 * aggregate statistics (see `getStatistics`).
 */
export class BatchManager {
  private documents: BatchDocument[];
  private stats: BatchStatistics = {
    total: 0,
    completed: 0,
    failed: 0,
    totalTime: 0,
    averageTime: 0,
    throughput: 0,
    peakMemory: 0,
  };

  /**
   * Creates a new BatchManager
   * @param documents - Array of documents to process
   * @throws Error if documents array is empty or invalid
   */
  constructor(documents: BatchDocument[]) {
    if (!Array.isArray(documents) || documents.length === 0) {
      throw new Error('Documents array must not be empty');
    }

    for (const doc of documents) {
      // `!doc` guards against null/undefined entries, which would otherwise
      // surface as an unrelated TypeError on the property access.
      if (!doc || !doc.path || typeof doc.path !== 'string') {
        throw new Error('Each document must have a valid path property');
      }
    }

    this.documents = documents;
    this.stats.total = documents.length;
  }

  /**
   * Get current statistics
   * @returns A shallow copy of the statistics, so callers cannot mutate internal state
   */
  getStatistics(): BatchStatistics {
    return { ...this.stats };
  }

  /**
   * Process documents in a queue with concurrency control.
   *
   * Every document yields exactly one entry in the returned array, at the
   * same index as the document in the input list. A processor that throws,
   * rejects or exceeds the timeout produces a failed result instead of
   * rejecting the whole batch. An exception thrown by `options.onProgress`
   * is not caught and rejects the returned promise.
   *
   * @param processor - Operation to run for each document
   * @param options - Batch processing options
   * @returns One result per document, in input order
   * @private
   */
  private async processQueue<T>(
    processor: (doc: BatchDocument, index: number) => Promise<BatchResult<T>>,
    options: BatchOptions = {}
  ): Promise<BatchResult<T>[]> {
    // `||` is kept deliberately so that 0/NaN fall back to the defaults, as before.
    // The result is clamped to at least 1: a non-positive limit (negative option,
    // or a platform reporting zero CPUs) would otherwise never start any work
    // and spin forever.
    const maxParallel = Math.max(
      1,
      Math.ceil(options.maxParallel || os.cpus().length || 1)
    );
    const timeout = options.timeout || 30000;
    const maxMemoryMB = options.backpressure?.maxMemoryMB || 500;

    const documents = this.documents;
    const results: BatchResult<T>[] = [];
    const startTime = Date.now();
    let completed = 0;
    let failed = 0;
    let active = 0;

    // Progress tracking helper
    const reportProgress = (): void => {
      if (!options.onProgress) {
        return;
      }
      const elapsedTime = Date.now() - startTime;
      const finished = completed + failed;
      const avgTimePerDoc = finished > 0 ? elapsedTime / finished : 0;
      const eta = finished > 0 ? (documents.length - finished) * avgTimePerDoc : 0;

      options.onProgress({
        total: documents.length,
        completed,
        failed,
        current: finished,
        progress: documents.length > 0 ? finished / documents.length : 0,
        eta: Math.max(0, eta),
        activeOperations: active,
        startTime,
        elapsedTime,
      });
    };

    // Memory monitoring helper
    const checkMemory = async (): Promise<void> => {
      const memUsageMB = process.memoryUsage().heapUsed / 1024 / 1024;
      this.stats.peakMemory = Math.max(this.stats.peakMemory, memUsageMB);

      if (memUsageMB > maxMemoryMB) {
        // Wait a bit for garbage collection
        await new Promise<void>((resolve) => setTimeout(resolve, 100));
      }
    };

    // Runs one document and records its outcome; never rejects on processor failure.
    const runOne = async (doc: BatchDocument, docIndex: number): Promise<void> => {
      const docStartTime = Date.now();
      let timer: ReturnType<typeof setTimeout> | undefined;
      try {
        const result = await Promise.race([
          processor(doc, docIndex),
          new Promise<BatchResult<T>>((_, reject) => {
            timer = setTimeout(
              () => reject(new Error('Timeout after ' + timeout + 'ms')),
              timeout
            );
          }),
        ]);

        result.duration = Date.now() - docStartTime;
        results[docIndex] = result;

        if (result.success) {
          completed++;
        } else {
          failed++;
        }
      } catch (error) {
        results[docIndex] = {
          document: doc,
          success: false,
          error: error instanceof Error ? error : new Error(String(error)),
          duration: Date.now() - docStartTime,
        };
        failed++;
      } finally {
        // Without this the pending timer keeps the event loop (and the
        // process) alive for up to `timeout` ms after the work has finished.
        clearTimeout(timer);
        active--;
        reportProgress();
      }
    };

    // Each task removes itself from this set when it settles, so the set
    // always holds exactly the operations that are still running.
    const inFlight = new Set<Promise<void>>();
    let nextIndex = 0;

    while (nextIndex < documents.length || inFlight.size > 0) {
      // Check memory before starting new operations
      await checkMemory();

      // Start new operations while under concurrency limit
      while (active < maxParallel && nextIndex < documents.length) {
        const docIndex = nextIndex++;
        const doc = documents[docIndex]!;

        active++;

        const task: Promise<void> = runOne(doc, docIndex).finally(() => {
          inFlight.delete(task);
        });
        inFlight.add(task);
      }

      // Wait for at least one operation to complete
      if (inFlight.size > 0) {
        await Promise.race(inFlight);
      }
    }

    // Update final statistics
    const totalTime = Date.now() - startTime;
    this.stats.totalTime = totalTime;
    this.stats.completed = completed;
    this.stats.failed = failed;
    this.stats.averageTime = completed > 0 ? totalTime / completed : 0;
    this.stats.throughput = totalTime > 0 ? (completed / totalTime) * 1000 : 0;

    return results;
  }

  /**
   * Opens the document, runs `work` against it and wraps the outcome in a
   * BatchResult. The document is closed whether `work` succeeds or fails, and
   * any error (import, open, work, close) is reported as a failed result
   * rather than thrown.
   *
   * @param doc - Document to open
   * @param work - Operation to run against the opened document
   * @returns A successful result carrying `work`'s value, or a failed result
   * @private
   */
  private async runOnDocument<T>(
    doc: BatchDocument,
    work: (pdfDoc: any) => T | Promise<T>
  ): Promise<BatchResult<T>> {
    try {
      // Dynamic import to avoid circular dependencies
      const { PdfDocument } = await import('../index.js');
      // Typed as `any` because the managers below are reached through
      // optional, dynamically probed members.
      const pdfDoc: any = PdfDocument.open(doc.path);

      let data: T;
      try {
        data = await work(pdfDoc);
      } catch (workError) {
        // Release the handle, but report the original failure rather than a
        // secondary error from close().
        try {
          if (typeof pdfDoc.close === 'function') {
            pdfDoc.close();
          }
        } catch {
          // Intentionally ignored: workError is the error worth surfacing.
        }
        throw workError;
      }

      if (typeof pdfDoc.close === 'function') {
        pdfDoc.close();
      }

      return {
        document: doc,
        success: true,
        data,
        duration: 0,
      };
    } catch (error) {
      return {
        document: doc,
        success: false,
        error: error instanceof Error ? error : new Error(String(error)),
        duration: 0,
      };
    }
  }

  /**
   * Extract text from multiple documents in parallel
   * @param options - Batch processing options
   * @returns Array of extraction results
   */
  async extractTextBatch(
    options: BatchOptions = {}
  ): Promise<BatchResult<string>[]> {
    return this.processQueue<string>(
      (doc, _index) =>
        this.runOnDocument<string>(doc, (pdfDoc) => {
          const extractionMgr = pdfDoc.createExtractionManager?.();
          if (!extractionMgr) {
            throw new Error('Failed to create extraction manager');
          }
          return extractionMgr.extractAllText();
        }),
      options
    );
  }

  /**
   * Extract markdown from multiple documents in parallel.
   * Pages are joined with a horizontal-rule separator.
   * @param options - Batch processing options
   * @returns Array of extraction results
   */
  async extractMarkdownBatch(
    options: BatchOptions = {}
  ): Promise<BatchResult<string>[]> {
    return this.processQueue<string>(
      (doc, _index) =>
        this.runOnDocument<string>(doc, (pdfDoc) => {
          const extractionMgr = pdfDoc.createExtractionManager?.();
          if (!extractionMgr) {
            throw new Error('Failed to create extraction manager');
          }

          // Extract markdown from all pages
          let markdown = '';
          const pageCount = pdfDoc.pageCount || 0;
          for (let i = 0; i < pageCount; i++) {
            markdown += extractionMgr.extractMarkdown(i) || '';
            if (i < pageCount - 1) {
              markdown += '\n\n---\n\n';
            }
          }
          return markdown;
        }),
      options
    );
  }

  /**
   * Extract HTML from multiple documents in parallel.
   * Each page is wrapped in its own `<div class="page page-N">` element.
   * @param options - Batch processing options
   * @returns Array of extraction results
   */
  async extractHtmlBatch(
    options: BatchOptions = {}
  ): Promise<BatchResult<string>[]> {
    return this.processQueue<string>(
      (doc, _index) =>
        this.runOnDocument<string>(doc, (pdfDoc) => {
          const extractionMgr = pdfDoc.createExtractionManager?.();
          if (!extractionMgr) {
            throw new Error('Failed to create extraction manager');
          }

          // Extract HTML from all pages
          let html = '<html><body>';
          const pageCount = pdfDoc.pageCount || 0;
          for (let i = 0; i < pageCount; i++) {
            html += '<div class="page page-' + (i + 1) + '">';
            html += extractionMgr.extractHtml(i) || '';
            html += '</div>';
          }
          html += '</body></html>';
          return html;
        }),
      options
    );
  }

  /**
   * Search for a term in multiple documents in parallel
   * @param searchTerm - Term to search for
   * @param options - Batch processing options
   * @returns Array of search results; each lists the pages (by loop index) with at least one match
   * @throws Error if searchTerm is not a non-empty string
   */
  async searchBatch(
    searchTerm: string,
    options: BatchOptions = {}
  ): Promise<BatchResult<Array<{ page: number; count: number }>>[]> {
    if (!searchTerm || typeof searchTerm !== 'string') {
      throw new Error('Search term must be a non-empty string');
    }

    return this.processQueue<Array<{ page: number; count: number }>>(
      (doc, _index) =>
        this.runOnDocument<Array<{ page: number; count: number }>>(doc, (pdfDoc) => {
          const searchMgr = pdfDoc.createSearchManager?.();
          if (!searchMgr) {
            throw new Error('Failed to create search manager');
          }

          const hits: Array<{ page: number; count: number }> = [];
          const pageCount = pdfDoc.pageCount || 0;
          for (let i = 0; i < pageCount; i++) {
            const matches = searchMgr.search(searchTerm, i) || [];
            if (matches.length > 0) {
              hits.push({ page: i, count: matches.length });
            }
          }
          return hits;
        }),
      options
    );
  }

  /**
   * Generic batch processor for custom operations
   * @param processor - Function to process each document; receives the batch entry and the opened document
   * @param options - Batch processing options
   * @returns Array of results
   */
  async processBatch<T>(
    processor: (doc: BatchDocument, pdfDoc: any) => Promise<T>,
    options: BatchOptions = {}
  ): Promise<BatchResult<T>[]> {
    return this.processQueue<T>(
      (doc, _index) => this.runOnDocument<T>(doc, (pdfDoc) => processor(doc, pdfDoc)),
      options
    );
  }
}
|