pdf-oxide 0.3.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -0
- package/binding.gyp +35 -0
- package/package.json +78 -0
- package/src/builders/annotation-builder.ts +367 -0
- package/src/builders/conversion-options-builder.ts +257 -0
- package/src/builders/index.ts +12 -0
- package/src/builders/metadata-builder.ts +317 -0
- package/src/builders/pdf-builder.ts +386 -0
- package/src/builders/search-options-builder.ts +151 -0
- package/src/document-editor-manager.ts +318 -0
- package/src/errors.ts +1629 -0
- package/src/form-field-manager.ts +666 -0
- package/src/hybrid-ml-manager.ts +283 -0
- package/src/index.ts +453 -0
- package/src/managers/accessibility-manager.ts +338 -0
- package/src/managers/annotation-manager.ts +439 -0
- package/src/managers/barcode-manager.ts +235 -0
- package/src/managers/batch-manager.ts +533 -0
- package/src/managers/cache-manager.ts +486 -0
- package/src/managers/compliance-manager.ts +375 -0
- package/src/managers/content-manager.ts +339 -0
- package/src/managers/document-utility-manager.ts +922 -0
- package/src/managers/dom-pdf-creator.ts +365 -0
- package/src/managers/editing-manager.ts +514 -0
- package/src/managers/enterprise-manager.ts +478 -0
- package/src/managers/extended-managers.ts +437 -0
- package/src/managers/extraction-manager.ts +583 -0
- package/src/managers/final-utilities.ts +429 -0
- package/src/managers/hybrid-ml-advanced.ts +479 -0
- package/src/managers/index.ts +239 -0
- package/src/managers/layer-manager.ts +500 -0
- package/src/managers/metadata-manager.ts +303 -0
- package/src/managers/ocr-manager.ts +756 -0
- package/src/managers/optimization-manager.ts +262 -0
- package/src/managers/outline-manager.ts +196 -0
- package/src/managers/page-manager.ts +289 -0
- package/src/managers/pattern-detection.ts +440 -0
- package/src/managers/rendering-manager.ts +863 -0
- package/src/managers/search-manager.ts +385 -0
- package/src/managers/security-manager.ts +345 -0
- package/src/managers/signature-manager.ts +1664 -0
- package/src/managers/streams.ts +618 -0
- package/src/managers/xfa-manager.ts +500 -0
- package/src/pdf-creator-manager.ts +494 -0
- package/src/properties.ts +522 -0
- package/src/result-accessors-manager.ts +867 -0
- package/src/tests/advanced-features.test.ts +414 -0
- package/src/tests/advanced.test.ts +266 -0
- package/src/tests/extended-managers.test.ts +316 -0
- package/src/tests/final-utilities.test.ts +455 -0
- package/src/tests/foundation.test.ts +315 -0
- package/src/tests/high-demand.test.ts +257 -0
- package/src/tests/specialized.test.ts +97 -0
- package/src/thumbnail-manager.ts +272 -0
- package/src/types/common.ts +142 -0
- package/src/types/document-types.ts +457 -0
- package/src/types/index.ts +6 -0
- package/src/types/manager-types.ts +284 -0
- package/src/types/native-bindings.ts +517 -0
- package/src/workers/index.ts +7 -0
- package/src/workers/pool.ts +274 -0
- package/src/workers/worker.ts +131 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Worker Thread Pool Manager
|
|
3
|
+
* Enables non-blocking parallel PDF processing
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { Worker } from 'worker_threads';
|
|
7
|
+
import os from 'os';
|
|
8
|
+
import path from 'path';
|
|
9
|
+
import { fileURLToPath } from 'url';
|
|
10
|
+
|
|
11
|
+
// ES modules have no built-in `__dirname`; derive this module's directory from
// its URL so the pool can resolve 'worker.js' relative to this file.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
12
|
+
|
|
13
|
+
/**
 * One unit of work submitted to the worker pool.
 *
 * The type parameter `T` is not referenced by any member of this interface.
 */
export interface WorkerTask<T = any> {
  /** Kind of processing the worker should perform. */
  operation:
    | 'extract'
    | 'search'
    | 'render'
    | 'analyze';

  /** Path of the PDF document the task operates on. */
  documentPath: string;

  /** Operation-specific arguments, keyed by name. */
  params: Record<string, any>;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Outcome reported for a single worker task.
 */
export interface WorkerResult<T = any> {
  /** `true` when the operation completed, `false` when it failed. */
  success: boolean;

  /** Payload produced by the operation; optional. */
  data?: T;

  /** Failure details, either an error-shaped value or a plain message; optional. */
  error?: Error | string;

  /** Duration reported for the task (a number; the unit is set by the producer). */
  duration: number;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Internal bookkeeping record for a submitted task: the task itself, the
 * settle callbacks of the promise handed to the caller, and the timer that
 * enforces the task's timeout.
 */
interface QueuedTask {
  /** The work item to hand to a worker. */
  task: WorkerTask<any>;

  /** Fulfils the caller's promise with the worker's result. */
  resolve: (value: WorkerResult<any>) => void;

  /** Rejects the caller's promise. */
  reject: (error: Error) => void;

  /** Handle of the timeout timer; cleared when the task settles. */
  timeout: NodeJS.Timeout;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Thread pool for parallel PDF processing.
 *
 * Tasks are queued in FIFO order and handed to the first idle worker. Each
 * worker runs at most one task at a time, so a reply from a worker always
 * belongs to the task recorded for it in `inFlight`.
 *
 * Failure handling:
 * - A task that times out while running causes its worker to be terminated
 *   (the worker is still busy with the stale job and its late reply must not
 *   be attributed to another task). A replacement is started on demand.
 * - A worker that errors or exits rejects only the task it was running and is
 *   removed from the pool; a replacement is started when the next task needs it.
 * - Idle workers are `unref()`ed so that an idle pool does not keep the
 *   process alive; a worker is `ref()`ed again while it runs a task.
 */
export class WorkerPool {
  /** Live worker threads owned by the pool (idle and busy). */
  private workers: Worker[] = [];
  /** Subset of `workers` that is not running a task. */
  private idleWorkers: Worker[] = [];
  /** Task currently assigned to each busy worker. */
  private inFlight = new Map<Worker, QueuedTask>();
  /** Tasks waiting for a free worker, oldest first. */
  private queue: QueuedTask[] = [];
  /** Number of tasks currently running; mirrors `inFlight.size`. */
  private activeCount = 0;
  private terminated = false;
  private readonly defaultTimeout = 30000; // 30 seconds
  /** Monotonic counter used only to label workers in log messages. */
  private nextWorkerId = 0;

  /**
   * Initialize the worker pool
   * @param poolSize - Number of worker threads to create (1 to 32)
   * @throws Error if `poolSize` is out of range or a worker cannot be started
   */
  constructor(private poolSize: number = 4) {
    this.validatePoolSize();
    this.initializeWorkers();
  }

  /** Rejects sizes outside 1..32; written so that `NaN` is rejected as well. */
  private validatePoolSize(): void {
    const size = this.poolSize;
    if (!(size >= 1 && size <= 32)) {
      throw new Error(`Pool size must be between 1 and 32, got ${size}`);
    }
  }

  /** Starts the initial set of workers; on failure stops the ones already started. */
  private initializeWorkers(): void {
    try {
      for (let i = 0; i < this.poolSize; i++) {
        this.markIdle(this.spawnWorker());
      }
    } catch (error) {
      // The instance is unusable: flag it so exit listeners stay quiet, and
      // do not leak the threads that did start.
      this.terminated = true;
      for (const worker of this.workers) {
        void worker.terminate().catch(() => undefined);
      }
      this.cleanup();
      throw new Error(
        `Failed to initialize worker pool: ${
          error instanceof Error ? error.message : String(error)
        }`
      );
    }
  }

  /** Creates the underlying thread; the only place that touches the worker script path. */
  private createThread(): Worker {
    return new Worker(path.join(__dirname, 'worker.js'));
  }

  /**
   * Starts a worker, wires its lifecycle listeners once, and registers it in
   * `workers`. The caller decides whether it becomes idle or gets a task.
   */
  private spawnWorker(): Worker {
    const id = this.nextWorkerId++;
    const worker = this.createThread();

    worker.on('message', (result: WorkerResult<any>) => {
      const entry = this.detachTask(worker);
      if (!entry) return; // reply for a task that was already settled
      this.markIdle(worker);
      entry.resolve(result);
      this.processQueue();
    });

    worker.on('error', (error: Error) => {
      console.error(`Worker ${id} error:`, error);
      this.discardWorker(worker, error);
    });

    worker.on('exit', (code: number) => {
      if (code !== 0 && !this.terminated) {
        console.warn(`Worker ${id} exited with code ${code}`);
      }
      this.discardWorker(
        worker,
        new Error(`Worker ${id} exited with code ${code} before completing its task`)
      );
    });

    this.workers.push(worker);
    return worker;
  }

  /** Returns a worker to the idle set unless the pool or the worker is gone. */
  private markIdle(worker: Worker): void {
    if (this.terminated || !this.workers.includes(worker)) return;
    worker.unref(); // an idle worker must not keep the process alive
    this.idleWorkers.push(worker);
  }

  /**
   * Unlinks the task a worker is running (if any), clears its timeout and
   * frees the active slot. Settling the promise is left to the caller.
   */
  private detachTask(worker: Worker): QueuedTask | undefined {
    const entry = this.inFlight.get(worker);
    if (!entry) return undefined;
    this.inFlight.delete(worker);
    this.activeCount--;
    clearTimeout(entry.timeout);
    return entry;
  }

  /**
   * Removes a worker from the pool and rejects the task it was running with
   * `reason`. Safe to call more than once for the same worker ('error' is
   * followed by 'exit').
   */
  private discardWorker(worker: Worker, reason: Error): void {
    const position = this.workers.indexOf(worker);
    if (position !== -1) this.workers.splice(position, 1);
    this.idleWorkers = this.idleWorkers.filter((idle) => idle !== worker);

    this.detachTask(worker)?.reject(reason);

    if (!this.terminated) this.processQueue();
  }

  /**
   * Run a task in the worker pool
   * @param task - The task to run
   * @param timeout - Optional timeout in milliseconds (1000 to 300000)
   * @returns Promise that resolves with the result
   * @throws Error (as a rejection) if the pool is terminated, the timeout is
   *   out of range, the task times out, or its worker fails
   */
  public async runTask<T = any>(
    task: WorkerTask<T>,
    timeout: number = this.defaultTimeout
  ): Promise<WorkerResult<T>> {
    if (this.terminated) {
      throw new Error('Worker pool has been terminated');
    }

    // Negated range test so that NaN is rejected too.
    if (!(timeout >= 1000 && timeout <= 300000)) {
      throw new Error('Timeout must be between 1 and 300 seconds');
    }

    return new Promise<WorkerResult<T>>((resolve, reject) => {
      const entry: QueuedTask = {
        task,
        resolve,
        reject,
        timeout: setTimeout(() => this.expire(entry, timeout), timeout),
      };

      this.queue.push(entry);
      this.processQueue();
    });
  }

  /** Timeout handler: fails the task whether it is still queued or already running. */
  private expire(entry: QueuedTask, timeout: number): void {
    const { task } = entry;
    const error = new Error(
      `Worker task timeout after ${timeout}ms: ${task.operation} on ${task.documentPath}`
    );

    const queuedAt = this.queue.indexOf(entry);
    if (queuedAt !== -1) {
      this.queue.splice(queuedAt, 1);
      entry.reject(error);
      return;
    }

    for (const [worker, running] of this.inFlight) {
      if (running !== entry) continue;
      // The worker is still busy with the stale job: drop it from the pool
      // (rejecting the task and freeing the slot) and stop the thread.
      this.discardWorker(worker, error);
      void worker
        .terminate()
        .catch((reason) => console.warn('Error terminating worker:', reason));
      return;
    }
  }

  /** Hands queued tasks to idle workers, starting replacements when below `poolSize`. */
  private processQueue(): void {
    while (!this.terminated) {
      const next = this.queue[0];
      if (!next) return;

      let worker = this.idleWorkers.pop();
      if (!worker) {
        if (this.workers.length >= this.poolSize) return; // every slot is busy
        try {
          worker = this.spawnWorker();
        } catch (error) {
          console.error('Failed to start replacement worker:', error);
        }
      }

      this.queue.shift();

      if (!worker) {
        clearTimeout(next.timeout);
        next.reject(new Error('No available worker'));
        continue;
      }

      this.dispatch(worker, next);
    }
  }

  /** Records the assignment and posts the task to the worker. */
  private dispatch(worker: Worker, entry: QueuedTask): void {
    this.inFlight.set(worker, entry);
    this.activeCount++;
    worker.ref(); // keep the process alive while work is outstanding

    try {
      worker.postMessage(entry.task);
    } catch (error) {
      // The task could not be sent (e.g. it is not cloneable); the worker
      // never saw it and remains usable.
      this.detachTask(worker);
      this.markIdle(worker);
      entry.reject(error instanceof Error ? error : new Error(String(error)));
    }
  }

  /**
   * Terminate all workers
   *
   * Queued and running tasks are rejected with 'Worker pool terminated'.
   * @returns Promise that resolves when all workers are terminated
   */
  public async terminate(): Promise<void> {
    this.terminated = true;

    // Reject all queued tasks
    for (const entry of this.queue.splice(0)) {
      clearTimeout(entry.timeout);
      entry.reject(new Error('Worker pool terminated'));
    }

    // Reject tasks that are currently running so their promises do not hang
    for (const worker of [...this.inFlight.keys()]) {
      this.detachTask(worker)?.reject(new Error('Worker pool terminated'));
    }

    // Terminate all workers (snapshot: exit listeners mutate `workers`)
    const doomed = [...this.workers];
    await Promise.all(
      doomed.map((worker) =>
        worker
          .terminate()
          .catch((error) => console.warn('Error terminating worker:', error))
      )
    );

    this.cleanup();
  }

  /** Drops all bookkeeping; does not stop threads. */
  private cleanup(): void {
    this.workers = [];
    this.idleWorkers = [];
    this.inFlight.clear();
    this.queue = [];
    this.activeCount = 0;
  }

  /**
   * Get current pool statistics
   */
  public getStats(): {
    poolSize: number;
    activeWorkers: number;
    queuedTasks: number;
    terminated: boolean;
  } {
    return {
      poolSize: this.poolSize,
      activeWorkers: this.activeCount,
      queuedTasks: this.queue.length,
      terminated: this.terminated,
    };
  }
}
|
|
253
|
+
|
|
254
|
+
/**
 * Global worker pool instance (singleton)
 * Auto-configured based on CPU count: one worker per reported CPU, at least 1
 * and at most 8. The pool (and its worker threads) is created when this
 * module is first imported.
 */
const hardwareConcurrency = Math.max(1, os.cpus().length);

export const workerPool = new WorkerPool(Math.min(hardwareConcurrency, 8));

/**
 * Best-effort shutdown.
 *
 * 'exit' listeners run synchronously and the process ends right after they
 * return, so nothing can be awaited here. Only the synchronous part of
 * `terminate()` takes effect; the rejection handler is attached merely so a
 * failure is never left unhandled.
 */
process.on('exit', () => {
  if (workerPool.getStats().terminated) return;

  void workerPool.terminate().catch((error) => {
    console.error('Error during worker pool shutdown:', error);
  });
});
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Worker Thread Script
|
|
3
|
+
* Handles off-main-thread PDF processing tasks
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { parentPort, workerData } from 'worker_threads';
|
|
7
|
+
import type { WorkerTask, WorkerResult } from './pool.js';
|
|
8
|
+
|
|
9
|
+
// Types for operations - will be available at runtime via the PdfDocument
|
|
10
|
+
// NOTE(review): this type alias appears unused in the visible code — handleTask
// binds its own `PdfDocument` value from a dynamic import. Confirm before removing.
type PdfDocument = any;
|
|
11
|
+
|
|
12
|
+
/**
 * Execute one pool task inside the worker thread.
 *
 * Never rejects: every failure (module load, opening the document, the
 * operation itself, an unknown operation name) is converted into a result
 * with `success: false`.
 *
 * @param task - Operation name, document path and operation-specific params
 * @returns The operation's output plus the elapsed wall-clock time in ms
 */
async function handleTask(task: WorkerTask<any>): Promise<WorkerResult<any>> {
  const startedAt = Date.now();

  try {
    // The PDF API is loaded lazily rather than through a top-level import.
    const { PdfDocument } = await import('../index.js');
    if (!PdfDocument) {
      throw new Error('PdfDocument not available in worker');
    }

    let data: any;

    if (task.operation === 'extract') {
      const extraction = PdfDocument.open(task.documentPath).extraction;
      const format = task.params.type;

      // Anything other than 'markdown' or 'html' falls back to plain text.
      if (format === 'markdown') {
        data = extraction.extractMarkdown(task.params.pageIndex, task.params.options);
      } else if (format === 'html') {
        data = extraction.extractHtml(task.params.pageIndex, task.params.options);
      } else {
        data = extraction.extractText(task.params.pageIndex, task.params.options);
      }
    } else if (task.operation === 'search') {
      const searcher = PdfDocument.open(task.documentPath).search;
      data = searcher.searchAll(task.params.query, task.params.options || {});
    } else if (task.operation === 'render') {
      const renderer = PdfDocument.open(task.documentPath).rendering;
      data = renderer.renderPage(task.params.pageIndex, task.params.options || {});
    } else if (task.operation === 'analyze') {
      const pdf = PdfDocument.open(task.documentPath);

      // Managers and their methods are probed optionally; missing ones
      // yield the fallback value instead of throwing.
      data = {
        pageCount: pdf.pageCount,
        metadata: pdf.metadata?.getMetadata?.() || null,
        outline: {
          count: pdf.outline?.getOutlineCount?.() || 0,
          isFlat: pdf.outline?.isFlat?.() || false,
        },
        layers: {
          count: pdf.layers?.getLayerCount?.() || 0,
          visible: pdf.layers?.getVisibleLayerCount?.() || 0,
        },
      };
    } else {
      throw new Error(`Unknown operation: ${task.operation}`);
    }

    const elapsed = Date.now() - startedAt;
    return { success: true, data, duration: elapsed };
  } catch (error) {
    const elapsed = Date.now() - startedAt;

    // Errors are flattened to plain data so they can be posted to the parent.
    const failure =
      error instanceof Error
        ? {
            name: error.name,
            message: error.message,
            stack: error.stack,
          }
        : String(error);

    return { success: false, error: failure, duration: elapsed };
  }
}
|
|
119
|
+
|
|
120
|
+
/**
 * Main worker message handler
 *
 * Each incoming message is treated as a task; exactly one reply is posted
 * back per task. If the result itself cannot be posted (for example because
 * it is not structured-cloneable), a failure result carrying the reason is
 * posted instead, so the parent still receives an answer and this worker
 * keeps running rather than dying on an unhandled rejection.
 */
if (parentPort) {
  const port = parentPort;

  port.on('message', async (task: WorkerTask<any>) => {
    const result = await handleTask(task);

    try {
      port.postMessage(result);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      const failure: WorkerResult<any> = {
        success: false,
        error: `Failed to send worker result: ${reason}`,
        duration: result.duration,
      };
      port.postMessage(failure);
    }
  });
} else {
  // Loaded outside a Worker: there is no parent to talk to.
  console.error('Worker script must be run as a Worker thread');
  process.exit(1);
}
|