pdf-oxide 0.3.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -0
- package/binding.gyp +35 -0
- package/package.json +78 -0
- package/src/builders/annotation-builder.ts +367 -0
- package/src/builders/conversion-options-builder.ts +257 -0
- package/src/builders/index.ts +12 -0
- package/src/builders/metadata-builder.ts +317 -0
- package/src/builders/pdf-builder.ts +386 -0
- package/src/builders/search-options-builder.ts +151 -0
- package/src/document-editor-manager.ts +318 -0
- package/src/errors.ts +1629 -0
- package/src/form-field-manager.ts +666 -0
- package/src/hybrid-ml-manager.ts +283 -0
- package/src/index.ts +453 -0
- package/src/managers/accessibility-manager.ts +338 -0
- package/src/managers/annotation-manager.ts +439 -0
- package/src/managers/barcode-manager.ts +235 -0
- package/src/managers/batch-manager.ts +533 -0
- package/src/managers/cache-manager.ts +486 -0
- package/src/managers/compliance-manager.ts +375 -0
- package/src/managers/content-manager.ts +339 -0
- package/src/managers/document-utility-manager.ts +922 -0
- package/src/managers/dom-pdf-creator.ts +365 -0
- package/src/managers/editing-manager.ts +514 -0
- package/src/managers/enterprise-manager.ts +478 -0
- package/src/managers/extended-managers.ts +437 -0
- package/src/managers/extraction-manager.ts +583 -0
- package/src/managers/final-utilities.ts +429 -0
- package/src/managers/hybrid-ml-advanced.ts +479 -0
- package/src/managers/index.ts +239 -0
- package/src/managers/layer-manager.ts +500 -0
- package/src/managers/metadata-manager.ts +303 -0
- package/src/managers/ocr-manager.ts +756 -0
- package/src/managers/optimization-manager.ts +262 -0
- package/src/managers/outline-manager.ts +196 -0
- package/src/managers/page-manager.ts +289 -0
- package/src/managers/pattern-detection.ts +440 -0
- package/src/managers/rendering-manager.ts +863 -0
- package/src/managers/search-manager.ts +385 -0
- package/src/managers/security-manager.ts +345 -0
- package/src/managers/signature-manager.ts +1664 -0
- package/src/managers/streams.ts +618 -0
- package/src/managers/xfa-manager.ts +500 -0
- package/src/pdf-creator-manager.ts +494 -0
- package/src/properties.ts +522 -0
- package/src/result-accessors-manager.ts +867 -0
- package/src/tests/advanced-features.test.ts +414 -0
- package/src/tests/advanced.test.ts +266 -0
- package/src/tests/extended-managers.test.ts +316 -0
- package/src/tests/final-utilities.test.ts +455 -0
- package/src/tests/foundation.test.ts +315 -0
- package/src/tests/high-demand.test.ts +257 -0
- package/src/tests/specialized.test.ts +97 -0
- package/src/thumbnail-manager.ts +272 -0
- package/src/types/common.ts +142 -0
- package/src/types/document-types.ts +457 -0
- package/src/types/index.ts +6 -0
- package/src/types/manager-types.ts +284 -0
- package/src/types/native-bindings.ts +517 -0
- package/src/workers/index.ts +7 -0
- package/src/workers/pool.ts +274 -0
- package/src/workers/worker.ts +131 -0
|
@@ -0,0 +1,618 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Stream API support for PDF Oxide Node.js
|
|
3
|
+
*
|
|
4
|
+
* Provides Readable streams for search results, text extraction, and page metadata.
|
|
5
|
+
* Supports backpressure handling and proper Node.js stream semantics.
|
|
6
|
+
*
|
|
7
|
+
* Phase 2.4 implementation for idiomatic Node.js patterns with Stream API.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { Readable } from 'node:stream';
|
|
11
|
+
|
|
12
|
+
/**
 * Shape of each object emitted by SearchStream.
 *
 * Every field is optional because the values are copied from whatever the
 * underlying search manager returned.
 */
export interface SearchResultData {
  /** Matched text, taken from the raw result's `text` or its `getText()` method. */
  text?: string;
  /** Page the match belongs to, taken from the raw result's `pageIndex` or `page`. */
  pageIndex?: number;
  /** The raw result's `position` value (exact meaning is defined by the search manager). */
  position?: number;
  /** The raw result's `boundingBox`, passed through unchanged. */
  boundingBox?: Record<string, number>;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Shape of each object emitted by ExtractionStream (one per extracted page).
 */
export interface ExtractionProgressData {
  /** Index of the page this record was extracted from. */
  pageIndex: number;
  /** Number of pages in the requested range (`endPage - startPage`). */
  totalPages: number;
  /** Extracted content of the page; an empty string when nothing was returned. */
  extractedText: string;
  /** Output format that was used for the extraction. */
  extractionType: 'text' | 'markdown' | 'html';
  /** Fraction of the requested range completed so far, including this page. */
  progress: number;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Shape of each object emitted by MetadataStream (one per page).
 */
export interface PageMetadataData {
  /** Index of the page the metadata describes. */
  pageIndex: number;
  /** Page width as reported by the rendering manager (0 when unavailable). */
  width: number;
  /** Page height as reported by the rendering manager (0 when unavailable). */
  height: number;
  /** Number of entries in the page's embedded-font list. */
  fontCount: number;
  /** Number of entries in the page's embedded-image list. */
  imageCount: number;
  /** Page rotation as reported alongside the dimensions (0 when unavailable). */
  rotation: number;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Readable stream for search results
 *
 * The search itself runs once, synchronously, on the first read or the first
 * iteration step; the matches are then buffered and handed out one at a time
 * (one object per `_read()` call in stream mode). When `options.pageIndex` is
 * set only that page is searched, otherwise the entire document is searched.
 *
 * Supports both the traditional stream API (.on('data')) and async iteration
 * (for await...of). Both styles share a single cursor over the buffered
 * results, so use only one of them per instance.
 *
 * Error delivery when the search manager throws:
 * - stream mode: the stream is destroyed and emits `'error'`;
 * - async iteration: the `for await` loop rejects with the error. Existing
 *   `'error'` listeners are still notified, but the event can no longer turn
 *   into an uncaught exception when nobody listens for it.
 *
 * @example
 * ```typescript
 * // Traditional stream API
 * const stream = new SearchStream(searchManager, 'keyword');
 * stream.on('data', (result) => {
 *   console.log(`Found on page ${result.pageIndex}: ${result.text}`);
 * });
 *
 * // Async iteration
 * const stream = new SearchStream(searchManager, 'keyword');
 * for await (const result of stream) {
 *   console.log(`Found on page ${result.pageIndex}: ${result.text}`);
 * }
 * ```
 */
export class SearchStream extends Readable {
  private searchManager: any;
  private searchTerm: string;
  // Raw options object as supplied by the caller (individual flags are copied below).
  private options: Record<string, any>;
  private pageIndex: number | undefined;
  private caseSensitive: boolean;
  private wholeWords: boolean;
  private maxResults: number;
  // Buffered matches; null until the search has run successfully.
  private _results: any[] | null;
  // Index of the next buffered match to hand out.
  private _currentIndex: number;
  // Initialised to 0 and not otherwise used by this class.
  private _resultCount: number;
  private _initialized: boolean;
  // True once end-of-stream has been signalled (push(null) or destroy()).
  private _ended: boolean;
  // Error thrown by the search manager during initialisation, if any.
  private _initError: Error | null;

  /**
   * Creates a new SearchStream
   * @param searchManager - The search manager instance; must provide
   *   `search(term, pageIndex, flags)` and `searchAll(term, flags)`
   * @param searchTerm - Text to search for
   * @param options - Search options: `pageIndex`, `caseSensitive` (default false),
   *   `wholeWords` (default false), `maxResults` (default unlimited)
   * @throws Error if the manager is missing or the term is not a non-empty string
   */
  constructor(searchManager: any, searchTerm: string, options: Record<string, any> = {}) {
    super({ objectMode: true });

    if (!searchManager) {
      throw new Error('SearchManager is required');
    }
    if (!searchTerm || typeof searchTerm !== 'string') {
      throw new Error('Search term must be a non-empty string');
    }

    this.searchManager = searchManager;
    this.searchTerm = searchTerm;
    this.options = options;
    this.pageIndex = options.pageIndex;
    this.caseSensitive = options.caseSensitive ?? false;
    this.wholeWords = options.wholeWords ?? false;
    this.maxResults = options.maxResults ?? Infinity;

    this._results = null;
    this._currentIndex = 0;
    this._resultCount = 0;
    this._initialized = false;
    this._ended = false;
    this._initError = null;
  }

  /**
   * Run the search once and buffer its results (lazy initialization).
   *
   * A failure is recorded in `_initError` instead of being reported here, so
   * that each consumption style can deliver it in its own way.
   * @private
   */
  private _initialize(): void {
    if (this._initialized) return;
    this._initialized = true;

    try {
      const flags = { caseSensitive: this.caseSensitive, wholeWords: this.wholeWords };
      const found =
        this.pageIndex !== undefined
          ? this.searchManager.search(this.searchTerm, this.pageIndex, flags)
          : this.searchManager.searchAll(this.searchTerm, flags);

      let results = (found || []) as any[];

      // Apply max results limit
      if (results.length > this.maxResults) {
        results = results.slice(0, this.maxResults);
      }
      this._results = results;
    } catch (error) {
      this._initError = error instanceof Error ? error : new Error(String(error));
    }
  }

  /**
   * Take the next buffered match and convert it to the emitted shape.
   * @returns The formatted match, or null when no buffered match is left
   * @private
   */
  private _takeNext(): SearchResultData | null {
    if (!this._results || this._currentIndex >= this._results.length) {
      return null;
    }

    const result = this._results[this._currentIndex];
    this._currentIndex++;

    return {
      text: result.text || result.getText?.(),
      // `??` rather than `||`: a page index of 0 is a real value and must not
      // be replaced by the `page` fallback.
      pageIndex: result.pageIndex ?? result.page ?? 0,
      position: result.position ?? 0,
      boundingBox: result.boundingBox,
    };
  }

  /**
   * Implement _read() for readable stream: pushes one result per call and
   * signals end-of-stream once the buffer is exhausted.
   * @private
   */
  _read(): void {
    this._initialize();

    if (this._initError) {
      // Report the failure and stop; pushing EOF on a destroyed stream is pointless.
      this.destroy(this._initError);
      return;
    }

    const data = this._takeNext();
    if (data === null) {
      if (!this._ended) {
        this._ended = true;
        this.push(null);
      }
      return;
    }

    this.push(data);
  }

  /**
   * Implement async iteration protocol for `for await...of` support
   * @returns AsyncIterator for iterating over search results
   * @throws The error raised by the search manager, if the search failed
   */
  async *[Symbol.asyncIterator](): AsyncGenerator<SearchResultData, void, unknown> {
    this._initialize();

    if (this._initError) {
      const failure = this._initError;
      this._ended = true;
      // Guarantee at least one listener so the 'error' event emitted by
      // destroy() cannot become an uncaught exception; the consumer receives
      // the failure through the rejection below.
      this.once('error', () => {});
      this.destroy(failure);
      throw failure;
    }

    // Yield results one by one
    for (let data = this._takeNext(); data !== null; data = this._takeNext()) {
      yield data;
    }

    if (!this._ended) {
      this._ended = true;
      this.destroy();
    }
  }
}
|
|
213
|
+
|
|
214
|
+
/**
 * Readable stream for text extraction with progress tracking
 *
 * Extracts the pages `startPage` (inclusive) to `endPage` (exclusive) one at
 * a time and emits an ExtractionProgressData record per page, including the
 * fraction of the range completed so far. A page is only extracted when the
 * consumer asks for it (one page per `_read()` call or iteration step).
 * Supports multiple extraction formats: text, markdown, html.
 *
 * Supports both the traditional stream API and async iteration. Both styles
 * share a single page cursor, so use only one of them per instance.
 *
 * Error delivery when the extraction manager throws:
 * - stream mode: the stream is destroyed and emits `'error'`;
 * - async iteration: the `for await` loop rejects with the error. Existing
 *   `'error'` listeners are still notified, but the event can no longer turn
 *   into an uncaught exception when nobody listens for it.
 *
 * @example
 * ```typescript
 * // Traditional stream API
 * const stream = new ExtractionStream(extractionManager, 0, 10, 'markdown');
 * stream.on('data', (progress) => {
 *   console.log(`Progress: ${Math.round(progress.progress * 100)}%`);
 *   console.log(`Page ${progress.pageIndex + 1}: ${progress.extractedText.length} chars`);
 * });
 *
 * // Async iteration
 * const stream = new ExtractionStream(extractionManager, 0, 10, 'markdown');
 * for await (const progress of stream) {
 *   console.log(`Progress: ${Math.round(progress.progress * 100)}%`);
 * }
 * ```
 */
export class ExtractionStream extends Readable {
  private extractionManager: any;
  private startPage: number;
  private endPage: number;
  private extractionType: 'text' | 'markdown' | 'html';
  // Passed through unchanged as the second argument of every extract call.
  private options: Record<string, any>;
  // Index of the next page to extract.
  private _currentPage: number;
  // Size of the requested range (endPage - startPage).
  private _totalPages: number;
  // True once end-of-stream has been signalled (push(null) or destroy()).
  private _ended: boolean;

  /**
   * Creates a new ExtractionStream
   * @param extractionManager - The extraction manager instance; must provide
   *   `extractText`, `extractMarkdown` and `extractHtml`, each called as
   *   `(pageIndex, options)`
   * @param startPage - Starting page index (inclusive); a non-negative integer
   * @param endPage - Ending page index (exclusive); an integer greater than startPage
   * @param extractionType - 'text', 'markdown', or 'html'
   * @param options - Additional extraction options
   * @throws Error if parameters are invalid
   */
  constructor(
    extractionManager: any,
    startPage: number,
    endPage: number,
    extractionType: 'text' | 'markdown' | 'html' = 'text',
    options: Record<string, any> = {}
  ) {
    super({ objectMode: true });

    if (!extractionManager) {
      throw new Error('ExtractionManager is required');
    }
    // Number.isInteger also rejects NaN and non-numbers. A NaN bound would
    // slip past plain `<` / `<=` comparisons and make _read() run forever.
    if (!Number.isInteger(startPage) || startPage < 0) {
      throw new Error('Start page must be a non-negative number');
    }
    if (!Number.isInteger(endPage) || endPage <= startPage) {
      throw new Error('End page must be greater than start page');
    }
    if (!['text', 'markdown', 'html'].includes(extractionType)) {
      throw new Error("Extraction type must be 'text', 'markdown', or 'html'");
    }

    this.extractionManager = extractionManager;
    this.startPage = startPage;
    this.endPage = endPage;
    this.extractionType = extractionType;
    this.options = options;

    this._currentPage = startPage;
    this._totalPages = endPage - startPage;
    this._ended = false;
  }

  /**
   * Extract the page under the cursor and advance the cursor.
   *
   * The cursor only moves after a successful extraction, so a throwing
   * manager leaves `_currentPage` pointing at the failed page.
   * @returns Progress record for the extracted page
   * @private
   */
  private _extractNext(): ExtractionProgressData {
    const page = this._currentPage;

    let extractedText: string;
    switch (this.extractionType) {
      case 'markdown':
        extractedText = this.extractionManager.extractMarkdown(page, this.options);
        break;
      case 'html':
        extractedText = this.extractionManager.extractHtml(page, this.options);
        break;
      default:
        extractedText = this.extractionManager.extractText(page, this.options);
        break;
    }

    const progress: ExtractionProgressData = {
      pageIndex: page,
      totalPages: this._totalPages,
      extractedText: extractedText || '',
      extractionType: this.extractionType,
      // +1 because this page counts as done by the time the record is emitted.
      progress: (page - this.startPage + 1) / this._totalPages,
    };

    this._currentPage++;
    return progress;
  }

  /**
   * Implement _read() for readable stream: extracts and pushes one page per
   * call and signals end-of-stream once the range is exhausted.
   * @private
   */
  _read(): void {
    // Check if we've processed all pages
    if (this._currentPage >= this.endPage) {
      if (!this._ended) {
        this._ended = true;
        this.push(null);
      }
      return;
    }

    try {
      this.push(this._extractNext());
    } catch (error) {
      this.destroy(error as Error);
    }
  }

  /**
   * Implement async iteration protocol for `for await...of` support
   * @returns AsyncGenerator for iterating over extraction progress
   * @throws The error raised by the extraction manager, if a page fails
   */
  async *[Symbol.asyncIterator](): AsyncGenerator<ExtractionProgressData, void, unknown> {
    while (this._currentPage < this.endPage) {
      let progress: ExtractionProgressData;
      try {
        progress = this._extractNext();
      } catch (error) {
        const failure = error instanceof Error ? error : new Error(String(error));
        // Guarantee at least one listener so the 'error' event emitted by
        // destroy() cannot become an uncaught exception; the consumer
        // receives the failure through the rejection below.
        this.once('error', () => {});
        this.destroy(failure);
        throw failure;
      }

      // Kept outside the try block so that errors injected by the consumer
      // are not mistaken for extraction failures.
      yield progress;
    }

    if (!this._ended) {
      this._ended = true;
      this.destroy();
    }
  }
}
|
|
390
|
+
|
|
391
|
+
/**
 * Readable stream for page metadata retrieval
 *
 * Emits a PageMetadataData record (dimensions, rotation, font and image
 * counts) for every page from `startPage` (inclusive) to `endPage`
 * (exclusive). Metadata is loaded lazily: a page is only queried when the
 * consumer asks for it (one page per `_read()` call or iteration step).
 *
 * Supports both the traditional stream API and async iteration. Both styles
 * share a single page cursor, so use only one of them per instance.
 *
 * Error delivery when the rendering manager throws:
 * - stream mode: the stream is destroyed and emits `'error'`;
 * - async iteration: the `for await` loop rejects with the error. Existing
 *   `'error'` listeners are still notified, but the event can no longer turn
 *   into an uncaught exception when nobody listens for it.
 *
 * @example
 * ```typescript
 * // Traditional stream API
 * const stream = new MetadataStream(renderingManager, 0, 10);
 * stream.on('data', (metadata) => {
 *   console.log(`Page ${metadata.pageIndex + 1}: ${metadata.width}x${metadata.height}`);
 *   console.log(`  Fonts: ${metadata.fontCount}, Images: ${metadata.imageCount}`);
 * });
 *
 * // Async iteration
 * const stream = new MetadataStream(renderingManager, 0, 10);
 * for await (const metadata of stream) {
 *   console.log(`Page ${metadata.pageIndex + 1}: ${metadata.width}x${metadata.height}`);
 * }
 * ```
 */
export class MetadataStream extends Readable {
  private renderingManager: any;
  private startPage: number;
  private endPage: number;
  // Index of the next page to describe.
  private _currentPage: number;
  // True once end-of-stream has been signalled (push(null) or destroy()).
  private _ended: boolean;

  /**
   * Creates a new MetadataStream
   * @param renderingManager - The rendering manager instance; must provide
   *   `getPageDimensions(pageIndex)` and may provide
   *   `getEmbeddedFonts(pageIndex)` / `getEmbeddedImages(pageIndex)`
   * @param startPage - Starting page index (inclusive); a non-negative integer
   * @param endPage - Ending page index (exclusive); an integer greater than startPage
   * @throws Error if parameters are invalid
   */
  constructor(renderingManager: any, startPage: number, endPage: number) {
    super({ objectMode: true });

    if (!renderingManager) {
      throw new Error('RenderingManager is required');
    }
    // Number.isInteger also rejects NaN and non-numbers. A NaN bound would
    // slip past plain `<` / `<=` comparisons and make _read() run forever.
    if (!Number.isInteger(startPage) || startPage < 0) {
      throw new Error('Start page must be a non-negative number');
    }
    if (!Number.isInteger(endPage) || endPage <= startPage) {
      throw new Error('End page must be greater than start page');
    }

    this.renderingManager = renderingManager;
    this.startPage = startPage;
    this.endPage = endPage;

    this._currentPage = startPage;
    this._ended = false;
  }

  /**
   * Collect the metadata of the page under the cursor and advance the cursor.
   *
   * The cursor only moves after all manager calls succeeded, so a throwing
   * manager leaves `_currentPage` pointing at the failed page.
   * @returns Metadata record for the page
   * @private
   */
  private _describeNext(): PageMetadataData {
    const page = this._currentPage;

    // Get page dimensions
    const dimensions = this.renderingManager.getPageDimensions(page);

    // Get embedded resources; both accessors are optional on the manager.
    const fonts = this.renderingManager.getEmbeddedFonts?.(page) || [];
    const images = this.renderingManager.getEmbeddedImages?.(page) || [];

    const metadata: PageMetadataData = {
      pageIndex: page,
      width: dimensions?.width || 0,
      height: dimensions?.height || 0,
      fontCount: Array.isArray(fonts) ? fonts.length : 0,
      imageCount: Array.isArray(images) ? images.length : 0,
      rotation: dimensions?.rotation || 0,
    };

    this._currentPage++;
    return metadata;
  }

  /**
   * Implement _read() for readable stream: describes and pushes one page per
   * call and signals end-of-stream once the range is exhausted.
   * @private
   */
  _read(): void {
    // Check if we've processed all pages
    if (this._currentPage >= this.endPage) {
      if (!this._ended) {
        this._ended = true;
        this.push(null);
      }
      return;
    }

    try {
      this.push(this._describeNext());
    } catch (error) {
      this.destroy(error as Error);
    }
  }

  /**
   * Implement async iteration protocol for `for await...of` support
   * @returns AsyncGenerator for iterating over page metadata
   * @throws The error raised by the rendering manager, if a page fails
   */
  async *[Symbol.asyncIterator](): AsyncGenerator<PageMetadataData, void, unknown> {
    while (this._currentPage < this.endPage) {
      let metadata: PageMetadataData;
      try {
        metadata = this._describeNext();
      } catch (error) {
        const failure = error instanceof Error ? error : new Error(String(error));
        // Guarantee at least one listener so the 'error' event emitted by
        // destroy() cannot become an uncaught exception; the consumer
        // receives the failure through the rejection below.
        this.once('error', () => {});
        this.destroy(failure);
        throw failure;
      }

      // Kept outside the try block so that errors injected by the consumer
      // are not mistaken for metadata failures.
      yield metadata;
    }

    if (!this._ended) {
      this._ended = true;
      this.destroy();
    }
  }
}
|
|
534
|
+
|
|
535
|
+
/**
 * Factory for SearchStream.
 *
 * Forwards its arguments unchanged to the SearchStream constructor, so the
 * same validation applies and the same errors are thrown.
 *
 * @param searchManager - The search manager
 * @param searchTerm - Text to search for
 * @param options - Search options
 * @returns A readable stream of search results
 *
 * @example
 * ```typescript
 * createSearchStream(manager, 'error')
 *   .pipe(through2.obj((result, enc, cb) => {
 *     console.log(`Found: ${result.text}`);
 *     cb();
 *   }));
 * ```
 */
export function createSearchStream(
  searchManager: any,
  searchTerm: string,
  options: Record<string, any> = {}
): SearchStream {
  const stream = new SearchStream(searchManager, searchTerm, options);
  return stream;
}
|
|
561
|
+
|
|
562
|
+
/**
 * Factory for ExtractionStream.
 *
 * Forwards its arguments unchanged to the ExtractionStream constructor, so
 * the same validation applies and the same errors are thrown.
 *
 * @param extractionManager - The extraction manager
 * @param startPage - Starting page index
 * @param endPage - Ending page index
 * @param extractionType - Extraction format
 * @param options - Additional options
 * @returns A readable stream of extraction progress
 *
 * @example
 * ```typescript
 * createExtractionStream(manager, 0, 10, 'markdown')
 *   .pipe(through2.obj((progress, enc, cb) => {
 *     console.log(`${Math.round(progress.progress * 100)}% complete`);
 *     cb();
 *   }));
 * ```
 */
export function createExtractionStream(
  extractionManager: any,
  startPage: number,
  endPage: number,
  extractionType: 'text' | 'markdown' | 'html' = 'text',
  options: Record<string, any> = {}
): ExtractionStream {
  const stream = new ExtractionStream(
    extractionManager,
    startPage,
    endPage,
    extractionType,
    options
  );
  return stream;
}
|
|
592
|
+
|
|
593
|
+
/**
 * Factory for MetadataStream.
 *
 * Forwards its arguments unchanged to the MetadataStream constructor, so the
 * same validation applies and the same errors are thrown.
 *
 * @param renderingManager - The rendering manager
 * @param startPage - Starting page index
 * @param endPage - Ending page index
 * @returns A readable stream of page metadata
 *
 * @example
 * ```typescript
 * createMetadataStream(manager, 0, 10)
 *   .pipe(through2.obj((metadata, enc, cb) => {
 *     console.log(`Page ${metadata.pageIndex}: ${metadata.width}x${metadata.height}`);
 *     cb();
 *   }));
 * ```
 */
export function createMetadataStream(
  renderingManager: any,
  startPage: number,
  endPage: number
): MetadataStream {
  const stream = new MetadataStream(renderingManager, startPage, endPage);
  return stream;
}
|