pdf-oxide 0.3.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/README.md +218 -0
  2. package/binding.gyp +35 -0
  3. package/package.json +78 -0
  4. package/src/builders/annotation-builder.ts +367 -0
  5. package/src/builders/conversion-options-builder.ts +257 -0
  6. package/src/builders/index.ts +12 -0
  7. package/src/builders/metadata-builder.ts +317 -0
  8. package/src/builders/pdf-builder.ts +386 -0
  9. package/src/builders/search-options-builder.ts +151 -0
  10. package/src/document-editor-manager.ts +318 -0
  11. package/src/errors.ts +1629 -0
  12. package/src/form-field-manager.ts +666 -0
  13. package/src/hybrid-ml-manager.ts +283 -0
  14. package/src/index.ts +453 -0
  15. package/src/managers/accessibility-manager.ts +338 -0
  16. package/src/managers/annotation-manager.ts +439 -0
  17. package/src/managers/barcode-manager.ts +235 -0
  18. package/src/managers/batch-manager.ts +533 -0
  19. package/src/managers/cache-manager.ts +486 -0
  20. package/src/managers/compliance-manager.ts +375 -0
  21. package/src/managers/content-manager.ts +339 -0
  22. package/src/managers/document-utility-manager.ts +922 -0
  23. package/src/managers/dom-pdf-creator.ts +365 -0
  24. package/src/managers/editing-manager.ts +514 -0
  25. package/src/managers/enterprise-manager.ts +478 -0
  26. package/src/managers/extended-managers.ts +437 -0
  27. package/src/managers/extraction-manager.ts +583 -0
  28. package/src/managers/final-utilities.ts +429 -0
  29. package/src/managers/hybrid-ml-advanced.ts +479 -0
  30. package/src/managers/index.ts +239 -0
  31. package/src/managers/layer-manager.ts +500 -0
  32. package/src/managers/metadata-manager.ts +303 -0
  33. package/src/managers/ocr-manager.ts +756 -0
  34. package/src/managers/optimization-manager.ts +262 -0
  35. package/src/managers/outline-manager.ts +196 -0
  36. package/src/managers/page-manager.ts +289 -0
  37. package/src/managers/pattern-detection.ts +440 -0
  38. package/src/managers/rendering-manager.ts +863 -0
  39. package/src/managers/search-manager.ts +385 -0
  40. package/src/managers/security-manager.ts +345 -0
  41. package/src/managers/signature-manager.ts +1664 -0
  42. package/src/managers/streams.ts +618 -0
  43. package/src/managers/xfa-manager.ts +500 -0
  44. package/src/pdf-creator-manager.ts +494 -0
  45. package/src/properties.ts +522 -0
  46. package/src/result-accessors-manager.ts +867 -0
  47. package/src/tests/advanced-features.test.ts +414 -0
  48. package/src/tests/advanced.test.ts +266 -0
  49. package/src/tests/extended-managers.test.ts +316 -0
  50. package/src/tests/final-utilities.test.ts +455 -0
  51. package/src/tests/foundation.test.ts +315 -0
  52. package/src/tests/high-demand.test.ts +257 -0
  53. package/src/tests/specialized.test.ts +97 -0
  54. package/src/thumbnail-manager.ts +272 -0
  55. package/src/types/common.ts +142 -0
  56. package/src/types/document-types.ts +457 -0
  57. package/src/types/index.ts +6 -0
  58. package/src/types/manager-types.ts +284 -0
  59. package/src/types/native-bindings.ts +517 -0
  60. package/src/workers/index.ts +7 -0
  61. package/src/workers/pool.ts +274 -0
  62. package/src/workers/worker.ts +131 -0
@@ -0,0 +1,289 @@
1
+ /**
2
+ * Manager for PDF page-level operations
3
+ *
4
+ * Provides methods to query page count, dimensions, and validate indices.
5
+ *
6
+ * @example
7
+ * ```typescript
8
+ * import { PageManager, PdfDocument } from 'pdf-oxide';
9
+ *
10
+ * const doc = PdfDocument.open('document.pdf');
11
+ * const pageManager = new PageManager(doc);
12
+ *
13
+ * console.log(`Document has ${pageManager.getPageCount()} pages`);
14
+ *
15
+ * if (pageManager.isValidPageIndex(5)) {
16
+ * const info = pageManager.getPageInfo(5);
17
+ * console.log(`Page 5: ${info.width} x ${info.height} points`);
18
+ * }
19
+ * ```
20
+ */
21
+
22
/**
 * Dimensions of a single page as reported by {@link PageManager}.
 */
export interface PageInfo {
  /** Zero-based index of the page within the document. */
  index: number;
  /** Page width; `0` when the width could not be read. */
  width: number;
  /** Page height; `0` when the height could not be read. */
  height: number;
}
27
+
28
/**
 * Inclusive range of valid zero-based page indices.
 *
 * An empty document is represented as `{ firstPage: 0, lastPage: -1 }`,
 * i.e. `lastPage < firstPage`.
 */
export interface PageRange {
  /** Index of the first page (always `0`). */
  firstPage: number;
  /** Index of the last page, or `-1` when the document has no pages. */
  lastPage: number;
}
32
+
33
/**
 * Aggregate width/height figures across every page of a document.
 *
 * All numeric fields are `0` and `hasVariableSizes` is `false` when the
 * document has no pages.
 */
export interface PageStatistics {
  /** Number of pages the statistics were computed over. */
  count: number;
  /** Smallest page width. */
  minWidth: number;
  /** Largest page width. */
  maxWidth: number;
  /** Smallest page height. */
  minHeight: number;
  /** Largest page height. */
  maxHeight: number;
  /** Arithmetic mean of all page widths. */
  averageWidth: number;
  /** Arithmetic mean of all page heights. */
  averageHeight: number;
  /** `true` when at least two pages differ in width or in height. */
  hasVariableSizes: boolean;
}
43
+
44
/**
 * Read-only helper for page-level queries on an open PDF document.
 *
 * Page count, per-page info and the full page list are memoized in an
 * internal cache. Call {@link PageManager.clearCache} after the underlying
 * document changes so that stale values are not returned.
 */
export class PageManager {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- native document binding is untyped here
  private _document: any;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- holds numbers, PageInfo and PageInfo[]
  private _cache: Map<string, any>;

  /**
   * Creates a new PageManager for the given document.
   *
   * @param document - The PDF document to query
   * @throws Error if `document` is null, undefined or otherwise falsy
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  constructor(document: any) {
    if (!document) {
      throw new Error('Document is required');
    }
    this._document = document;
    this._cache = new Map();
  }

  /**
   * Drops every memoized value (page count, page info, page list).
   */
  clearCache(): void {
    this._cache.clear();
  }

  /**
   * Gets the total number of pages in the document.
   *
   * `pageCount` is accepted both as a plain property and as a synchronous
   * method on the document. Anything that does not resolve to a positive
   * finite number (including a Promise) is reported as `0`.
   *
   * @returns Number of pages, or `0` when the count cannot be read
   */
  getPageCount(): number {
    const cacheKey = 'page:count';
    const cached: unknown = this._cache.get(cacheKey);
    if (typeof cached === 'number') {
      return cached;
    }

    try {
      const raw: unknown = this._document.pageCount;
      // Call through the document so a method-style accessor keeps its `this`.
      const value: unknown =
        typeof raw === 'function' ? raw.call(this._document) : raw;
      const numeric = Number(value);
      const count =
        Number.isFinite(numeric) && numeric > 0 ? Math.floor(numeric) : 0;
      this._cache.set(cacheKey, count);
      return count;
    } catch {
      // Best effort: an unreadable count is treated as an empty document and
      // is deliberately not cached, so a later call can retry.
      return 0;
    }
  }

  /**
   * Checks whether a page index addresses an existing page.
   *
   * @param pageIndex - Page index to validate (0-based)
   * @returns True only for integers in `[0, getPageCount())`
   */
  isValidPageIndex(pageIndex: number): boolean {
    return (
      Number.isInteger(pageIndex) &&
      pageIndex >= 0 &&
      pageIndex < this.getPageCount()
    );
  }

  /**
   * Gets information about a specific page.
   *
   * If the document fails to provide the page, an info object with zero
   * width and height is returned and nothing is cached for that page.
   *
   * @param pageIndex - Page index (0-based)
   * @returns PageInfo object with page dimensions
   * @throws Error if the page index is invalid
   *
   * @example
   * ```typescript
   * const info = manager.getPageInfo(0);
   * console.log(`Page 0: ${info.width} x ${info.height} points`);
   * ```
   */
  getPageInfo(pageIndex: number): PageInfo {
    const cacheKey = `page:info:${pageIndex}`;
    if (this._cache.has(cacheKey)) {
      return this._cache.get(cacheKey);
    }

    if (!this.isValidPageIndex(pageIndex)) {
      throw new Error(`Invalid page index: ${pageIndex}`);
    }

    try {
      const page = this._document.getPage(pageIndex);
      const info: PageInfo = {
        index: pageIndex,
        // `||` on purpose: missing, zero and NaN dimensions all collapse to 0.
        width: page?.width || 0,
        height: page?.height || 0,
      };
      this._cache.set(cacheKey, info);
      return info;
    } catch {
      return { index: pageIndex, width: 0, height: 0 };
    }
  }

  /**
   * Gets information about all pages in the document.
   *
   * The list is memoized only when every page was read successfully, so
   * placeholder entries from a failed lookup are retried on the next call.
   * The returned array may be the cached instance; copy it before mutating.
   *
   * @returns Array of PageInfo objects, ordered by page index
   *
   * @example
   * ```typescript
   * const pages = manager.getAllPageInfo();
   * pages.forEach(page => {
   *   console.log(`Page ${page.index}: ${page.width} x ${page.height}`);
   * });
   * ```
   */
  getAllPageInfo(): PageInfo[] {
    const cacheKey = 'page:info:all';
    if (this._cache.has(cacheKey)) {
      return this._cache.get(cacheKey);
    }

    const count = this.getPageCount();
    const pages: PageInfo[] = [];
    let complete = true;
    for (let i = 0; i < count; i++) {
      pages.push(this.getPageInfo(i));
      // getPageInfo caches only successful lookups.
      if (!this._cache.has(`page:info:${i}`)) {
        complete = false;
      }
    }

    if (complete) {
      this._cache.set(cacheKey, pages);
    }
    return pages;
  }

  /**
   * Checks if the document has no pages.
   *
   * @returns True if the document has no pages
   */
  isEmpty(): boolean {
    return this.getPageCount() === 0;
  }

  /**
   * Checks if the document has more than one page.
   *
   * @returns True if the document has multiple pages
   */
  hasMultiplePages(): boolean {
    return this.getPageCount() > 1;
  }

  /**
   * Gets the valid (inclusive) page index range.
   *
   * @returns Object with firstPage and lastPage indices; an empty document
   *          yields `{ firstPage: 0, lastPage: -1 }`
   *
   * @example
   * ```typescript
   * const range = manager.getPageRange();
   * console.log(`Page range: ${range.firstPage} to ${range.lastPage}`);
   * ```
   */
  getPageRange(): PageRange {
    // count - 1 is -1 for an empty document, which is the documented sentinel.
    return { firstPage: 0, lastPage: this.getPageCount() - 1 };
  }

  /**
   * Gets page dimension statistics.
   *
   * Computed in a single pass so that very large documents do not hit the
   * engine's argument-count limit (as `Math.min(...values)` would).
   *
   * @returns Statistics about page dimensions; all zeros for an empty document
   *
   * @example
   * ```typescript
   * const stats = manager.getPageStatistics();
   * console.log(`Average width: ${stats.averageWidth}`);
   * console.log(`Pages vary in size: ${stats.hasVariableSizes}`);
   * ```
   */
  getPageStatistics(): PageStatistics {
    const pages = this.getAllPageInfo();

    if (pages.length === 0) {
      return {
        count: 0,
        minWidth: 0,
        maxWidth: 0,
        minHeight: 0,
        maxHeight: 0,
        averageWidth: 0,
        averageHeight: 0,
        hasVariableSizes: false,
      };
    }

    let minWidth = Infinity;
    let maxWidth = -Infinity;
    let minHeight = Infinity;
    let maxHeight = -Infinity;
    let totalWidth = 0;
    let totalHeight = 0;

    for (const { width, height } of pages) {
      minWidth = Math.min(minWidth, width);
      maxWidth = Math.max(maxWidth, width);
      minHeight = Math.min(minHeight, height);
      maxHeight = Math.max(maxHeight, height);
      totalWidth += width;
      totalHeight += height;
    }

    return {
      count: pages.length,
      minWidth,
      maxWidth,
      minHeight,
      maxHeight,
      averageWidth: totalWidth / pages.length,
      averageHeight: totalHeight / pages.length,
      hasVariableSizes: minWidth !== maxWidth || minHeight !== maxHeight,
    };
  }

  /**
   * Gets pages whose width and height both fall within the given bounds.
   * All bounds are inclusive.
   *
   * @param minWidth - Minimum width
   * @param maxWidth - Maximum width
   * @param minHeight - Minimum height
   * @param maxHeight - Maximum height
   * @returns Matching PageInfo objects
   */
  getPagesInSizeRange(
    minWidth: number,
    maxWidth: number,
    minHeight: number,
    maxHeight: number
  ): PageInfo[] {
    return this.getAllPageInfo().filter(
      (p) =>
        p.width >= minWidth &&
        p.width <= maxWidth &&
        p.height >= minHeight &&
        p.height <= maxHeight
    );
  }

  /**
   * Gets landscape pages (width strictly greater than height).
   * Square pages are neither landscape nor portrait.
   *
   * @returns Array of landscape PageInfo objects
   */
  getLandscapePages(): PageInfo[] {
    return this.getAllPageInfo().filter((p) => p.width > p.height);
  }

  /**
   * Gets portrait pages (height strictly greater than width).
   * Square pages are neither landscape nor portrait.
   *
   * @returns Array of portrait PageInfo objects
   */
  getPortraitPages(): PageInfo[] {
    return this.getAllPageInfo().filter((p) => p.height > p.width);
  }
}
@@ -0,0 +1,440 @@
1
+ /**
2
+ * Pattern Detection Manager - TypeScript/Node.js Implementation
3
+ *
4
+ * Provides heuristic, text-based pattern detection for PDF analysis:
5
+ * - Table detection and extraction
6
+ * - Column detection and analysis
7
+ * - Barcode detection and decoding
8
+ * - Form field detection
9
+ * - Layout pattern recognition
10
+ */
11
+
12
+ import type { PdfDocument } from "../types/document-types.js";
13
+
14
+ /**
15
+ * Represents a detected table region on a page.
16
+ */
17
+ export interface TableRegion {
18
+ readonly x: number;
19
+ readonly y: number;
20
+ readonly width: number;
21
+ readonly height: number;
22
+ readonly rowCount?: number;
23
+ readonly columnCount?: number;
24
+ readonly confidence?: number;
25
+ }
26
+
27
+ /**
28
+ * Represents a detected column region on a page.
29
+ */
30
+ export interface ColumnRegion {
31
+ readonly x: number;
32
+ readonly y: number;
33
+ readonly width: number;
34
+ readonly height: number;
35
+ readonly columnIndex?: number;
36
+ readonly confidence?: number;
37
+ }
38
+
39
+ /**
40
+ * Represents a detected barcode.
41
+ */
42
+ export interface BarcodeRegion {
43
+ readonly x: number;
44
+ readonly y: number;
45
+ readonly width: number;
46
+ readonly height: number;
47
+ readonly format: string;
48
+ readonly value: string;
49
+ readonly confidence?: number;
50
+ }
51
+
52
+ /**
53
+ * Represents a detected form field.
54
+ */
55
+ export interface FormFieldRegion {
56
+ readonly x: number;
57
+ readonly y: number;
58
+ readonly width: number;
59
+ readonly height: number;
60
+ readonly fieldType: string;
61
+ readonly fieldName?: string;
62
+ readonly confidence?: number;
63
+ }
64
+
65
+ /**
66
+ * Layout pattern type enumeration.
67
+ */
68
+ export enum LayoutPatternType {
69
+ SINGLE_COLUMN = "single_column",
70
+ MULTI_COLUMN = "multi_column",
71
+ TABLE_BASED = "table_based",
72
+ FORM_BASED = "form_based",
73
+ MAGAZINE_STYLE = "magazine_style",
74
+ COMPLEX_MIXED = "complex_mixed",
75
+ }
76
+
77
+ /**
78
+ * Detected layout pattern.
79
+ */
80
+ export interface LayoutPattern {
81
+ readonly pageIndex: number;
82
+ readonly patternType: LayoutPatternType;
83
+ readonly confidence: number;
84
+ readonly regions: Array<TableRegion | ColumnRegion>;
85
+ }
86
+
87
+ /**
88
+ * Pattern Detection Manager for TypeScript/Node.js
89
+ *
90
+ * Provides detection and analysis of common patterns in PDF documents.
91
+ */
92
+ export class PatternDetectionManager {
93
+ private readonly document: PdfDocument;
94
+ private readonly cache: Map<string, unknown> = new Map();
95
+
96
+ /**
97
+ * Create a new PatternDetectionManager.
98
+ */
99
+ constructor(document: PdfDocument) {
100
+ this.document = document;
101
+ }
102
+
103
+ /**
104
+ * Detect tables on a specific page.
105
+ *
106
+ * @param pageIndex - Index of the page to analyze
107
+ * @returns Array of detected table regions
108
+ */
109
+ async detectTables(pageIndex: number): Promise<TableRegion[]> {
110
+ const cacheKey = `tables:${pageIndex}`;
111
+ const cached = this.cache.get(cacheKey);
112
+ if (cached) {
113
+ return cached as TableRegion[];
114
+ }
115
+
116
+ // Extract text to analyze
117
+ const text = await this.document.extractText(pageIndex);
118
+ if (!text) {
119
+ return [];
120
+ }
121
+
122
+ // Simple heuristic: detect table-like patterns
123
+ const tables: TableRegion[] = [];
124
+ const lines = text.split("\n");
125
+
126
+ // Look for lines with multiple columns (tabs or spaces)
127
+ let currentTableStart = -1;
128
+ let tableLines = 0;
129
+
130
+ for (let i = 0; i < lines.length; i++) {
131
+ const line = lines[i];
132
+ const columnCount = (line?.match(/\t/g) || []).length + 1;
133
+
134
+ if (columnCount >= 2) {
135
+ if (currentTableStart === -1) {
136
+ currentTableStart = i;
137
+ }
138
+ tableLines++;
139
+ } else if (currentTableStart !== -1 && tableLines > 1) {
140
+ // End of table detected
141
+ tables.push({
142
+ x: 50,
143
+ y: 100 + currentTableStart * 15,
144
+ width: 500,
145
+ height: tableLines * 15,
146
+ rowCount: tableLines,
147
+ columnCount: (lines[currentTableStart]?.match(/\t/g) || []).length + 1,
148
+ confidence: 0.7,
149
+ });
150
+ currentTableStart = -1;
151
+ tableLines = 0;
152
+ }
153
+ }
154
+
155
+ this.cache.set(cacheKey, tables);
156
+ return tables;
157
+ }
158
+
159
+ /**
160
+ * Detect columns on a specific page.
161
+ *
162
+ * @param pageIndex - Index of the page to analyze
163
+ * @returns Array of detected column regions
164
+ */
165
+ async detectColumns(pageIndex: number): Promise<ColumnRegion[]> {
166
+ const cacheKey = `columns:${pageIndex}`;
167
+ const cached = this.cache.get(cacheKey);
168
+ if (cached) {
169
+ return cached as ColumnRegion[];
170
+ }
171
+
172
+ const text = await this.document.extractText(pageIndex);
173
+ if (!text) {
174
+ return [];
175
+ }
176
+
177
+ // Simple heuristic: detect multi-column layouts
178
+ const columns: ColumnRegion[] = [];
179
+ const lines = text.split("\n");
180
+
181
+ // Check for indentation patterns suggesting columns
182
+ const indentationPattern = new Map<number, number>();
183
+
184
+ for (const line of lines) {
185
+ if (line.length > 0) {
186
+ const indent = line.search(/\S/);
187
+ if (indent >= 0) {
188
+ indentationPattern.set(indent, (indentationPattern.get(indent) || 0) + 1);
189
+ }
190
+ }
191
+ }
192
+
193
+ // If multiple indentation levels, likely multi-column
194
+ if (indentationPattern.size > 1) {
195
+ const indents = Array.from(indentationPattern.entries())
196
+ .sort((a, b) => b[1] - a[1])
197
+ .slice(0, 2)
198
+ .map(([indent]) => indent);
199
+
200
+ for (let i = 0; i < indents.length; i++) {
201
+ columns.push({
202
+ x: (indents[i] ?? 0) * 8,
203
+ y: 50,
204
+ width: 250,
205
+ height: 700,
206
+ columnIndex: i,
207
+ confidence: 0.6,
208
+ });
209
+ }
210
+ } else {
211
+ // Single column
212
+ columns.push({
213
+ x: 50,
214
+ y: 50,
215
+ width: 500,
216
+ height: 700,
217
+ columnIndex: 0,
218
+ confidence: 0.95,
219
+ });
220
+ }
221
+
222
+ this.cache.set(cacheKey, columns);
223
+ return columns;
224
+ }
225
+
226
+ /**
227
+ * Detect barcodes on a specific page.
228
+ *
229
+ * @param pageIndex - Index of the page to analyze
230
+ * @returns Array of detected barcodes
231
+ */
232
+ async detectBarcodes(pageIndex: number): Promise<BarcodeRegion[]> {
233
+ const cacheKey = `barcodes:${pageIndex}`;
234
+ const cached = this.cache.get(cacheKey);
235
+ if (cached) {
236
+ return cached as BarcodeRegion[];
237
+ }
238
+
239
+ // This would typically involve barcode detection via FFI
240
+ // For now, return empty as this requires image processing
241
+ const barcodes: BarcodeRegion[] = [];
242
+
243
+ this.cache.set(cacheKey, barcodes);
244
+ return barcodes;
245
+ }
246
+
247
+ /**
248
+ * Detect form fields on a specific page.
249
+ *
250
+ * @param pageIndex - Index of the page to analyze
251
+ * @returns Array of detected form fields
252
+ */
253
+ async detectFormFields(pageIndex: number): Promise<FormFieldRegion[]> {
254
+ const cacheKey = `form_fields:${pageIndex}`;
255
+ const cached = this.cache.get(cacheKey);
256
+ if (cached) {
257
+ return cached as FormFieldRegion[];
258
+ }
259
+
260
+ // Try to extract form fields if available
261
+ let formFields: FormFieldRegion[] = [];
262
+
263
+ try {
264
+ const fields = await this.document.extractFormFields();
265
+ formFields = (fields as any[])
266
+ .filter((f: any) => (f as any).pageIndex === pageIndex)
267
+ .map((f: any) => ({
268
+ x: (f as any).x || 0,
269
+ y: (f as any).y || 0,
270
+ width: (f as any).width || 100,
271
+ height: (f as any).height || 20,
272
+ fieldType: (f as any).type || "unknown",
273
+ fieldName: (f as any).name,
274
+ confidence: 0.9,
275
+ }));
276
+ } catch {
277
+ // If extraction fails, return empty array
278
+ }
279
+
280
+ this.cache.set(cacheKey, formFields);
281
+ return formFields;
282
+ }
283
+
284
+ /**
285
+ * Analyze layout pattern of a page.
286
+ *
287
+ * @param pageIndex - Index of the page to analyze
288
+ * @returns Detected layout pattern
289
+ */
290
+ async analyzeLayoutPattern(pageIndex: number): Promise<LayoutPattern> {
291
+ const cacheKey = `layout_pattern:${pageIndex}`;
292
+ const cached = this.cache.get(cacheKey);
293
+ if (cached) {
294
+ return cached as LayoutPattern;
295
+ }
296
+
297
+ // Detect tables and columns
298
+ const tables = await this.detectTables(pageIndex);
299
+ const columns = await this.detectColumns(pageIndex);
300
+
301
+ // Determine pattern type
302
+ let patternType = LayoutPatternType.SINGLE_COLUMN;
303
+ let confidence = 0.5;
304
+
305
+ if (tables.length > 0) {
306
+ patternType = LayoutPatternType.TABLE_BASED;
307
+ confidence = 0.85;
308
+ } else if (columns.length > 1) {
309
+ patternType = LayoutPatternType.MULTI_COLUMN;
310
+ confidence = 0.75;
311
+ }
312
+
313
+ const pattern: LayoutPattern = {
314
+ pageIndex,
315
+ patternType,
316
+ confidence,
317
+ regions: [...tables, ...columns],
318
+ };
319
+
320
+ this.cache.set(cacheKey, pattern);
321
+ return pattern;
322
+ }
323
+
324
+ /**
325
+ * Detect all patterns on a specific page.
326
+ *
327
+ * @param pageIndex - Index of the page to analyze
328
+ * @returns Object with all detected patterns
329
+ */
330
+ async detectAllPatterns(
331
+ pageIndex: number,
332
+ ): Promise<{
333
+ tables: TableRegion[];
334
+ columns: ColumnRegion[];
335
+ barcodes: BarcodeRegion[];
336
+ formFields: FormFieldRegion[];
337
+ layout: LayoutPattern;
338
+ }> {
339
+ const [tables, columns, barcodes, formFields, layout] = await Promise.all([
340
+ this.detectTables(pageIndex),
341
+ this.detectColumns(pageIndex),
342
+ this.detectBarcodes(pageIndex),
343
+ this.detectFormFields(pageIndex),
344
+ this.analyzeLayoutPattern(pageIndex),
345
+ ]);
346
+
347
+ return {
348
+ tables,
349
+ columns,
350
+ barcodes,
351
+ formFields,
352
+ layout,
353
+ };
354
+ }
355
+
356
+ /**
357
+ * Analyze patterns across entire document.
358
+ *
359
+ * @returns Array of layout patterns for each page
360
+ */
361
+ async analyzeDocumentPatterns(): Promise<LayoutPattern[]> {
362
+ const pageCount = await this.document.pageCount();
363
+ const patterns: LayoutPattern[] = [];
364
+
365
+ for (let i = 0; i < pageCount; i++) {
366
+ try {
367
+ const pattern = await this.analyzeLayoutPattern(i);
368
+ patterns.push(pattern);
369
+ } catch {
370
+ // Skip on error
371
+ }
372
+ }
373
+
374
+ return patterns;
375
+ }
376
+
377
+ /**
378
+ * Find pages with specific pattern type.
379
+ *
380
+ * @param patternType - Pattern type to find
381
+ * @returns Array of page indices with the specified pattern
382
+ */
383
+ async findPagesWithPattern(patternType: LayoutPatternType): Promise<number[]> {
384
+ const patterns = await this.analyzeDocumentPatterns();
385
+ return patterns
386
+ .filter((p) => p.patternType === patternType)
387
+ .map((p) => p.pageIndex);
388
+ }
389
+
390
+ /**
391
+ * Get pattern statistics for the document.
392
+ *
393
+ * @returns Statistics about detected patterns
394
+ */
395
+ async getPatternStatistics(): Promise<{
396
+ totalPages: number;
397
+ pagesWithTables: number;
398
+ pagesWithColumns: number;
399
+ avgTablesPerPage: number;
400
+ avgColumnsPerPage: number;
401
+ }> {
402
+ const pageCount = await this.document.pageCount();
403
+ let totalTables = 0;
404
+ let totalColumns = 0;
405
+ let pagesWithTables = 0;
406
+ let pagesWithColumns = 0;
407
+
408
+ for (let i = 0; i < pageCount; i++) {
409
+ const tables = await this.detectTables(i);
410
+ const columns = await this.detectColumns(i);
411
+
412
+ if (tables.length > 0) {
413
+ pagesWithTables++;
414
+ totalTables += tables.length;
415
+ }
416
+
417
+ if (columns.length > 1) {
418
+ pagesWithColumns++;
419
+ totalColumns += columns.length;
420
+ }
421
+ }
422
+
423
+ return {
424
+ totalPages: pageCount,
425
+ pagesWithTables,
426
+ pagesWithColumns,
427
+ avgTablesPerPage: pagesWithTables > 0 ? totalTables / pagesWithTables : 0,
428
+ avgColumnsPerPage: pagesWithColumns > 0 ? totalColumns / pagesWithColumns : 0,
429
+ };
430
+ }
431
+
432
+ /**
433
+ * Clear the internal cache.
434
+ */
435
+ clearCache(): void {
436
+ this.cache.clear();
437
+ }
438
+ }
439
+
440
+ export default PatternDetectionManager;