pdf-oxide 0.3.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -0
- package/binding.gyp +35 -0
- package/package.json +78 -0
- package/src/builders/annotation-builder.ts +367 -0
- package/src/builders/conversion-options-builder.ts +257 -0
- package/src/builders/index.ts +12 -0
- package/src/builders/metadata-builder.ts +317 -0
- package/src/builders/pdf-builder.ts +386 -0
- package/src/builders/search-options-builder.ts +151 -0
- package/src/document-editor-manager.ts +318 -0
- package/src/errors.ts +1629 -0
- package/src/form-field-manager.ts +666 -0
- package/src/hybrid-ml-manager.ts +283 -0
- package/src/index.ts +453 -0
- package/src/managers/accessibility-manager.ts +338 -0
- package/src/managers/annotation-manager.ts +439 -0
- package/src/managers/barcode-manager.ts +235 -0
- package/src/managers/batch-manager.ts +533 -0
- package/src/managers/cache-manager.ts +486 -0
- package/src/managers/compliance-manager.ts +375 -0
- package/src/managers/content-manager.ts +339 -0
- package/src/managers/document-utility-manager.ts +922 -0
- package/src/managers/dom-pdf-creator.ts +365 -0
- package/src/managers/editing-manager.ts +514 -0
- package/src/managers/enterprise-manager.ts +478 -0
- package/src/managers/extended-managers.ts +437 -0
- package/src/managers/extraction-manager.ts +583 -0
- package/src/managers/final-utilities.ts +429 -0
- package/src/managers/hybrid-ml-advanced.ts +479 -0
- package/src/managers/index.ts +239 -0
- package/src/managers/layer-manager.ts +500 -0
- package/src/managers/metadata-manager.ts +303 -0
- package/src/managers/ocr-manager.ts +756 -0
- package/src/managers/optimization-manager.ts +262 -0
- package/src/managers/outline-manager.ts +196 -0
- package/src/managers/page-manager.ts +289 -0
- package/src/managers/pattern-detection.ts +440 -0
- package/src/managers/rendering-manager.ts +863 -0
- package/src/managers/search-manager.ts +385 -0
- package/src/managers/security-manager.ts +345 -0
- package/src/managers/signature-manager.ts +1664 -0
- package/src/managers/streams.ts +618 -0
- package/src/managers/xfa-manager.ts +500 -0
- package/src/pdf-creator-manager.ts +494 -0
- package/src/properties.ts +522 -0
- package/src/result-accessors-manager.ts +867 -0
- package/src/tests/advanced-features.test.ts +414 -0
- package/src/tests/advanced.test.ts +266 -0
- package/src/tests/extended-managers.test.ts +316 -0
- package/src/tests/final-utilities.test.ts +455 -0
- package/src/tests/foundation.test.ts +315 -0
- package/src/tests/high-demand.test.ts +257 -0
- package/src/tests/specialized.test.ts +97 -0
- package/src/thumbnail-manager.ts +272 -0
- package/src/types/common.ts +142 -0
- package/src/types/document-types.ts +457 -0
- package/src/types/index.ts +6 -0
- package/src/types/manager-types.ts +284 -0
- package/src/types/native-bindings.ts +517 -0
- package/src/workers/index.ts +7 -0
- package/src/workers/pool.ts +274 -0
- package/src/workers/worker.ts +131 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Manager for PDF page-level operations
|
|
3
|
+
*
|
|
4
|
+
* Provides methods to query page count, dimensions, and validate indices.
|
|
5
|
+
*
|
|
6
|
+
* @example
|
|
7
|
+
* ```typescript
|
|
8
|
+
* import { PageManager, PdfDocument } from 'pdf_oxide';
|
|
9
|
+
*
|
|
10
|
+
* const doc = PdfDocument.open('document.pdf');
|
|
11
|
+
* const pageManager = new PageManager(doc);
|
|
12
|
+
*
|
|
13
|
+
* console.log(`Document has ${pageManager.getPageCount()} pages`);
|
|
14
|
+
*
|
|
15
|
+
* if (pageManager.isValidPageIndex(5)) {
|
|
16
|
+
* const info = pageManager.getPageInfo(5);
|
|
17
|
+
* console.log(`Page 5: ${info.width} x ${info.height} points`);
|
|
18
|
+
* }
|
|
19
|
+
* ```
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
/**
 * Dimensions of a single page as reported by {@link PageManager}.
 */
export interface PageInfo {
  /** Page index (0-based), as passed to `PageManager.getPageInfo`. */
  index: number;

  /** Page width; the usage examples in this file print it as points. */
  width: number;

  /** Page height; the usage examples in this file print it as points. */
  height: number;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Inclusive range of valid page indices for a document.
 *
 * `PageManager.getPageRange` reports an empty document as
 * `{ firstPage: 0, lastPage: -1 }`.
 */
export interface PageRange {
  /** Index of the first page. */
  firstPage: number;

  /** Index of the last page (`-1` when there are no pages). */
  lastPage: number;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Aggregate page-dimension figures returned by `PageManager.getPageStatistics`.
 *
 * Every numeric field is `0` and `hasVariableSizes` is `false` when the
 * document has no pages.
 */
export interface PageStatistics {
  /** Number of pages the statistics were computed over. */
  count: number;

  /** Smallest page width. */
  minWidth: number;

  /** Largest page width. */
  maxWidth: number;

  /** Smallest page height. */
  minHeight: number;

  /** Largest page height. */
  maxHeight: number;

  /** Arithmetic mean of all page widths. */
  averageWidth: number;

  /** Arithmetic mean of all page heights. */
  averageHeight: number;

  /** True when the widths or the heights are not all identical. */
  hasVariableSizes: boolean;
}
|
|
43
|
+
|
|
44
|
+
export class PageManager {
  /**
   * Document handle supplied by the caller. Only `pageCount` and
   * `getPage(index)` are read by this class; no type for it is visible here.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private _document: any;

  /** Memoized results keyed by `page:count`, `page:info:<index>` and `page:info:all`. */
  private _cache: Map<string, unknown>;

  /**
   * Creates a new PageManager for the given document.
   *
   * @param document - The PDF document
   * @throws Error if document is null, undefined or otherwise falsy
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  constructor(document: any) {
    if (!document) {
      throw new Error('Document is required');
    }
    this._document = document;
    this._cache = new Map();
  }

  /**
   * Drops every memoized page count and page info entry.
   */
  clearCache(): void {
    this._cache.clear();
  }

  /**
   * Gets the total number of pages in the document.
   *
   * `pageCount` may be exposed either as a numeric property or as a
   * synchronous method; both are accepted. Any other value (including a
   * Promise from an async method) is treated as `0`.
   *
   * @returns Number of pages; `0` when the count is unavailable or reading it throws
   */
  getPageCount(): number {
    const cacheKey = 'page:count';
    const cached = this._cache.get(cacheKey);
    if (typeof cached === 'number') {
      return cached;
    }

    try {
      const raw: unknown = this._document.pageCount;
      const value: unknown = typeof raw === 'function' ? raw.call(this._document) : raw;
      const count =
        typeof value === 'number' && Number.isFinite(value) && value > 0 ? Math.floor(value) : 0;
      this._cache.set(cacheKey, count);
      return count;
    } catch {
      // Best effort: a failing read is reported as "no pages" and is not cached.
      return 0;
    }
  }

  /**
   * Checks if a page index is valid for this document.
   *
   * @param pageIndex - Page index to validate (0-based)
   * @returns True only for integers in `[0, pageCount)`
   */
  isValidPageIndex(pageIndex: number): boolean {
    return Number.isInteger(pageIndex) && pageIndex >= 0 && pageIndex < this.getPageCount();
  }

  /**
   * Gets information about a specific page.
   *
   * If the underlying `getPage()` call throws, a zero-sized `PageInfo` is
   * returned and nothing is cached, so a later call retries.
   *
   * @param pageIndex - Page index (0-based)
   * @returns A fresh PageInfo object (safe for the caller to mutate)
   * @throws Error if page index is invalid
   *
   * @example
   * ```typescript
   * const info = manager.getPageInfo(0);
   * console.log(`Page 0: ${info.width} x ${info.height} points`);
   * ```
   */
  getPageInfo(pageIndex: number): PageInfo {
    const cached = this._cache.get(PageManager._infoKey(pageIndex)) as PageInfo | undefined;
    if (cached) {
      return { ...cached };
    }

    if (!this.isValidPageIndex(pageIndex)) {
      throw new Error(`Invalid page index: ${pageIndex}`);
    }

    const loaded = this._loadPageInfo(pageIndex);
    return loaded ? { ...loaded } : PageManager._emptyInfo(pageIndex);
  }

  /**
   * Gets information about all pages in the document.
   *
   * @returns A new array of fresh PageInfo objects, ordered by page index
   *
   * @example
   * ```typescript
   * const pages = manager.getAllPageInfo();
   * pages.forEach(page => {
   *   console.log(`Page ${page.index}: ${page.width} x ${page.height}`);
   * });
   * ```
   */
  getAllPageInfo(): PageInfo[] {
    return this._collectPages().map((page) => ({ ...page }));
  }

  /**
   * Checks if the document has no pages.
   *
   * @returns True if the page count is zero
   */
  isEmpty(): boolean {
    return this.getPageCount() === 0;
  }

  /**
   * Checks if the document has more than one page.
   *
   * @returns True if the page count is greater than one
   */
  hasMultiplePages(): boolean {
    return this.getPageCount() > 1;
  }

  /**
   * Gets the valid page range.
   *
   * @returns Object with firstPage and lastPage indices; an empty document
   *          yields `{ firstPage: 0, lastPage: -1 }`
   *
   * @example
   * ```typescript
   * const range = manager.getPageRange();
   * console.log(`Page range: ${range.firstPage} to ${range.lastPage}`);
   * ```
   */
  getPageRange(): PageRange {
    return { firstPage: 0, lastPage: this.getPageCount() - 1 };
  }

  /**
   * Gets page dimension statistics.
   *
   * Computed in a single pass rather than by spreading every dimension into
   * `Math.min`/`Math.max`, which can exceed the engine's argument limit on
   * very large documents.
   *
   * @returns Statistics about page dimensions (all zero for an empty document)
   *
   * @example
   * ```typescript
   * const stats = manager.getPageStatistics();
   * console.log(`Average width: ${stats.averageWidth}`);
   * console.log(`Pages vary in size: ${stats.hasVariableSizes}`);
   * ```
   */
  getPageStatistics(): PageStatistics {
    const pages = this._collectPages();
    const count = pages.length;

    if (count === 0) {
      return {
        count: 0,
        minWidth: 0,
        maxWidth: 0,
        minHeight: 0,
        maxHeight: 0,
        averageWidth: 0,
        averageHeight: 0,
        hasVariableSizes: false,
      };
    }

    let minWidth = Infinity;
    let maxWidth = -Infinity;
    let minHeight = Infinity;
    let maxHeight = -Infinity;
    let widthSum = 0;
    let heightSum = 0;

    for (const { width, height } of pages) {
      minWidth = Math.min(minWidth, width);
      maxWidth = Math.max(maxWidth, width);
      minHeight = Math.min(minHeight, height);
      maxHeight = Math.max(maxHeight, height);
      widthSum += width;
      heightSum += height;
    }

    return {
      count,
      minWidth,
      maxWidth,
      minHeight,
      maxHeight,
      averageWidth: widthSum / count,
      averageHeight: heightSum / count,
      hasVariableSizes: minWidth !== maxWidth || minHeight !== maxHeight,
    };
  }

  /**
   * Gets pages within a specific size range (all bounds inclusive).
   *
   * @param minWidth - Minimum width
   * @param maxWidth - Maximum width
   * @param minHeight - Minimum height
   * @param maxHeight - Maximum height
   * @returns Matching PageInfo objects
   */
  getPagesInSizeRange(
    minWidth: number,
    maxWidth: number,
    minHeight: number,
    maxHeight: number
  ): PageInfo[] {
    return this._selectPages(
      (p) =>
        p.width >= minWidth &&
        p.width <= maxWidth &&
        p.height >= minHeight &&
        p.height <= maxHeight
    );
  }

  /**
   * Gets landscape pages, i.e. pages strictly wider than they are tall.
   *
   * @returns Array of landscape PageInfo objects
   */
  getLandscapePages(): PageInfo[] {
    return this._selectPages((p) => p.width > p.height);
  }

  /**
   * Gets portrait pages, i.e. pages strictly taller than they are wide.
   * Square pages are neither portrait nor landscape.
   *
   * @returns Array of portrait PageInfo objects
   */
  getPortraitPages(): PageInfo[] {
    return this._selectPages((p) => p.height > p.width);
  }

  /** Cache key for the info of a single page. */
  private static _infoKey(pageIndex: number): string {
    return `page:info:${pageIndex}`;
  }

  /** Zero-sized placeholder used when a page cannot be read. */
  private static _emptyInfo(pageIndex: number): PageInfo {
    return { index: pageIndex, width: 0, height: 0 };
  }

  /**
   * Reads one page from the document and caches its dimensions.
   *
   * @returns The cached PageInfo, or `undefined` when `getPage()` throws
   */
  private _loadPageInfo(pageIndex: number): PageInfo | undefined {
    try {
      const page = this._document.getPage(pageIndex);
      const info: PageInfo = {
        index: pageIndex,
        // `||` (not `??`) on purpose: missing, zero and NaN dimensions all become 0.
        width: page?.width || 0,
        height: page?.height || 0,
      };
      this._cache.set(PageManager._infoKey(pageIndex), info);
      return info;
    } catch {
      return undefined;
    }
  }

  /**
   * Returns the internal (shared) list of page infos, building it on first use.
   * The aggregate is cached only when every page was read successfully, so a
   * transient failure does not pin zero-sized entries.
   */
  private _collectPages(): readonly PageInfo[] {
    const cacheKey = 'page:info:all';
    const cached = this._cache.get(cacheKey) as PageInfo[] | undefined;
    if (cached) {
      return cached;
    }

    const count = this.getPageCount();
    const pages: PageInfo[] = [];
    let complete = true;

    for (let i = 0; i < count; i++) {
      const known = this._cache.get(PageManager._infoKey(i)) as PageInfo | undefined;
      const info = known ?? this._loadPageInfo(i);
      if (info) {
        pages.push(info);
      } else {
        complete = false;
        pages.push(PageManager._emptyInfo(i));
      }
    }

    if (complete) {
      this._cache.set(cacheKey, pages);
    }
    return pages;
  }

  /** Filters the internal page list and returns copies of the matches. */
  private _selectPages(predicate: (page: PageInfo) => boolean): PageInfo[] {
    return this._collectPages()
      .filter(predicate)
      .map((page) => ({ ...page }));
  }
}
|
|
@@ -0,0 +1,440 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern Detection Manager - TypeScript/Node.js Implementation
|
|
3
|
+
*
|
|
4
|
+
* Provides heuristic, text-based pattern detection for PDF analysis:
|
|
5
|
+
* - Table detection and extraction
|
|
6
|
+
* - Column detection and analysis
|
|
7
|
+
* - Barcode detection and decoding
|
|
8
|
+
* - Form field detection
|
|
9
|
+
* - Layout pattern recognition
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { PdfDocument } from "../types/document-types.js";
|
|
13
|
+
|
|
14
|
+
/**
 * A rectangular area of a page that was classified as a table.
 *
 * NOTE(review): the coordinate system and units are not established in this
 * file; the text heuristic in `PatternDetectionManager.detectTables` fills the
 * geometry with fixed placeholder values.
 */
export interface TableRegion {
  /** Horizontal position of the region. */
  readonly x: number;

  /** Vertical position of the region. */
  readonly y: number;

  /** Horizontal extent of the region. */
  readonly width: number;

  /** Vertical extent of the region. */
  readonly height: number;

  /** Number of rows, when the detector reports it. */
  readonly rowCount?: number;

  /** Number of columns, when the detector reports it. */
  readonly columnCount?: number;

  /** Detector-assigned score, when the detector reports it. */
  readonly confidence?: number;
}
|
|
26
|
+
|
|
27
|
+
/**
 * A rectangular area of a page that was classified as a text column.
 *
 * NOTE(review): the coordinate system and units are not established in this
 * file; confirm against the consumers of this type.
 */
export interface ColumnRegion {
  /** Horizontal position of the region. */
  readonly x: number;

  /** Vertical position of the region. */
  readonly y: number;

  /** Horizontal extent of the region. */
  readonly width: number;

  /** Vertical extent of the region. */
  readonly height: number;

  /** Position of this column among the columns detected on the page, when reported. */
  readonly columnIndex?: number;

  /** Detector-assigned score, when the detector reports it. */
  readonly confidence?: number;
}
|
|
38
|
+
|
|
39
|
+
/**
 * A barcode found on a page, together with where it was found.
 *
 * NOTE(review): `PatternDetectionManager.detectBarcodes` in this file is a
 * stub that never produces instances; the expected contents of `format` and
 * `value` should be confirmed against the real detector.
 */
export interface BarcodeRegion {
  /** Horizontal position of the region. */
  readonly x: number;

  /** Vertical position of the region. */
  readonly y: number;

  /** Horizontal extent of the region. */
  readonly width: number;

  /** Vertical extent of the region. */
  readonly height: number;

  /** Barcode format identifier. */
  readonly format: string;

  /** Barcode value. */
  readonly value: string;

  /** Detector-assigned score, when the detector reports it. */
  readonly confidence?: number;
}
|
|
51
|
+
|
|
52
|
+
/**
 * A form field located on a page.
 *
 * `PatternDetectionManager.detectFormFields` builds these from the document's
 * extracted form fields, substituting defaults for missing geometry.
 */
export interface FormFieldRegion {
  /** Horizontal position of the field. */
  readonly x: number;

  /** Vertical position of the field. */
  readonly y: number;

  /** Horizontal extent of the field. */
  readonly width: number;

  /** Vertical extent of the field. */
  readonly height: number;

  /** Field type as reported by the document; `"unknown"` when it reports none. */
  readonly fieldType: string;

  /** Field name as reported by the document, if any. */
  readonly fieldName?: string;

  /** Detector-assigned score, when the detector reports it. */
  readonly confidence?: number;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Page layout categories.
 *
 * The analysis in this file only ever reports `SINGLE_COLUMN`,
 * `MULTI_COLUMN` and `TABLE_BASED`; the remaining members are declared but
 * not produced here.
 */
export enum LayoutPatternType {
  SINGLE_COLUMN = "single_column",
  MULTI_COLUMN = "multi_column",
  TABLE_BASED = "table_based",
  FORM_BASED = "form_based",
  MAGAZINE_STYLE = "magazine_style",
  COMPLEX_MIXED = "complex_mixed",
}
|
|
76
|
+
|
|
77
|
+
/**
 * Layout classification of one page, as produced by
 * `PatternDetectionManager.analyzeLayoutPattern`.
 */
export interface LayoutPattern {
  /** Index of the analyzed page. */
  readonly pageIndex: number;

  /** Category the page was assigned to. */
  readonly patternType: LayoutPatternType;

  /** Score attached to the classification. */
  readonly confidence: number;

  /** Detected table regions followed by detected column regions. */
  readonly regions: Array<TableRegion | ColumnRegion>;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Pattern Detection Manager for TypeScript/Node.js
 *
 * Detects tables, columns, form fields and an overall layout category per
 * page. Table and column detection are plain-text heuristics over the page's
 * extracted text; the geometry they report consists of fixed placeholder
 * values, not measured positions. Barcode detection is a stub.
 *
 * Results are memoized per page until {@link PatternDetectionManager.clearCache}
 * is called. Returned arrays are copies, so callers may mutate them freely.
 */
export class PatternDetectionManager {
  /** Height assigned to each text line when synthesizing table geometry. */
  private static readonly LINE_HEIGHT = 15;

  private readonly document: PdfDocument;
  private readonly cache: Map<string, unknown> = new Map();

  /**
   * Create a new PatternDetectionManager.
   *
   * @param document - Document whose pages will be analyzed
   */
  constructor(document: PdfDocument) {
    this.document = document;
  }

  /**
   * Detect tables on a specific page.
   *
   * A table is a run of two or more consecutive text lines that each contain
   * at least one tab character. A run is closed by the first line without a
   * tab or by the end of the text; single tabbed lines are discarded.
   *
   * @param pageIndex - Index of the page to analyze
   * @returns Array of detected table regions (empty when the page has no text)
   */
  async detectTables(pageIndex: number): Promise<TableRegion[]> {
    const cacheKey = `tables:${pageIndex}`;
    const cached = this.cache.get(cacheKey);
    if (cached) {
      return [...(cached as TableRegion[])];
    }

    const text = await this.document.extractText(pageIndex);
    if (!text) {
      return [];
    }

    const lines: string[] = text.split("\n");
    const tables: TableRegion[] = [];
    let runStart = -1;
    let runLength = 0;

    // Emits the current run if it is long enough, then resets the run state.
    const closeRun = (): void => {
      if (runLength > 1) {
        tables.push({
          x: 50,
          y: 100 + runStart * PatternDetectionManager.LINE_HEIGHT,
          width: 500,
          height: runLength * PatternDetectionManager.LINE_HEIGHT,
          rowCount: runLength,
          // Column count is taken from the first row of the run only.
          columnCount: PatternDetectionManager.countTabColumns(lines[runStart]),
          confidence: 0.7,
        });
      }
      runStart = -1;
      runLength = 0;
    };

    for (let i = 0; i < lines.length; i++) {
      if (PatternDetectionManager.countTabColumns(lines[i]) >= 2) {
        if (runStart === -1) {
          runStart = i;
        }
        runLength++;
      } else {
        closeRun();
      }
    }
    // A table that extends to the last line has no terminating line.
    closeRun();

    this.cache.set(cacheKey, tables);
    return [...tables];
  }

  /**
   * Detect columns on a specific page.
   *
   * Counts how often each indentation (offset of the first non-whitespace
   * character) occurs. With more than one distinct indentation, the two most
   * frequent ones are reported as columns; otherwise one full-width column is
   * reported.
   *
   * @param pageIndex - Index of the page to analyze
   * @returns Array of detected column regions (empty when the page has no text)
   */
  async detectColumns(pageIndex: number): Promise<ColumnRegion[]> {
    const cacheKey = `columns:${pageIndex}`;
    const cached = this.cache.get(cacheKey);
    if (cached) {
      return [...(cached as ColumnRegion[])];
    }

    const text = await this.document.extractText(pageIndex);
    if (!text) {
      return [];
    }

    const lines: string[] = text.split("\n");
    const indentCounts = new Map<number, number>();
    for (const line of lines) {
      // -1 for empty and whitespace-only lines, which are ignored.
      const indent = line.search(/\S/);
      if (indent >= 0) {
        indentCounts.set(indent, (indentCounts.get(indent) ?? 0) + 1);
      }
    }

    const columns: ColumnRegion[] = [];
    if (indentCounts.size > 1) {
      const dominant = [...indentCounts.entries()].sort((a, b) => b[1] - a[1]).slice(0, 2);
      dominant.forEach(([indent], columnIndex) => {
        columns.push({
          x: indent * 8,
          y: 50,
          width: 250,
          height: 700,
          columnIndex,
          confidence: 0.6,
        });
      });
    } else {
      columns.push({
        x: 50,
        y: 50,
        width: 500,
        height: 700,
        columnIndex: 0,
        confidence: 0.95,
      });
    }

    this.cache.set(cacheKey, columns);
    return [...columns];
  }

  /**
   * Detect barcodes on a specific page.
   *
   * Not implemented: always resolves to an empty array.
   *
   * @param pageIndex - Index of the page to analyze
   * @returns Array of detected barcodes (currently always empty)
   */
  async detectBarcodes(pageIndex: number): Promise<BarcodeRegion[]> {
    const cacheKey = `barcodes:${pageIndex}`;
    const cached = this.cache.get(cacheKey);
    if (cached) {
      return [...(cached as BarcodeRegion[])];
    }

    // This would typically involve barcode detection via FFI;
    // it requires image processing that is not wired up here.
    const barcodes: BarcodeRegion[] = [];

    this.cache.set(cacheKey, barcodes);
    return [...barcodes];
  }

  /**
   * Detect form fields on a specific page.
   *
   * Uses the document's own form-field extraction and keeps the fields whose
   * `pageIndex` matches. If extraction throws or does not yield an array, an
   * empty array is returned and NOT cached, so a later call retries.
   *
   * @param pageIndex - Index of the page to analyze
   * @returns Array of detected form fields
   */
  async detectFormFields(pageIndex: number): Promise<FormFieldRegion[]> {
    const cacheKey = `form_fields:${pageIndex}`;
    const cached = this.cache.get(cacheKey);
    if (cached) {
      return [...(cached as FormFieldRegion[])];
    }

    // NOTE(review): shape assumed from the properties this method reads;
    // confirm against the PdfDocument type.
    type RawFormField = {
      pageIndex?: number;
      x?: number;
      y?: number;
      width?: number;
      height?: number;
      type?: string;
      name?: string;
    };

    let raw: unknown;
    try {
      raw = await this.document.extractFormFields();
    } catch {
      return [];
    }
    if (!Array.isArray(raw)) {
      return [];
    }

    const formFields: FormFieldRegion[] = (raw as RawFormField[])
      .filter((field) => field.pageIndex === pageIndex)
      .map((field) => ({
        x: field.x || 0,
        y: field.y || 0,
        // `||` on purpose: a zero width/height is replaced by the default size.
        width: field.width || 100,
        height: field.height || 20,
        fieldType: field.type || "unknown",
        fieldName: field.name,
        confidence: 0.9,
      }));

    this.cache.set(cacheKey, formFields);
    return [...formFields];
  }

  /**
   * Analyze layout pattern of a page.
   *
   * The page is `TABLE_BASED` if any table was detected, else `MULTI_COLUMN`
   * if more than one column was detected, else `SINGLE_COLUMN`.
   *
   * @param pageIndex - Index of the page to analyze
   * @returns Detected layout pattern
   */
  async analyzeLayoutPattern(pageIndex: number): Promise<LayoutPattern> {
    const cached = this.cache.get(`layout_pattern:${pageIndex}`);
    if (cached) {
      return PatternDetectionManager.copyLayout(cached as LayoutPattern);
    }

    const tables = await this.detectTables(pageIndex);
    const columns = await this.detectColumns(pageIndex);
    return this.resolveLayout(pageIndex, tables, columns);
  }

  /**
   * Detect all patterns on a specific page.
   *
   * The layout is derived from the tables and columns detected by this same
   * call instead of analyzing the page again in parallel, which would extract
   * the page text twice as often.
   *
   * @param pageIndex - Index of the page to analyze
   * @returns Object with all detected patterns
   */
  async detectAllPatterns(
    pageIndex: number,
  ): Promise<{
    tables: TableRegion[];
    columns: ColumnRegion[];
    barcodes: BarcodeRegion[];
    formFields: FormFieldRegion[];
    layout: LayoutPattern;
  }> {
    const [tables, columns, barcodes, formFields] = await Promise.all([
      this.detectTables(pageIndex),
      this.detectColumns(pageIndex),
      this.detectBarcodes(pageIndex),
      this.detectFormFields(pageIndex),
    ]);
    const layout = this.resolveLayout(pageIndex, tables, columns);

    return { tables, columns, barcodes, formFields, layout };
  }

  /**
   * Analyze patterns across entire document.
   *
   * Pages whose analysis throws are skipped, so the result may contain fewer
   * entries than the document has pages.
   *
   * @returns Array of layout patterns, in page order
   */
  async analyzeDocumentPatterns(): Promise<LayoutPattern[]> {
    const pageCount = await this.document.pageCount();
    const patterns: LayoutPattern[] = [];

    for (let i = 0; i < pageCount; i++) {
      try {
        patterns.push(await this.analyzeLayoutPattern(i));
      } catch {
        // Best effort: skip pages that cannot be analyzed.
      }
    }

    return patterns;
  }

  /**
   * Find pages with specific pattern type.
   *
   * @param patternType - Pattern type to find
   * @returns Array of page indices with the specified pattern
   */
  async findPagesWithPattern(patternType: LayoutPatternType): Promise<number[]> {
    const patterns = await this.analyzeDocumentPatterns();
    return patterns
      .filter((pattern) => pattern.patternType === patternType)
      .map((pattern) => pattern.pageIndex);
  }

  /**
   * Get pattern statistics for the document.
   *
   * A page counts towards `pagesWithColumns` only when more than one column
   * was detected on it.
   *
   * NOTE(review): `avgTablesPerPage` and `avgColumnsPerPage` are averaged over
   * the pages that HAVE tables / multiple columns, not over `totalPages`.
   * Confirm this is the intended meaning of "per page".
   *
   * @returns Statistics about detected patterns
   */
  async getPatternStatistics(): Promise<{
    totalPages: number;
    pagesWithTables: number;
    pagesWithColumns: number;
    avgTablesPerPage: number;
    avgColumnsPerPage: number;
  }> {
    const pageCount = await this.document.pageCount();
    let totalTables = 0;
    let totalColumns = 0;
    let pagesWithTables = 0;
    let pagesWithColumns = 0;

    for (let i = 0; i < pageCount; i++) {
      const tables = await this.detectTables(i);
      const columns = await this.detectColumns(i);

      if (tables.length > 0) {
        pagesWithTables++;
        totalTables += tables.length;
      }

      if (columns.length > 1) {
        pagesWithColumns++;
        totalColumns += columns.length;
      }
    }

    return {
      totalPages: pageCount,
      pagesWithTables,
      pagesWithColumns,
      avgTablesPerPage: pagesWithTables > 0 ? totalTables / pagesWithTables : 0,
      avgColumnsPerPage: pagesWithColumns > 0 ? totalColumns / pagesWithColumns : 0,
    };
  }

  /**
   * Clear the internal cache.
   */
  clearCache(): void {
    this.cache.clear();
  }

  /** Number of tab-separated cells in a line (1 when the line has no tab). */
  private static countTabColumns(line: string | undefined): number {
    return (line?.match(/\t/g) || []).length + 1;
  }

  /** Copies a layout so callers cannot mutate the cached regions array. */
  private static copyLayout(pattern: LayoutPattern): LayoutPattern {
    return { ...pattern, regions: [...pattern.regions] };
  }

  /**
   * Returns the cached layout for a page, or classifies and caches it from
   * the supplied detection results.
   */
  private resolveLayout(
    pageIndex: number,
    tables: TableRegion[],
    columns: ColumnRegion[],
  ): LayoutPattern {
    const cacheKey = `layout_pattern:${pageIndex}`;
    const cached = this.cache.get(cacheKey);
    if (cached) {
      return PatternDetectionManager.copyLayout(cached as LayoutPattern);
    }

    let patternType = LayoutPatternType.SINGLE_COLUMN;
    let confidence = 0.5;

    if (tables.length > 0) {
      patternType = LayoutPatternType.TABLE_BASED;
      confidence = 0.85;
    } else if (columns.length > 1) {
      patternType = LayoutPatternType.MULTI_COLUMN;
      confidence = 0.75;
    }

    const pattern: LayoutPattern = {
      pageIndex,
      patternType,
      confidence,
      regions: [...tables, ...columns],
    };

    this.cache.set(cacheKey, pattern);
    return PatternDetectionManager.copyLayout(pattern);
  }
}
|
|
439
|
+
|
|
440
|
+
export default PatternDetectionManager;
|