pdf-oxide 0.3.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -0
- package/binding.gyp +35 -0
- package/package.json +78 -0
- package/src/builders/annotation-builder.ts +367 -0
- package/src/builders/conversion-options-builder.ts +257 -0
- package/src/builders/index.ts +12 -0
- package/src/builders/metadata-builder.ts +317 -0
- package/src/builders/pdf-builder.ts +386 -0
- package/src/builders/search-options-builder.ts +151 -0
- package/src/document-editor-manager.ts +318 -0
- package/src/errors.ts +1629 -0
- package/src/form-field-manager.ts +666 -0
- package/src/hybrid-ml-manager.ts +283 -0
- package/src/index.ts +453 -0
- package/src/managers/accessibility-manager.ts +338 -0
- package/src/managers/annotation-manager.ts +439 -0
- package/src/managers/barcode-manager.ts +235 -0
- package/src/managers/batch-manager.ts +533 -0
- package/src/managers/cache-manager.ts +486 -0
- package/src/managers/compliance-manager.ts +375 -0
- package/src/managers/content-manager.ts +339 -0
- package/src/managers/document-utility-manager.ts +922 -0
- package/src/managers/dom-pdf-creator.ts +365 -0
- package/src/managers/editing-manager.ts +514 -0
- package/src/managers/enterprise-manager.ts +478 -0
- package/src/managers/extended-managers.ts +437 -0
- package/src/managers/extraction-manager.ts +583 -0
- package/src/managers/final-utilities.ts +429 -0
- package/src/managers/hybrid-ml-advanced.ts +479 -0
- package/src/managers/index.ts +239 -0
- package/src/managers/layer-manager.ts +500 -0
- package/src/managers/metadata-manager.ts +303 -0
- package/src/managers/ocr-manager.ts +756 -0
- package/src/managers/optimization-manager.ts +262 -0
- package/src/managers/outline-manager.ts +196 -0
- package/src/managers/page-manager.ts +289 -0
- package/src/managers/pattern-detection.ts +440 -0
- package/src/managers/rendering-manager.ts +863 -0
- package/src/managers/search-manager.ts +385 -0
- package/src/managers/security-manager.ts +345 -0
- package/src/managers/signature-manager.ts +1664 -0
- package/src/managers/streams.ts +618 -0
- package/src/managers/xfa-manager.ts +500 -0
- package/src/pdf-creator-manager.ts +494 -0
- package/src/properties.ts +522 -0
- package/src/result-accessors-manager.ts +867 -0
- package/src/tests/advanced-features.test.ts +414 -0
- package/src/tests/advanced.test.ts +266 -0
- package/src/tests/extended-managers.test.ts +316 -0
- package/src/tests/final-utilities.test.ts +455 -0
- package/src/tests/foundation.test.ts +315 -0
- package/src/tests/high-demand.test.ts +257 -0
- package/src/tests/specialized.test.ts +97 -0
- package/src/thumbnail-manager.ts +272 -0
- package/src/types/common.ts +142 -0
- package/src/types/document-types.ts +457 -0
- package/src/types/index.ts +6 -0
- package/src/types/manager-types.ts +284 -0
- package/src/types/native-bindings.ts +517 -0
- package/src/workers/index.ts +7 -0
- package/src/workers/pool.ts +274 -0
- package/src/workers/worker.ts +131 -0
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HybridMLManager for advanced PDF analysis using machine learning
|
|
3
|
+
*
|
|
4
|
+
* Analyzes PDF pages for complexity, content type, and optimal extraction strategies.
|
|
5
|
+
* API is consistent with Python, Java, C#, Go, and Swift implementations.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { EventEmitter } from 'events';
|
|
9
|
+
|
|
10
|
+
/**
 * Complexity level assigned to a page by the analysis methods.
 *
 * Members are string-valued, so the serialized form of each level is the
 * lowercase snake_case literal on the right-hand side.
 */
export enum PageComplexity {
  /** Serialized as `'simple'`. */
  Simple = 'simple',

  /** Serialized as `'moderate'`. */
  Moderate = 'moderate',

  /** Serialized as `'complex'`. */
  Complex = 'complex',

  /** Serialized as `'very_complex'`. */
  VeryComplex = 'very_complex',
}
|
|
19
|
+
|
|
20
|
+
/**
 * Classification of what a page predominantly contains.
 *
 * Members are string-valued; the right-hand literal is the serialized form.
 */
export enum ContentType {
  /** Serialized as `'text_only'`. */
  TextOnly = 'text_only',

  /** Serialized as `'text_images'`. */
  TextImages = 'text_images',

  /** Serialized as `'tables'`. */
  Tables = 'tables',

  /** Serialized as `'mixed_layout'`. */
  MixedLayout = 'mixed_layout',

  /** Serialized as `'scanned'`. */
  Scanned = 'scanned',

  /** Serialized as `'form'`. */
  Form = 'form',

  /** Serialized as `'vector_graphics'`. */
  VectorGraphics = 'vector_graphics',
}
|
|
32
|
+
|
|
33
|
+
/**
 * Result of analyzing a single page.
 */
export interface PageAnalysisResult {
  /** Index of the analyzed page (presumably zero-based; confirm against callers). */
  pageIndex: number;

  /** Complexity level assigned to the page. */
  complexity: PageComplexity;

  /** Numeric complexity score; the scale is not defined in this file. */
  complexityScore: number;

  /** Content classification assigned to the page. */
  contentType: ContentType;

  /** Text density of the page; units/range are not defined in this file. */
  textDensity: number;

  /** Image density of the page; units/range are not defined in this file. */
  imageDensity: number;

  /** Whether the page contains text. */
  hasText: boolean;

  /** Whether the page contains images. */
  hasImages: boolean;

  /** Whether the page contains tables. */
  hasTables: boolean;

  /** Estimated processing time for the page; the unit is not stated in this file. */
  estimatedProcessingTime: number;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Recommended way to extract content from a single page.
 */
export interface ExtractionStrategy {
  /** Index of the page the recommendation applies to (presumably zero-based; confirm). */
  pageIndex: number;

  /** Human-readable summary of the recommendation. */
  description: string;

  /** Whether OCR is recommended for this page. */
  recommendsOcr: boolean;

  /** Identifier of the recommended extraction method, as a free-form string. */
  recommendedMethod: string;

  /** Confidence in the recommendation; the scale is not defined in this file. */
  confidence: number;
}
|
|
59
|
+
|
|
60
|
+
/**
 * A table detected on a page, described by its bounding rectangle and grid size.
 */
export interface TableRegion {
  /** Index of the page the table was found on (presumably zero-based; confirm). */
  pageIndex: number;

  /** Horizontal position of the region; origin and units are not defined in this file. */
  x: number;

  /** Vertical position of the region; origin and units are not defined in this file. */
  y: number;

  /** Width of the region. */
  width: number;

  /** Height of the region. */
  height: number;

  /** Number of rows in the table. */
  rowCount: number;

  /** Number of columns in the table. */
  columnCount: number;

  /** Confidence of the detection; the scale is not defined in this file. */
  confidence: number;
}
|
|
73
|
+
|
|
74
|
+
/**
 * A text column detected on a page, described by its horizontal extent.
 */
export interface ColumnRegion {
  /** Horizontal position of the column; origin and units are not defined in this file. */
  x: number;

  /** Width of the column. */
  width: number;

  /** Confidence of the detection; the scale is not defined in this file. */
  confidence: number;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Hybrid ML Manager for advanced PDF analysis.
 *
 * Provides methods to:
 * - Analyze page complexity
 * - Detect content types
 * - Recommend extraction strategies
 * - Detect tables and columns
 * - Estimate processing time
 *
 * Every analysis method currently returns fixed placeholder data (the native
 * FFI call is not wired up yet) and memoizes it in a bounded in-memory cache.
 * The cache holds at most `maxCacheSize` entries and evicts the least
 * recently used one; a cache hit counts as a use.
 *
 * Cached values are handed back by reference, so repeated calls with the same
 * arguments return the same object until it is evicted or the cache is cleared.
 *
 * Emits `'cacheCleared'` when {@link HybridMLManager.clearCache} is called.
 */
export class HybridMLManager extends EventEmitter {
  /** Document handle supplied by the caller; not read by the placeholder implementations. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- native document type is not declared in this module
  private document: any;

  /** Memoized results, iterated from least to most recently used. */
  private resultCache = new Map<string, unknown>();

  /** Upper bound on the number of cached entries. */
  private maxCacheSize = 100;

  /**
   * @param document - Document handle the manager is bound to.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- see field declaration
  constructor(document: any) {
    super();
    this.document = document;
  }

  /**
   * Analyzes a specific page.
   * Matches: Python analyzePage(), Java analyzePage(), C# AnalyzePage()
   *
   * @param pageIndex - Index of the page to analyze; also part of the cache key.
   * @returns The (cached) analysis result for the page.
   */
  async analyzePage(pageIndex: number): Promise<PageAnalysisResult> {
    // In real implementation, would call native FFI
    return this.memoize(
      `ml:analysis:${pageIndex}`,
      (): PageAnalysisResult => ({
        pageIndex,
        complexity: PageComplexity.Moderate,
        complexityScore: 0.5,
        contentType: ContentType.TextImages,
        textDensity: 0.7,
        imageDensity: 0.3,
        hasText: true,
        // NOTE(review): `false` looks inconsistent with TextImages / imageDensity 0.3
        // above; kept as-is because it is placeholder data callers may rely on.
        hasImages: false,
        hasTables: false,
        estimatedProcessingTime: 100,
      }),
    );
  }

  /**
   * Analyzes all pages in the document.
   * Matches: Python analyzeDocument(), Java analyzeDocument(), C# AnalyzeDocument()
   *
   * @returns The (cached) per-page results; the placeholder yields an empty array.
   */
  async analyzeDocument(): Promise<PageAnalysisResult[]> {
    // In real implementation, would call native FFI
    return this.memoize('ml:analysis:all', (): PageAnalysisResult[] => []);
  }

  /**
   * Gets extraction strategy recommendation for a page.
   * Matches: Python getExtractionStrategy(), Java getExtractionStrategy(), C# GetExtractionStrategy()
   *
   * @param pageIndex - Index of the page; also part of the cache key.
   * @returns The (cached) recommendation for the page.
   */
  async getExtractionStrategy(pageIndex: number): Promise<ExtractionStrategy> {
    // In real implementation, would call native FFI
    return this.memoize(
      `ml:strategy:${pageIndex}`,
      (): ExtractionStrategy => ({
        pageIndex,
        description: 'Standard text extraction recommended',
        recommendsOcr: false,
        recommendedMethod: 'text_extraction',
        confidence: 0.9,
      }),
    );
  }

  /**
   * Detects tables on a page.
   * Matches: Python detectTables(), Java detectTables(), C# DetectTables()
   *
   * @param pageIndex - Index of the page; also part of the cache key.
   * @returns The (cached) table regions; the placeholder yields an empty array.
   */
  async detectTables(pageIndex: number): Promise<TableRegion[]> {
    // In real implementation, would call native FFI
    return this.memoize(`ml:tables:${pageIndex}`, (): TableRegion[] => []);
  }

  /**
   * Detects columns on a page.
   * Matches: Python detectColumns(), Java detectColumns(), C# DetectColumns()
   *
   * @param pageIndex - Index of the page; also part of the cache key.
   * @returns The (cached) column regions; the placeholder yields an empty array.
   */
  async detectColumns(pageIndex: number): Promise<ColumnRegion[]> {
    // In real implementation, would call native FFI
    return this.memoize(`ml:columns:${pageIndex}`, (): ColumnRegion[] => []);
  }

  /**
   * Gets average page complexity in document.
   * Matches: Python getAverageComplexity(), Java getAverageComplexity(), C# GetAverageComplexity()
   *
   * @returns The (cached) average complexity; the placeholder yields `0.5`.
   */
  async getAverageComplexity(): Promise<number> {
    // In real implementation, would call native FFI
    return this.memoize<number>('ml:avg_complexity', () => 0.5);
  }

  /**
   * Gets most common content type.
   * Matches: Python getMostCommonContentType(), Java getMostCommonContentType(), C# GetMostCommonContentType()
   *
   * @returns The (cached) content type; the placeholder yields `ContentType.TextImages`.
   */
  async getMostCommonContentType(): Promise<ContentType> {
    // In real implementation, would call native FFI
    return this.memoize<ContentType>('ml:common_content_type', () => ContentType.TextImages);
  }

  /**
   * Estimates total document processing time.
   * Matches: Python estimateProcessingTime(), Java estimateProcessingTime(), C# EstimateProcessingTime()
   *
   * @returns The (cached) estimate; the placeholder yields `1000` (unit not stated here).
   */
  async estimateProcessingTime(): Promise<number> {
    // In real implementation, would call native FFI
    return this.memoize<number>('ml:estimated_time', () => 1000);
  }

  /**
   * Clears the result cache and emits `'cacheCleared'`.
   * Matches: Python clearCache(), Java clearCache(), C# ClearCache()
   */
  clearCache(): void {
    this.resultCache.clear();
    this.emit('cacheCleared');
  }

  /**
   * Gets cache statistics.
   * Matches: Python getCacheStats(), Java getCacheStats(), C# GetCacheStats()
   *
   * @returns An object with `cacheSize` (current entry count), `maxCacheSize`
   * (the configured bound) and `entries` (cache keys, least recently used first).
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- return type kept for API compatibility
  getCacheStats(): Record<string, any> {
    return {
      cacheSize: this.resultCache.size,
      maxCacheSize: this.maxCacheSize,
      entries: Array.from(this.resultCache.keys()),
    };
  }

  // Private helper methods

  /**
   * Returns the cached value for `key`, or computes, stores and returns it.
   * A hit re-inserts the entry so it becomes the most recently used one.
   */
  private memoize<T>(key: string, compute: () => T): T {
    if (this.resultCache.has(key)) {
      // Only `memoize` writes to the cache, and it always stores a `T` under
      // the key it was called with, so this cast mirrors what was stored.
      const hit = this.resultCache.get(key) as T;
      // Map iterates in insertion order; delete + set moves the key to the end.
      this.resultCache.delete(key);
      this.resultCache.set(key, hit);
      return hit;
    }

    const fresh = compute();
    this.setCached(key, fresh);
    return fresh;
  }

  /**
   * Stores `value` under `key` as the most recently used entry, then evicts
   * least recently used entries until the cache is within `maxCacheSize`.
   */
  private setCached(key: string, value: unknown): void {
    // Delete first so overwriting an existing key also refreshes its position.
    this.resultCache.delete(key);
    this.resultCache.set(key, value);

    while (this.resultCache.size > this.maxCacheSize) {
      const oldest = this.resultCache.keys().next();
      if (oldest.done) {
        break;
      }
      this.resultCache.delete(oldest.value);
    }
  }
}
|
|
282
|
+
|
|
283
|
+
export default HybridMLManager;
|