@kreuzberg/wasm 4.0.0-rc.23 → 4.0.0-rc.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/wasm-adapter.d.ts +7 -10
- package/dist/adapters/wasm-adapter.d.ts.map +1 -0
- package/dist/adapters/wasm-adapter.js +41 -19
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.d.ts +23 -240
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +41 -19
- package/dist/index.js.map +1 -1
- package/dist/ocr/registry.d.ts +7 -10
- package/dist/ocr/registry.d.ts.map +1 -0
- package/dist/ocr/tesseract-wasm-backend.d.ts +3 -6
- package/dist/ocr/tesseract-wasm-backend.d.ts.map +1 -0
- package/dist/plugin-registry.d.ts +246 -0
- package/dist/plugin-registry.d.ts.map +1 -0
- package/dist/runtime.d.ts +21 -22
- package/dist/runtime.d.ts.map +1 -0
- package/dist/{types-wVLLDHkl.d.cts → types.d.ts} +24 -25
- package/dist/types.d.ts.map +1 -0
- package/package.json +20 -63
- package/dist/adapters/wasm-adapter.cjs +0 -245
- package/dist/adapters/wasm-adapter.cjs.map +0 -1
- package/dist/adapters/wasm-adapter.d.cts +0 -121
- package/dist/index.cjs +0 -1389
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.cts +0 -639
- package/dist/ocr/registry.cjs +0 -92
- package/dist/ocr/registry.cjs.map +0 -1
- package/dist/ocr/registry.d.cts +0 -102
- package/dist/ocr/tesseract-wasm-backend.cjs +0 -410
- package/dist/ocr/tesseract-wasm-backend.cjs.map +0 -1
- package/dist/ocr/tesseract-wasm-backend.d.cts +0 -257
- package/dist/runtime.cjs +0 -173
- package/dist/runtime.cjs.map +0 -1
- package/dist/runtime.d.cts +0 -256
- package/dist/types-wVLLDHkl.d.ts +0 -364
package/dist/runtime.d.cts
DELETED
|
@@ -1,256 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Runtime detection and environment-specific utilities
|
|
3
|
-
*
|
|
4
|
-
* This module provides utilities for detecting the JavaScript runtime environment,
|
|
5
|
-
* checking for feature availability, and enabling environment-specific WASM loading strategies.
|
|
6
|
-
*
|
|
7
|
-
* @example Basic Runtime Detection
|
|
8
|
-
* ```typescript
|
|
9
|
-
* import { detectRuntime, isBrowser, isNode } from '@kreuzberg/wasm/runtime';
|
|
10
|
-
*
|
|
11
|
-
* if (isBrowser()) {
|
|
12
|
-
* console.log('Running in browser');
|
|
13
|
-
* } else if (isNode()) {
|
|
14
|
-
* console.log('Running in Node.js');
|
|
15
|
-
* }
|
|
16
|
-
* ```
|
|
17
|
-
*
|
|
18
|
-
* @example Feature Detection
|
|
19
|
-
* ```typescript
|
|
20
|
-
* import { hasFileApi, hasWorkers } from '@kreuzberg/wasm/runtime';
|
|
21
|
-
*
|
|
22
|
-
* if (hasFileApi()) {
|
|
23
|
-
* // Can use File API for browser file uploads
|
|
24
|
-
* }
|
|
25
|
-
*
|
|
26
|
-
* if (hasWorkers()) {
|
|
27
|
-
* // Can use Web Workers for parallel processing
|
|
28
|
-
* }
|
|
29
|
-
* ```
|
|
30
|
-
*/
|
|
31
|
-
type RuntimeType = "browser" | "node" | "deno" | "bun" | "unknown";
|
|
32
|
-
/**
|
|
33
|
-
* WebAssembly capabilities available in the runtime
|
|
34
|
-
*/
|
|
35
|
-
interface WasmCapabilities {
|
|
36
|
-
/** Runtime environment type */
|
|
37
|
-
runtime: RuntimeType;
|
|
38
|
-
/** WebAssembly support available */
|
|
39
|
-
hasWasm: boolean;
|
|
40
|
-
/** Streaming WebAssembly instantiation available */
|
|
41
|
-
hasWasmStreaming: boolean;
|
|
42
|
-
/** File API available (browser) */
|
|
43
|
-
hasFileApi: boolean;
|
|
44
|
-
/** Blob API available */
|
|
45
|
-
hasBlob: boolean;
|
|
46
|
-
/** Worker support available */
|
|
47
|
-
hasWorkers: boolean;
|
|
48
|
-
/** SharedArrayBuffer available (may be restricted) */
|
|
49
|
-
hasSharedArrayBuffer: boolean;
|
|
50
|
-
/** Module Workers available */
|
|
51
|
-
hasModuleWorkers: boolean;
|
|
52
|
-
/** BigInt support */
|
|
53
|
-
hasBigInt: boolean;
|
|
54
|
-
/** Specific runtime version if available */
|
|
55
|
-
runtimeVersion?: string;
|
|
56
|
-
}
|
|
57
|
-
/**
|
|
58
|
-
* Detect the current JavaScript runtime
|
|
59
|
-
*
|
|
60
|
-
* Checks for various global objects and properties to determine
|
|
61
|
-
* which JavaScript runtime environment is currently executing.
|
|
62
|
-
*
|
|
63
|
-
* @returns The detected runtime type
|
|
64
|
-
*
|
|
65
|
-
* @example
|
|
66
|
-
* ```typescript
|
|
67
|
-
* import { detectRuntime } from '@kreuzberg/wasm/runtime';
|
|
68
|
-
*
|
|
69
|
-
* const runtime = detectRuntime();
|
|
70
|
-
* switch (runtime) {
|
|
71
|
-
* case 'browser':
|
|
72
|
-
* console.log('Running in browser');
|
|
73
|
-
* break;
|
|
74
|
-
* case 'node':
|
|
75
|
-
* console.log('Running in Node.js');
|
|
76
|
-
* break;
|
|
77
|
-
* case 'deno':
|
|
78
|
-
* console.log('Running in Deno');
|
|
79
|
-
* break;
|
|
80
|
-
* case 'bun':
|
|
81
|
-
* console.log('Running in Bun');
|
|
82
|
-
* break;
|
|
83
|
-
* }
|
|
84
|
-
* ```
|
|
85
|
-
*/
|
|
86
|
-
declare function detectRuntime(): RuntimeType;
|
|
87
|
-
/**
|
|
88
|
-
* Check if running in a browser environment
|
|
89
|
-
*
|
|
90
|
-
* @returns True if running in a browser, false otherwise
|
|
91
|
-
*/
|
|
92
|
-
declare function isBrowser(): boolean;
|
|
93
|
-
/**
|
|
94
|
-
* Check if running in Node.js
|
|
95
|
-
*
|
|
96
|
-
* @returns True if running in Node.js, false otherwise
|
|
97
|
-
*/
|
|
98
|
-
declare function isNode(): boolean;
|
|
99
|
-
/**
|
|
100
|
-
* Check if running in Deno
|
|
101
|
-
*
|
|
102
|
-
* @returns True if running in Deno, false otherwise
|
|
103
|
-
*/
|
|
104
|
-
declare function isDeno(): boolean;
|
|
105
|
-
/**
|
|
106
|
-
* Check if running in Bun
|
|
107
|
-
*
|
|
108
|
-
* @returns True if running in Bun, false otherwise
|
|
109
|
-
*/
|
|
110
|
-
declare function isBun(): boolean;
|
|
111
|
-
/**
|
|
112
|
-
* Check if running in a web environment (browser or similar)
|
|
113
|
-
*
|
|
114
|
-
* @returns True if running in a web browser, false otherwise
|
|
115
|
-
*/
|
|
116
|
-
declare function isWebEnvironment(): boolean;
|
|
117
|
-
/**
|
|
118
|
-
* Check if running in a server-like environment (Node.js, Deno, Bun)
|
|
119
|
-
*
|
|
120
|
-
* @returns True if running on a server runtime, false otherwise
|
|
121
|
-
*/
|
|
122
|
-
declare function isServerEnvironment(): boolean;
|
|
123
|
-
/**
|
|
124
|
-
* Check if File API is available
|
|
125
|
-
*
|
|
126
|
-
* The File API is required for handling browser file uploads.
|
|
127
|
-
*
|
|
128
|
-
* @returns True if File API is available, false otherwise
|
|
129
|
-
*
|
|
130
|
-
* @example
|
|
131
|
-
* ```typescript
|
|
132
|
-
* if (hasFileApi()) {
|
|
133
|
-
* const fileInput = document.getElementById('file');
|
|
134
|
-
* fileInput.addEventListener('change', (e) => {
|
|
135
|
-
* const file = e.target.files?.[0];
|
|
136
|
-
* // Handle file
|
|
137
|
-
* });
|
|
138
|
-
* }
|
|
139
|
-
* ```
|
|
140
|
-
*/
|
|
141
|
-
declare function hasFileApi(): boolean;
|
|
142
|
-
/**
|
|
143
|
-
* Check if Blob API is available
|
|
144
|
-
*
|
|
145
|
-
* @returns True if Blob API is available, false otherwise
|
|
146
|
-
*/
|
|
147
|
-
declare function hasBlob(): boolean;
|
|
148
|
-
/**
|
|
149
|
-
* Check if Web Workers are available
|
|
150
|
-
*
|
|
151
|
-
* @returns True if Web Workers can be created, false otherwise
|
|
152
|
-
*/
|
|
153
|
-
declare function hasWorkers(): boolean;
|
|
154
|
-
/**
|
|
155
|
-
* Check if SharedArrayBuffer is available
|
|
156
|
-
*
|
|
157
|
-
* Note: SharedArrayBuffer is restricted in some browser contexts
|
|
158
|
-
* due to security considerations (Spectre/Meltdown mitigations).
|
|
159
|
-
*
|
|
160
|
-
* @returns True if SharedArrayBuffer is available, false otherwise
|
|
161
|
-
*/
|
|
162
|
-
declare function hasSharedArrayBuffer(): boolean;
|
|
163
|
-
/**
|
|
164
|
-
* Check if module workers are available
|
|
165
|
-
*
|
|
166
|
-
* Module workers allow importing ES modules in worker threads.
|
|
167
|
-
*
|
|
168
|
-
* @returns True if module workers are supported, false otherwise
|
|
169
|
-
*/
|
|
170
|
-
declare function hasModuleWorkers(): boolean;
|
|
171
|
-
/**
|
|
172
|
-
* Check if WebAssembly is available
|
|
173
|
-
*
|
|
174
|
-
* @returns True if WebAssembly is supported, false otherwise
|
|
175
|
-
*/
|
|
176
|
-
declare function hasWasm(): boolean;
|
|
177
|
-
/**
|
|
178
|
-
* Check if WebAssembly.instantiateStreaming is available
|
|
179
|
-
*
|
|
180
|
-
* Streaming instantiation is more efficient than buffering the entire WASM module.
|
|
181
|
-
*
|
|
182
|
-
* @returns True if streaming WebAssembly is supported, false otherwise
|
|
183
|
-
*/
|
|
184
|
-
declare function hasWasmStreaming(): boolean;
|
|
185
|
-
/**
|
|
186
|
-
* Check if BigInt is available
|
|
187
|
-
*
|
|
188
|
-
* @returns True if BigInt type is supported, false otherwise
|
|
189
|
-
*/
|
|
190
|
-
declare function hasBigInt(): boolean;
|
|
191
|
-
/**
|
|
192
|
-
* Get runtime version information
|
|
193
|
-
*
|
|
194
|
-
* @returns Version string if available, undefined otherwise
|
|
195
|
-
*
|
|
196
|
-
* @example
|
|
197
|
-
* ```typescript
|
|
198
|
-
* const version = getRuntimeVersion();
|
|
199
|
-
* console.log(`Running on Node ${version}`); // "Running on Node 18.12.0"
|
|
200
|
-
* ```
|
|
201
|
-
*/
|
|
202
|
-
declare function getRuntimeVersion(): string | undefined;
|
|
203
|
-
/**
|
|
204
|
-
* Get comprehensive WebAssembly capabilities for current runtime
|
|
205
|
-
*
|
|
206
|
-
* Returns detailed information about WASM and related APIs available
|
|
207
|
-
* in the current runtime environment.
|
|
208
|
-
*
|
|
209
|
-
* @returns Object describing available WASM capabilities
|
|
210
|
-
*
|
|
211
|
-
* @example
|
|
212
|
-
* ```typescript
|
|
213
|
-
* import { getWasmCapabilities } from '@kreuzberg/wasm/runtime';
|
|
214
|
-
*
|
|
215
|
-
* const caps = getWasmCapabilities();
|
|
216
|
-
* console.log(`WASM available: ${caps.hasWasm}`);
|
|
217
|
-
* console.log(`Streaming WASM: ${caps.hasWasmStreaming}`);
|
|
218
|
-
* console.log(`Workers available: ${caps.hasWorkers}`);
|
|
219
|
-
*
|
|
220
|
-
* if (caps.hasWasm && caps.hasWorkers) {
|
|
221
|
-
* // Can offload WASM processing to workers
|
|
222
|
-
* }
|
|
223
|
-
* ```
|
|
224
|
-
*/
|
|
225
|
-
declare function getWasmCapabilities(): WasmCapabilities;
|
|
226
|
-
/**
|
|
227
|
-
* Get comprehensive runtime information
|
|
228
|
-
*
|
|
229
|
-
* Returns detailed information about the current runtime environment,
|
|
230
|
-
* capabilities, and identifying information.
|
|
231
|
-
*
|
|
232
|
-
* @returns Object with runtime details and capabilities
|
|
233
|
-
*
|
|
234
|
-
* @example
|
|
235
|
-
* ```typescript
|
|
236
|
-
* const info = getRuntimeInfo();
|
|
237
|
-
* console.log(info.runtime); // 'browser' | 'node' | 'deno' | 'bun'
|
|
238
|
-
* console.log(info.isBrowser); // true/false
|
|
239
|
-
* console.log(info.userAgent); // Browser user agent string
|
|
240
|
-
* console.log(info.capabilities); // Detailed capability information
|
|
241
|
-
* ```
|
|
242
|
-
*/
|
|
243
|
-
declare function getRuntimeInfo(): {
|
|
244
|
-
runtime: RuntimeType;
|
|
245
|
-
isBrowser: boolean;
|
|
246
|
-
isNode: boolean;
|
|
247
|
-
isDeno: boolean;
|
|
248
|
-
isBun: boolean;
|
|
249
|
-
isWeb: boolean;
|
|
250
|
-
isServer: boolean;
|
|
251
|
-
runtimeVersion: string | undefined;
|
|
252
|
-
userAgent: string;
|
|
253
|
-
capabilities: WasmCapabilities;
|
|
254
|
-
};
|
|
255
|
-
|
|
256
|
-
export { type RuntimeType, type WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment };
|
package/dist/types-wVLLDHkl.d.ts
DELETED
|
@@ -1,364 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Type definitions for Kreuzberg WASM bindings
|
|
3
|
-
*
|
|
4
|
-
* These types are generated from the Rust core library and define
|
|
5
|
-
* the interface for extraction, configuration, and results.
|
|
6
|
-
*/
|
|
7
|
-
/**
|
|
8
|
-
* Token reduction configuration
|
|
9
|
-
*/
|
|
10
|
-
interface TokenReductionConfig {
|
|
11
|
-
/** Token reduction mode */
|
|
12
|
-
mode?: string;
|
|
13
|
-
/** Preserve important words during reduction */
|
|
14
|
-
preserveImportantWords?: boolean;
|
|
15
|
-
}
|
|
16
|
-
/**
|
|
17
|
-
* Post-processor configuration
|
|
18
|
-
*/
|
|
19
|
-
interface PostProcessorConfig {
|
|
20
|
-
/** Whether post-processing is enabled */
|
|
21
|
-
enabled?: boolean;
|
|
22
|
-
/** List of enabled processors */
|
|
23
|
-
enabledProcessors?: string[];
|
|
24
|
-
/** List of disabled processors */
|
|
25
|
-
disabledProcessors?: string[];
|
|
26
|
-
}
|
|
27
|
-
/**
|
|
28
|
-
* Keyword extraction algorithm type
|
|
29
|
-
*
|
|
30
|
-
* Supported algorithms:
|
|
31
|
-
* - "yake": YAKE (Yet Another Keyword Extractor) - statistical approach
|
|
32
|
-
* - "rake": RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based
|
|
33
|
-
*/
|
|
34
|
-
type KeywordAlgorithm = "yake" | "rake";
|
|
35
|
-
/**
|
|
36
|
-
* YAKE algorithm-specific parameters
|
|
37
|
-
*/
|
|
38
|
-
interface YakeParams {
|
|
39
|
-
/** Window size for co-occurrence analysis (default: 2) */
|
|
40
|
-
windowSize?: number;
|
|
41
|
-
}
|
|
42
|
-
/**
|
|
43
|
-
* RAKE algorithm-specific parameters
|
|
44
|
-
*/
|
|
45
|
-
interface RakeParams {
|
|
46
|
-
/** Minimum word length to consider (default: 1) */
|
|
47
|
-
minWordLength?: number;
|
|
48
|
-
/** Maximum words in a keyword phrase (default: 3) */
|
|
49
|
-
maxWordsPerPhrase?: number;
|
|
50
|
-
}
|
|
51
|
-
/**
|
|
52
|
-
* Keyword extraction configuration
|
|
53
|
-
*
|
|
54
|
-
* Controls how keywords are extracted from text, including algorithm selection,
|
|
55
|
-
* scoring thresholds, n-gram ranges, and language-specific settings.
|
|
56
|
-
*/
|
|
57
|
-
interface KeywordConfig {
|
|
58
|
-
/** Algorithm to use for extraction (default: "yake") */
|
|
59
|
-
algorithm?: KeywordAlgorithm;
|
|
60
|
-
/** Maximum number of keywords to extract (default: 10) */
|
|
61
|
-
maxKeywords?: number;
|
|
62
|
-
/** Minimum score threshold 0.0-1.0 (default: 0.0) */
|
|
63
|
-
minScore?: number;
|
|
64
|
-
/** N-gram range [min, max] for keyword extraction (default: [1, 3]) */
|
|
65
|
-
ngramRange?: [number, number];
|
|
66
|
-
/** Language code for stopword filtering (e.g., "en", "de", "fr") */
|
|
67
|
-
language?: string;
|
|
68
|
-
/** YAKE-specific tuning parameters */
|
|
69
|
-
yakeParams?: YakeParams;
|
|
70
|
-
/** RAKE-specific tuning parameters */
|
|
71
|
-
rakeParams?: RakeParams;
|
|
72
|
-
}
|
|
73
|
-
/**
|
|
74
|
-
* Extracted keyword with relevance metadata
|
|
75
|
-
*
|
|
76
|
-
* Represents a single keyword extracted from text along with its relevance score,
|
|
77
|
-
* the algorithm that extracted it, and optional position information.
|
|
78
|
-
*/
|
|
79
|
-
interface ExtractedKeyword {
|
|
80
|
-
/** The keyword text */
|
|
81
|
-
text: string;
|
|
82
|
-
/** Relevance score (higher is better, algorithm-specific range) */
|
|
83
|
-
score: number;
|
|
84
|
-
/** Algorithm that extracted this keyword */
|
|
85
|
-
algorithm: KeywordAlgorithm;
|
|
86
|
-
/** Optional positions where keyword appears in text (character offsets) */
|
|
87
|
-
positions?: number[];
|
|
88
|
-
}
|
|
89
|
-
/**
|
|
90
|
-
* Configuration for document extraction
|
|
91
|
-
*/
|
|
92
|
-
interface ExtractionConfig {
|
|
93
|
-
/** OCR configuration */
|
|
94
|
-
ocr?: OcrConfig;
|
|
95
|
-
/** Chunking configuration */
|
|
96
|
-
chunking?: ChunkingConfig;
|
|
97
|
-
/** Image extraction configuration */
|
|
98
|
-
images?: ImageExtractionConfig;
|
|
99
|
-
/** Page extraction configuration */
|
|
100
|
-
pages?: PageExtractionConfig;
|
|
101
|
-
/** Language detection configuration */
|
|
102
|
-
languageDetection?: LanguageDetectionConfig;
|
|
103
|
-
/** PDF extraction options */
|
|
104
|
-
pdfOptions?: PdfConfig;
|
|
105
|
-
/** Token reduction configuration */
|
|
106
|
-
tokenReduction?: TokenReductionConfig;
|
|
107
|
-
/** Post-processor configuration */
|
|
108
|
-
postprocessor?: PostProcessorConfig;
|
|
109
|
-
/** Keyword extraction configuration */
|
|
110
|
-
keywords?: KeywordConfig;
|
|
111
|
-
/** Whether to use caching */
|
|
112
|
-
useCache?: boolean;
|
|
113
|
-
/** Enable quality processing */
|
|
114
|
-
enableQualityProcessing?: boolean;
|
|
115
|
-
/** Force OCR even if text is available */
|
|
116
|
-
forceOcr?: boolean;
|
|
117
|
-
/** Maximum concurrent extractions */
|
|
118
|
-
maxConcurrentExtractions?: number;
|
|
119
|
-
}
|
|
120
|
-
/**
|
|
121
|
-
* Tesseract OCR configuration
|
|
122
|
-
*/
|
|
123
|
-
interface TesseractConfig {
|
|
124
|
-
/** Tesseract page segmentation mode */
|
|
125
|
-
psm?: number;
|
|
126
|
-
/** Enable table detection */
|
|
127
|
-
enableTableDetection?: boolean;
|
|
128
|
-
/** Character whitelist for recognition */
|
|
129
|
-
tesseditCharWhitelist?: string;
|
|
130
|
-
}
|
|
131
|
-
/**
|
|
132
|
-
* OCR configuration
|
|
133
|
-
*/
|
|
134
|
-
interface OcrConfig {
|
|
135
|
-
/** OCR backend to use */
|
|
136
|
-
backend?: string;
|
|
137
|
-
/** Language codes (ISO 639) */
|
|
138
|
-
languages?: string[];
|
|
139
|
-
/** Whether to perform OCR */
|
|
140
|
-
enabled?: boolean;
|
|
141
|
-
/** Tesseract-specific configuration */
|
|
142
|
-
tesseractConfig?: TesseractConfig;
|
|
143
|
-
/** Language code for OCR */
|
|
144
|
-
language?: string;
|
|
145
|
-
}
|
|
146
|
-
/**
|
|
147
|
-
* Chunking configuration
|
|
148
|
-
*/
|
|
149
|
-
interface ChunkingConfig {
|
|
150
|
-
/** Maximum characters per chunk */
|
|
151
|
-
maxChars?: number;
|
|
152
|
-
/** Overlap between chunks */
|
|
153
|
-
maxOverlap?: number;
|
|
154
|
-
}
|
|
155
|
-
/**
|
|
156
|
-
* Image extraction configuration
|
|
157
|
-
*/
|
|
158
|
-
interface ImageExtractionConfig {
|
|
159
|
-
/** Whether to extract images */
|
|
160
|
-
enabled?: boolean;
|
|
161
|
-
/** Target DPI for image extraction */
|
|
162
|
-
targetDpi?: number;
|
|
163
|
-
/** Maximum image dimension in pixels */
|
|
164
|
-
maxImageDimension?: number;
|
|
165
|
-
/** Automatically adjust DPI */
|
|
166
|
-
autoAdjustDpi?: boolean;
|
|
167
|
-
/** Minimum DPI threshold */
|
|
168
|
-
minDpi?: number;
|
|
169
|
-
/** Maximum DPI threshold */
|
|
170
|
-
maxDpi?: number;
|
|
171
|
-
}
|
|
172
|
-
/**
|
|
173
|
-
* PDF extraction configuration
|
|
174
|
-
*/
|
|
175
|
-
interface PdfConfig {
|
|
176
|
-
/** Whether to extract images from PDF */
|
|
177
|
-
extractImages?: boolean;
|
|
178
|
-
/** Passwords for encrypted PDFs */
|
|
179
|
-
passwords?: string[];
|
|
180
|
-
/** Whether to extract metadata */
|
|
181
|
-
extractMetadata?: boolean;
|
|
182
|
-
}
|
|
183
|
-
/**
|
|
184
|
-
* Page extraction configuration
|
|
185
|
-
*/
|
|
186
|
-
interface PageExtractionConfig {
|
|
187
|
-
/** Extract pages as separate array (ExtractionResult.pages) */
|
|
188
|
-
extractPages?: boolean;
|
|
189
|
-
/** Insert page markers in main content string */
|
|
190
|
-
insertPageMarkers?: boolean;
|
|
191
|
-
/** Page marker format (use {page_num} placeholder) */
|
|
192
|
-
markerFormat?: string;
|
|
193
|
-
}
|
|
194
|
-
/**
|
|
195
|
-
* Language detection configuration
|
|
196
|
-
*/
|
|
197
|
-
interface LanguageDetectionConfig {
|
|
198
|
-
/** Whether to detect languages */
|
|
199
|
-
enabled?: boolean;
|
|
200
|
-
}
|
|
201
|
-
/**
|
|
202
|
-
* Result of document extraction
|
|
203
|
-
*/
|
|
204
|
-
interface ExtractionResult {
|
|
205
|
-
/** Extracted text content */
|
|
206
|
-
content: string;
|
|
207
|
-
/** MIME type of the document */
|
|
208
|
-
mimeType: string;
|
|
209
|
-
/** Document metadata */
|
|
210
|
-
metadata: Metadata;
|
|
211
|
-
/** Extracted tables */
|
|
212
|
-
tables: Table[];
|
|
213
|
-
/** Detected languages (ISO 639 codes) */
|
|
214
|
-
detectedLanguages?: string[] | null;
|
|
215
|
-
/** Text chunks when chunking is enabled */
|
|
216
|
-
chunks?: Chunk[] | null;
|
|
217
|
-
/** Extracted images */
|
|
218
|
-
images?: ExtractedImage[] | null;
|
|
219
|
-
/** Per-page content */
|
|
220
|
-
pages?: PageContent[] | null;
|
|
221
|
-
/** Extracted keywords when keyword extraction is enabled */
|
|
222
|
-
keywords?: ExtractedKeyword[] | null;
|
|
223
|
-
}
|
|
224
|
-
/**
|
|
225
|
-
* Document metadata
|
|
226
|
-
*/
|
|
227
|
-
interface Metadata {
|
|
228
|
-
/** Document title */
|
|
229
|
-
title?: string;
|
|
230
|
-
/** Document subject or description */
|
|
231
|
-
subject?: string;
|
|
232
|
-
/** Document author(s) */
|
|
233
|
-
authors?: string[];
|
|
234
|
-
/** Keywords/tags */
|
|
235
|
-
keywords?: string[];
|
|
236
|
-
/** Primary language (ISO 639 code) */
|
|
237
|
-
language?: string;
|
|
238
|
-
/** Creation timestamp (ISO 8601 format) */
|
|
239
|
-
createdAt?: string;
|
|
240
|
-
/** Last modification timestamp (ISO 8601 format) */
|
|
241
|
-
modifiedAt?: string;
|
|
242
|
-
/** User who created the document */
|
|
243
|
-
creator?: string;
|
|
244
|
-
/** User who last modified the document */
|
|
245
|
-
lastModifiedBy?: string;
|
|
246
|
-
/** Number of pages/slides */
|
|
247
|
-
pageCount?: number;
|
|
248
|
-
/** Format-specific metadata */
|
|
249
|
-
formatMetadata?: unknown;
|
|
250
|
-
/**
|
|
251
|
-
* Additional fields may be added at runtime by postprocessors.
|
|
252
|
-
* Use bracket notation to safely access unexpected properties.
|
|
253
|
-
*/
|
|
254
|
-
[key: string]: unknown;
|
|
255
|
-
}
|
|
256
|
-
/**
|
|
257
|
-
* Extracted table
|
|
258
|
-
*/
|
|
259
|
-
interface Table {
|
|
260
|
-
/** Table cells/rows */
|
|
261
|
-
cells?: string[][];
|
|
262
|
-
/** Table markdown representation */
|
|
263
|
-
markdown?: string;
|
|
264
|
-
/** Page number if available */
|
|
265
|
-
pageNumber?: number;
|
|
266
|
-
/** Table headers */
|
|
267
|
-
headers?: string[];
|
|
268
|
-
/** Table rows */
|
|
269
|
-
rows?: string[][];
|
|
270
|
-
}
|
|
271
|
-
/**
|
|
272
|
-
* Chunk metadata
|
|
273
|
-
*/
|
|
274
|
-
interface ChunkMetadata {
|
|
275
|
-
/** Character start position in original content */
|
|
276
|
-
charStart: number;
|
|
277
|
-
/** Character end position in original content */
|
|
278
|
-
charEnd: number;
|
|
279
|
-
/** Token count if available */
|
|
280
|
-
tokenCount: number | null;
|
|
281
|
-
/** Index of this chunk */
|
|
282
|
-
chunkIndex: number;
|
|
283
|
-
/** Total number of chunks */
|
|
284
|
-
totalChunks: number;
|
|
285
|
-
}
|
|
286
|
-
/**
|
|
287
|
-
* Text chunk from chunked content
|
|
288
|
-
*/
|
|
289
|
-
interface Chunk {
|
|
290
|
-
/** Chunk text content */
|
|
291
|
-
content: string;
|
|
292
|
-
/** Chunk metadata */
|
|
293
|
-
metadata?: ChunkMetadata;
|
|
294
|
-
/** Character position in original content (legacy) */
|
|
295
|
-
charIndex?: number;
|
|
296
|
-
/** Token count if available (legacy) */
|
|
297
|
-
tokenCount?: number;
|
|
298
|
-
/** Embedding vector if computed */
|
|
299
|
-
embedding?: number[] | null;
|
|
300
|
-
}
|
|
301
|
-
/**
|
|
302
|
-
* Extracted image from document
|
|
303
|
-
*/
|
|
304
|
-
interface ExtractedImage {
|
|
305
|
-
/** Image data as Uint8Array or base64 string */
|
|
306
|
-
data: Uint8Array | string;
|
|
307
|
-
/** Image format/MIME type */
|
|
308
|
-
format?: string;
|
|
309
|
-
/** MIME type of the image */
|
|
310
|
-
mimeType?: string;
|
|
311
|
-
/** Image index in document */
|
|
312
|
-
imageIndex?: number;
|
|
313
|
-
/** Page number if available */
|
|
314
|
-
pageNumber?: number | null;
|
|
315
|
-
/** Image width in pixels */
|
|
316
|
-
width?: number | null;
|
|
317
|
-
/** Image height in pixels */
|
|
318
|
-
height?: number | null;
|
|
319
|
-
/** Color space of the image */
|
|
320
|
-
colorspace?: string | null;
|
|
321
|
-
/** Bits per color component */
|
|
322
|
-
bitsPerComponent?: number | null;
|
|
323
|
-
/** Whether this is a mask image */
|
|
324
|
-
isMask?: boolean;
|
|
325
|
-
/** Image description */
|
|
326
|
-
description?: string | null;
|
|
327
|
-
/** Optional OCR result from the image */
|
|
328
|
-
ocrResult?: ExtractionResult | string | null;
|
|
329
|
-
}
|
|
330
|
-
/**
|
|
331
|
-
* Per-page content
|
|
332
|
-
*/
|
|
333
|
-
interface PageContent {
|
|
334
|
-
/** Page number (1-indexed) */
|
|
335
|
-
pageNumber: number;
|
|
336
|
-
/** Text content of the page */
|
|
337
|
-
content: string;
|
|
338
|
-
/** Tables on this page */
|
|
339
|
-
tables?: Table[];
|
|
340
|
-
/** Images on this page */
|
|
341
|
-
images?: ExtractedImage[];
|
|
342
|
-
}
|
|
343
|
-
/**
|
|
344
|
-
* OCR backend protocol/interface
|
|
345
|
-
*/
|
|
346
|
-
interface OcrBackendProtocol {
|
|
347
|
-
/** Get the backend name */
|
|
348
|
-
name(): string;
|
|
349
|
-
/** Get supported language codes */
|
|
350
|
-
supportedLanguages?(): string[];
|
|
351
|
-
/** Initialize the backend */
|
|
352
|
-
initialize(options?: Record<string, unknown>): void | Promise<void>;
|
|
353
|
-
/** Shutdown the backend */
|
|
354
|
-
shutdown?(): void | Promise<void>;
|
|
355
|
-
/** Process an image with OCR */
|
|
356
|
-
processImage(imageData: Uint8Array | string, language?: string): Promise<{
|
|
357
|
-
content: string;
|
|
358
|
-
mime_type: string;
|
|
359
|
-
metadata?: Record<string, unknown>;
|
|
360
|
-
tables?: unknown[];
|
|
361
|
-
} | string>;
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
export type { Chunk as C, ExtractionResult as E, ImageExtractionConfig as I, KeywordAlgorithm as K, LanguageDetectionConfig as L, Metadata as M, OcrBackendProtocol as O, PageContent as P, RakeParams as R, Table as T, YakeParams as Y, ExtractionConfig as a, ChunkingConfig as b, ChunkMetadata as c, ExtractedImage as d, OcrConfig as e, PageExtractionConfig as f, PdfConfig as g, PostProcessorConfig as h, TesseractConfig as i, TokenReductionConfig as j, KeywordConfig as k, ExtractedKeyword as l };
|