@kreuzberg/wasm 4.0.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,257 @@
1
+ import { O as OcrBackendProtocol } from '../types-CKjcIYcX.cjs';
2
+
3
+ /**
4
+ * Tesseract WASM OCR Backend
5
+ *
6
+ * Provides OCR capabilities using tesseract-wasm library for browser environments.
7
+ * Loads training data on-demand from jsDelivr CDN and implements the OcrBackendProtocol.
8
+ *
9
+ * ## Browser-Only Requirement
10
+ *
11
+ * This backend requires browser APIs like createImageBitmap and Web Workers.
12
+ * It will NOT work in Node.js environments without additional canvas polyfills.
13
+ *
14
+ * ## Supported Languages
15
+ *
16
+ * Common language codes (three-letter ISO 639-2/3 codes, plus Tesseract-specific ones such as "chi_sim"):
17
+ * - English: "eng"
18
+ * - German: "deu"
19
+ * - French: "fra"
20
+ * - Spanish: "spa"
21
+ * - Italian: "ita"
22
+ * - Portuguese: "por"
23
+ * - Dutch: "nld"
24
+ * - Russian: "rus"
25
+ * - Chinese (Simplified): "chi_sim"
26
+ * - Chinese (Traditional): "chi_tra"
27
+ * - Japanese: "jpn"
28
+ * - Korean: "kor"
29
+ * - Arabic: "ara"
30
+ * - Hindi: "hin"
31
+ *
32
+ * For the complete language list, see: https://github.com/naptha/tesseract.js
33
+ *
34
+ * @example Basic Usage
35
+ * ```typescript
36
+ * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';
37
+ * import { registerOcrBackend, extractBytes, initWasm } from '@kreuzberg/wasm';
38
+ *
39
+ * // Initialize
40
+ * await initWasm();
41
+ * const backend = new TesseractWasmBackend();
42
+ * await backend.initialize();
43
+ * registerOcrBackend(backend);
44
+ *
45
+ * // Use in extraction
46
+ * const imageBytes = new Uint8Array(buffer);
47
+ * const result = await extractBytes(imageBytes, 'image/png', {
48
+ * ocr: { backend: 'tesseract-wasm', language: 'eng' }
49
+ * });
50
+ * console.log(result.content); // Extracted text
51
+ * ```
52
+ *
53
+ * @example With Language Auto-Detection
54
+ * ```typescript
55
+ * const backend = new TesseractWasmBackend();
56
+ * await backend.initialize();
57
+ * registerOcrBackend(backend);
58
+ *
59
+ * // Extract without specifying language - backend will auto-detect
60
+ * const result = await extractBytes(imageBytes, 'image/png', {
61
+ * ocr: { backend: 'tesseract-wasm' } // language will auto-detect
62
+ * });
63
+ * ```
64
+ */
65
+
66
+ /**
67
+ * TesseractWasmBackend - OCR backend using the tesseract-wasm library
68
+ *
69
+ * Implements the OcrBackendProtocol for the Kreuzberg document extraction pipeline.
70
+ * Provides comprehensive OCR support with model caching, error handling, and progress reporting.
71
+ */
72
+ declare class TesseractWasmBackend implements OcrBackendProtocol {
73
+ /** Tesseract WASM client instance */
74
+ private client;
75
+ /** Track which models are currently loaded to avoid redundant loads */
76
+ private loadedLanguages;
77
+ /** Cache for language availability validation */
78
+ private supportedLangsCache;
79
+ /** Progress callback for UI updates */
80
+ private progressCallback;
81
+ /** Base URL for training data CDN */
82
+ private readonly CDN_BASE_URL;
83
+ /**
84
+ * Return the unique name of this OCR backend
85
+ *
86
+ * @returns Backend identifier "tesseract-wasm"
87
+ */
88
+ name(): string;
89
+ /**
90
+ * Return list of supported language codes
91
+ *
92
+ * Returns a curated list of commonly available Tesseract language models.
93
+ * Tesseract supports many more languages through custom models.
94
+ *
95
+ * @returns Array of Tesseract language codes (mostly three-letter ISO 639-2/3 codes, e.g. "eng"; some Tesseract-specific, e.g. "chi_sim")
96
+ */
97
+ supportedLanguages(): string[];
98
+ /**
99
+ * Initialize the OCR backend
100
+ *
101
+ * Creates the Tesseract WASM client instance. This is called once when
102
+ * the backend is registered with the extraction pipeline.
103
+ *
104
+ * The actual model loading happens in processImage() on-demand to avoid
105
+ * loading all models upfront.
106
+ *
107
+ * @throws {Error} If tesseract-wasm is not available or initialization fails
108
+ *
109
+ * @example
110
+ * ```typescript
111
+ * const backend = new TesseractWasmBackend();
112
+ * try {
113
+ * await backend.initialize();
114
+ * } catch (error) {
115
+ * console.error('Failed to initialize OCR:', error);
116
+ * }
117
+ * ```
118
+ */
119
+ initialize(): Promise<void>;
120
+ /**
121
+ * Process image bytes and extract text via OCR
122
+ *
123
+ * Handles image loading, model loading, OCR processing, and result formatting.
124
+ * Automatically loads the language model on first use and caches it for subsequent calls.
125
+ *
126
+ * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string
127
+ * @param language - ISO 639-2/3 language code (e.g., "eng", "deu")
128
+ * @returns Promise resolving to OCR result with content and metadata
129
+ * @throws {Error} If image processing fails, model loading fails, or language is unsupported
130
+ *
131
+ * @example
132
+ * ```typescript
133
+ * const backend = new TesseractWasmBackend();
134
+ * await backend.initialize();
135
+ *
136
+ * const imageBuffer = await (await fetch('scanned.png')).arrayBuffer();
137
+ * const result = await backend.processImage(
138
+ * new Uint8Array(imageBuffer),
139
+ * 'eng'
140
+ * );
141
+ *
142
+ * console.log(result.content); // Extracted text
143
+ * console.log(result.metadata.confidence); // OCR confidence score
144
+ * ```
145
+ */
146
+ processImage(imageBytes: Uint8Array | string, language: string): Promise<{
147
+ content: string;
148
+ mime_type: string;
149
+ metadata: Record<string, unknown>;
150
+ tables: unknown[];
151
+ }>;
152
+ /**
153
+ * Shutdown the OCR backend and release resources
154
+ *
155
+ * Properly cleans up the Tesseract WASM client, freeing memory and Web Workers.
156
+ * Called when the backend is unregistered or the application shuts down.
157
+ *
158
+ * @throws {Error} If cleanup fails (errors are logged but not critical)
159
+ *
160
+ * @example
161
+ * ```typescript
162
+ * const backend = new TesseractWasmBackend();
163
+ * await backend.initialize();
164
+ * // ... use backend ...
165
+ * await backend.shutdown(); // Clean up resources
166
+ * ```
167
+ */
168
+ shutdown(): Promise<void>;
169
+ /**
170
+ * Set a progress callback for UI updates
171
+ *
172
+ * Allows the UI to display progress during OCR processing.
173
+ * The callback will be called with values from 0 to 100.
174
+ *
175
+ * @param callback - Function to call with progress percentage
176
+ *
177
+ * @example
178
+ * ```typescript
179
+ * const backend = new TesseractWasmBackend();
180
+ * backend.setProgressCallback((progress) => {
181
+ * console.log(`OCR Progress: ${progress}%`);
182
+ * document.getElementById('progress-bar')?.style.setProperty('width', `${progress}%`);
183
+ * });
184
+ * ```
185
+ */
186
+ setProgressCallback(callback: (progress: number) => void): void;
187
+ /**
188
+ * Load language model from CDN
189
+ *
190
+ * Fetches the training data for a specific language from jsDelivr CDN.
191
+ * This is an MVP approach - models are cached by the browser.
192
+ *
193
+ * @param language - ISO 639-2/3 language code
194
+ * @throws {Error} If model download fails or language is not available
195
+ *
196
+ * @internal
197
+ */
198
+ private loadLanguageModel;
199
+ /**
200
+ * Convert image bytes or Base64 string to ImageBitmap
201
+ *
202
+ * Handles both Uint8Array and Base64-encoded image data, converting to
203
+ * ImageBitmap format required by Tesseract WASM.
204
+ *
205
+ * @param imageBytes - Image data as Uint8Array or Base64 string
206
+ * @returns Promise resolving to ImageBitmap
207
+ * @throws {Error} If conversion fails (browser API not available or invalid image data)
208
+ *
209
+ * @internal
210
+ */
211
+ private convertToImageBitmap;
212
+ /**
213
+ * Get confidence score from OCR result
214
+ *
215
+ * Attempts to retrieve confidence score from Tesseract.
216
+ * Returns a safe default if unavailable.
217
+ *
218
+ * @returns Confidence score between 0 and 1
219
+ *
220
+ * @internal
221
+ */
222
+ private getConfidenceScore;
223
+ /**
224
+ * Get page metadata from OCR result
225
+ *
226
+ * Retrieves additional metadata like image dimensions and processing info.
227
+ *
228
+ * @returns Metadata object (may be empty if unavailable)
229
+ *
230
+ * @internal
231
+ */
232
+ private getPageMetadata;
233
+ /**
234
+ * Dynamically load tesseract-wasm module
235
+ *
236
+ * Uses dynamic import to load tesseract-wasm only when needed,
237
+ * avoiding hard dependency in browser environments where it may not be bundled.
238
+ *
239
+ * @returns tesseract-wasm module object
240
+ * @throws {Error} If module cannot be imported
241
+ *
242
+ * @internal
243
+ */
244
+ private loadTesseractWasm;
245
+ /**
246
+ * Report progress to progress callback
247
+ *
248
+ * Internal helper for notifying progress updates during OCR processing.
249
+ *
250
+ * @param progress - Progress percentage (0-100)
251
+ *
252
+ * @internal
253
+ */
254
+ private reportProgress;
255
+ }
256
+
257
+ export { TesseractWasmBackend };
@@ -0,0 +1,257 @@
1
+ import { O as OcrBackendProtocol } from '../types-CKjcIYcX.js';
2
+
3
+ /**
4
+ * Tesseract WASM OCR Backend
5
+ *
6
+ * Provides OCR capabilities using tesseract-wasm library for browser environments.
7
+ * Loads training data on-demand from jsDelivr CDN and implements the OcrBackendProtocol.
8
+ *
9
+ * ## Browser-Only Requirement
10
+ *
11
+ * This backend requires browser APIs like createImageBitmap and Web Workers.
12
+ * It will NOT work in Node.js environments without additional canvas polyfills.
13
+ *
14
+ * ## Supported Languages
15
+ *
16
+ * Common language codes (three-letter ISO 639-2/3 codes, plus Tesseract-specific ones such as "chi_sim"):
17
+ * - English: "eng"
18
+ * - German: "deu"
19
+ * - French: "fra"
20
+ * - Spanish: "spa"
21
+ * - Italian: "ita"
22
+ * - Portuguese: "por"
23
+ * - Dutch: "nld"
24
+ * - Russian: "rus"
25
+ * - Chinese (Simplified): "chi_sim"
26
+ * - Chinese (Traditional): "chi_tra"
27
+ * - Japanese: "jpn"
28
+ * - Korean: "kor"
29
+ * - Arabic: "ara"
30
+ * - Hindi: "hin"
31
+ *
32
+ * For the complete language list, see: https://github.com/naptha/tesseract.js
33
+ *
34
+ * @example Basic Usage
35
+ * ```typescript
36
+ * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';
37
+ * import { registerOcrBackend, extractBytes, initWasm } from '@kreuzberg/wasm';
38
+ *
39
+ * // Initialize
40
+ * await initWasm();
41
+ * const backend = new TesseractWasmBackend();
42
+ * await backend.initialize();
43
+ * registerOcrBackend(backend);
44
+ *
45
+ * // Use in extraction
46
+ * const imageBytes = new Uint8Array(buffer);
47
+ * const result = await extractBytes(imageBytes, 'image/png', {
48
+ * ocr: { backend: 'tesseract-wasm', language: 'eng' }
49
+ * });
50
+ * console.log(result.content); // Extracted text
51
+ * ```
52
+ *
53
+ * @example With Language Auto-Detection
54
+ * ```typescript
55
+ * const backend = new TesseractWasmBackend();
56
+ * await backend.initialize();
57
+ * registerOcrBackend(backend);
58
+ *
59
+ * // Extract without specifying language - backend will auto-detect
60
+ * const result = await extractBytes(imageBytes, 'image/png', {
61
+ * ocr: { backend: 'tesseract-wasm' } // language will auto-detect
62
+ * });
63
+ * ```
64
+ */
65
+
66
+ /**
67
+ * TesseractWasmBackend - OCR backend using the tesseract-wasm library
68
+ *
69
+ * Implements the OcrBackendProtocol for the Kreuzberg document extraction pipeline.
70
+ * Provides comprehensive OCR support with model caching, error handling, and progress reporting.
71
+ */
72
+ declare class TesseractWasmBackend implements OcrBackendProtocol {
73
+ /** Tesseract WASM client instance */
74
+ private client;
75
+ /** Track which models are currently loaded to avoid redundant loads */
76
+ private loadedLanguages;
77
+ /** Cache for language availability validation */
78
+ private supportedLangsCache;
79
+ /** Progress callback for UI updates */
80
+ private progressCallback;
81
+ /** Base URL for training data CDN */
82
+ private readonly CDN_BASE_URL;
83
+ /**
84
+ * Return the unique name of this OCR backend
85
+ *
86
+ * @returns Backend identifier "tesseract-wasm"
87
+ */
88
+ name(): string;
89
+ /**
90
+ * Return list of supported language codes
91
+ *
92
+ * Returns a curated list of commonly available Tesseract language models.
93
+ * Tesseract supports many more languages through custom models.
94
+ *
95
+ * @returns Array of Tesseract language codes (mostly three-letter ISO 639-2/3 codes, e.g. "eng"; some Tesseract-specific, e.g. "chi_sim")
96
+ */
97
+ supportedLanguages(): string[];
98
+ /**
99
+ * Initialize the OCR backend
100
+ *
101
+ * Creates the Tesseract WASM client instance. This is called once when
102
+ * the backend is registered with the extraction pipeline.
103
+ *
104
+ * The actual model loading happens in processImage() on-demand to avoid
105
+ * loading all models upfront.
106
+ *
107
+ * @throws {Error} If tesseract-wasm is not available or initialization fails
108
+ *
109
+ * @example
110
+ * ```typescript
111
+ * const backend = new TesseractWasmBackend();
112
+ * try {
113
+ * await backend.initialize();
114
+ * } catch (error) {
115
+ * console.error('Failed to initialize OCR:', error);
116
+ * }
117
+ * ```
118
+ */
119
+ initialize(): Promise<void>;
120
+ /**
121
+ * Process image bytes and extract text via OCR
122
+ *
123
+ * Handles image loading, model loading, OCR processing, and result formatting.
124
+ * Automatically loads the language model on first use and caches it for subsequent calls.
125
+ *
126
+ * @param imageBytes - Raw image data (Uint8Array) or Base64-encoded string
127
+ * @param language - ISO 639-2/3 language code (e.g., "eng", "deu")
128
+ * @returns Promise resolving to OCR result with content and metadata
129
+ * @throws {Error} If image processing fails, model loading fails, or language is unsupported
130
+ *
131
+ * @example
132
+ * ```typescript
133
+ * const backend = new TesseractWasmBackend();
134
+ * await backend.initialize();
135
+ *
136
+ * const imageBuffer = await (await fetch('scanned.png')).arrayBuffer();
137
+ * const result = await backend.processImage(
138
+ * new Uint8Array(imageBuffer),
139
+ * 'eng'
140
+ * );
141
+ *
142
+ * console.log(result.content); // Extracted text
143
+ * console.log(result.metadata.confidence); // OCR confidence score
144
+ * ```
145
+ */
146
+ processImage(imageBytes: Uint8Array | string, language: string): Promise<{
147
+ content: string;
148
+ mime_type: string;
149
+ metadata: Record<string, unknown>;
150
+ tables: unknown[];
151
+ }>;
152
+ /**
153
+ * Shutdown the OCR backend and release resources
154
+ *
155
+ * Properly cleans up the Tesseract WASM client, freeing memory and Web Workers.
156
+ * Called when the backend is unregistered or the application shuts down.
157
+ *
158
+ * @throws {Error} If cleanup fails (errors are logged but not critical)
159
+ *
160
+ * @example
161
+ * ```typescript
162
+ * const backend = new TesseractWasmBackend();
163
+ * await backend.initialize();
164
+ * // ... use backend ...
165
+ * await backend.shutdown(); // Clean up resources
166
+ * ```
167
+ */
168
+ shutdown(): Promise<void>;
169
+ /**
170
+ * Set a progress callback for UI updates
171
+ *
172
+ * Allows the UI to display progress during OCR processing.
173
+ * The callback will be called with values from 0 to 100.
174
+ *
175
+ * @param callback - Function to call with progress percentage
176
+ *
177
+ * @example
178
+ * ```typescript
179
+ * const backend = new TesseractWasmBackend();
180
+ * backend.setProgressCallback((progress) => {
181
+ * console.log(`OCR Progress: ${progress}%`);
182
+ * document.getElementById('progress-bar')?.style.setProperty('width', `${progress}%`);
183
+ * });
184
+ * ```
185
+ */
186
+ setProgressCallback(callback: (progress: number) => void): void;
187
+ /**
188
+ * Load language model from CDN
189
+ *
190
+ * Fetches the training data for a specific language from jsDelivr CDN.
191
+ * This is an MVP approach - models are cached by the browser.
192
+ *
193
+ * @param language - ISO 639-2/3 language code
194
+ * @throws {Error} If model download fails or language is not available
195
+ *
196
+ * @internal
197
+ */
198
+ private loadLanguageModel;
199
+ /**
200
+ * Convert image bytes or Base64 string to ImageBitmap
201
+ *
202
+ * Handles both Uint8Array and Base64-encoded image data, converting to
203
+ * ImageBitmap format required by Tesseract WASM.
204
+ *
205
+ * @param imageBytes - Image data as Uint8Array or Base64 string
206
+ * @returns Promise resolving to ImageBitmap
207
+ * @throws {Error} If conversion fails (browser API not available or invalid image data)
208
+ *
209
+ * @internal
210
+ */
211
+ private convertToImageBitmap;
212
+ /**
213
+ * Get confidence score from OCR result
214
+ *
215
+ * Attempts to retrieve confidence score from Tesseract.
216
+ * Returns a safe default if unavailable.
217
+ *
218
+ * @returns Confidence score between 0 and 1
219
+ *
220
+ * @internal
221
+ */
222
+ private getConfidenceScore;
223
+ /**
224
+ * Get page metadata from OCR result
225
+ *
226
+ * Retrieves additional metadata like image dimensions and processing info.
227
+ *
228
+ * @returns Metadata object (may be empty if unavailable)
229
+ *
230
+ * @internal
231
+ */
232
+ private getPageMetadata;
233
+ /**
234
+ * Dynamically load tesseract-wasm module
235
+ *
236
+ * Uses dynamic import to load tesseract-wasm only when needed,
237
+ * avoiding hard dependency in browser environments where it may not be bundled.
238
+ *
239
+ * @returns tesseract-wasm module object
240
+ * @throws {Error} If module cannot be imported
241
+ *
242
+ * @internal
243
+ */
244
+ private loadTesseractWasm;
245
+ /**
246
+ * Report progress to progress callback
247
+ *
248
+ * Internal helper for notifying progress updates during OCR processing.
249
+ *
250
+ * @param progress - Progress percentage (0-100)
251
+ *
252
+ * @internal
253
+ */
254
+ private reportProgress;
255
+ }
256
+
257
+ export { TesseractWasmBackend };