@kreuzberg/wasm 4.0.0-rc.21 → 4.0.0-rc.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +520 -837
- package/dist/adapters/wasm-adapter.d.ts +7 -10
- package/dist/adapters/wasm-adapter.d.ts.map +1 -0
- package/dist/adapters/wasm-adapter.js +41 -19
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.d.ts +23 -24
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +240 -67
- package/dist/index.js.map +1 -1
- package/dist/ocr/registry.d.ts +7 -10
- package/dist/ocr/registry.d.ts.map +1 -0
- package/dist/ocr/registry.js.map +1 -1
- package/dist/ocr/tesseract-wasm-backend.d.ts +3 -6
- package/dist/ocr/tesseract-wasm-backend.d.ts.map +1 -0
- package/dist/ocr/tesseract-wasm-backend.js +0 -46
- package/dist/ocr/tesseract-wasm-backend.js.map +1 -1
- package/dist/pdfium.js +0 -5
- package/dist/plugin-registry.d.ts +246 -0
- package/dist/plugin-registry.d.ts.map +1 -0
- package/dist/runtime.d.ts +21 -22
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +0 -1
- package/dist/runtime.js.map +1 -1
- package/dist/{types-CKjcIYcX.d.ts → types.d.ts} +91 -22
- package/dist/types.d.ts.map +1 -0
- package/package.json +119 -162
- package/dist/adapters/wasm-adapter.cjs +0 -245
- package/dist/adapters/wasm-adapter.cjs.map +0 -1
- package/dist/adapters/wasm-adapter.d.cts +0 -121
- package/dist/index.cjs +0 -1245
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.cts +0 -423
- package/dist/ocr/registry.cjs +0 -92
- package/dist/ocr/registry.cjs.map +0 -1
- package/dist/ocr/registry.d.cts +0 -102
- package/dist/ocr/tesseract-wasm-backend.cjs +0 -456
- package/dist/ocr/tesseract-wasm-backend.cjs.map +0 -1
- package/dist/ocr/tesseract-wasm-backend.d.cts +0 -257
- package/dist/runtime.cjs +0 -174
- package/dist/runtime.cjs.map +0 -1
- package/dist/runtime.d.cts +0 -256
- package/dist/types-CKjcIYcX.d.cts +0 -294
package/dist/index.d.cts
DELETED
|
@@ -1,423 +0,0 @@
|
|
|
1
|
-
import { E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.cjs.js';
|
|
2
|
-
export { C as Chunk, b as ChunkingConfig, c as ChunkMetadata, d as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, O as OcrBackendProtocol, e as OcrConfig, P as PageContent, f as PageExtractionConfig, g as PdfConfig, h as PostProcessorConfig, T as Table, i as TesseractConfig, j as TokenReductionConfig, E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.cjs.js';
|
|
3
|
-
export { configToJS, fileToUint8Array, isValidExtractionResult, jsToExtractionResult, wrapWasmError } from './adapters/wasm-adapter.cjs';
|
|
4
|
-
export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend } from './ocr/registry.cjs';
|
|
5
|
-
export { TesseractWasmBackend } from './ocr/tesseract-wasm-backend.cjs';
|
|
6
|
-
export { type RuntimeType, type WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.d.cts';
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* Kreuzberg - WebAssembly Bindings for Browser and Runtime Environments
|
|
10
|
-
*
|
|
11
|
-
* This module provides WebAssembly bindings for Kreuzberg document intelligence,
|
|
12
|
-
* enabling high-performance document extraction in browser and JavaScript runtime environments.
|
|
13
|
-
*
|
|
14
|
-
* ## Features
|
|
15
|
-
*
|
|
16
|
-
* - Extract text, metadata, and tables from documents
|
|
17
|
-
* - Support for multiple document formats (PDF, Office, images, etc.)
|
|
18
|
-
* - Browser and runtime-compatible WASM bindings
|
|
19
|
-
* - Type-safe TypeScript interfaces
|
|
20
|
-
* - Runtime detection and feature capability checking
|
|
21
|
-
* - Automatic type conversion and error handling
|
|
22
|
-
*
|
|
23
|
-
* ## Installation
|
|
24
|
-
*
|
|
25
|
-
* ```bash
|
|
26
|
-
* npm install @kreuzberg/wasm
|
|
27
|
-
* ```
|
|
28
|
-
*
|
|
29
|
-
* ## Basic Usage
|
|
30
|
-
*
|
|
31
|
-
* ```typescript
|
|
32
|
-
* import { extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
33
|
-
*
|
|
34
|
-
* // Initialize WASM module once at app startup
|
|
35
|
-
* await initWasm();
|
|
36
|
-
*
|
|
37
|
-
* // Extract from bytes
|
|
38
|
-
* const bytes = new Uint8Array(buffer);
|
|
39
|
-
* const result = await extractBytes(bytes, 'application/pdf');
|
|
40
|
-
* console.log(result.content);
|
|
41
|
-
* ```
|
|
42
|
-
*
|
|
43
|
-
* ## Browser Usage with File Input
|
|
44
|
-
*
|
|
45
|
-
* ```typescript
|
|
46
|
-
* import { extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
47
|
-
* import { fileToUint8Array } from '@kreuzberg/wasm/adapters/wasm-adapter';
|
|
48
|
-
*
|
|
49
|
-
* // Initialize once at app startup
|
|
50
|
-
* await initWasm();
|
|
51
|
-
*
|
|
52
|
-
* // Handle file input
|
|
53
|
-
* const fileInput = document.getElementById('file');
|
|
54
|
-
* fileInput.addEventListener('change', async (e) => {
|
|
55
|
-
* const file = e.target.files?.[0];
|
|
56
|
-
* if (file) {
|
|
57
|
-
* const bytes = await fileToUint8Array(file);
|
|
58
|
-
* const result = await extractBytes(bytes, file.type);
|
|
59
|
-
* console.log(result.content);
|
|
60
|
-
* }
|
|
61
|
-
* });
|
|
62
|
-
* ```
|
|
63
|
-
*
|
|
64
|
-
* ## Runtime Detection
|
|
65
|
-
*
|
|
66
|
-
* ```typescript
|
|
67
|
-
* import { detectRuntime, getWasmCapabilities } from '@kreuzberg/wasm/runtime';
|
|
68
|
-
*
|
|
69
|
-
* const runtime = detectRuntime();
|
|
70
|
-
* const caps = getWasmCapabilities();
|
|
71
|
-
*
|
|
72
|
-
* if (caps.hasWorkers) {
|
|
73
|
-
* // Can use Web Workers for parallel processing
|
|
74
|
-
* }
|
|
75
|
-
* ```
|
|
76
|
-
*
|
|
77
|
-
* ## Configuration
|
|
78
|
-
*
|
|
79
|
-
* ```typescript
|
|
80
|
-
* import { extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
81
|
-
* import type { ExtractionConfig } from '@kreuzberg/wasm';
|
|
82
|
-
*
|
|
83
|
-
* await initWasm();
|
|
84
|
-
*
|
|
85
|
-
* const config: ExtractionConfig = {
|
|
86
|
-
* ocr: {
|
|
87
|
-
* backend: 'tesseract',
|
|
88
|
-
* language: 'eng'
|
|
89
|
-
* },
|
|
90
|
-
* chunking: {
|
|
91
|
-
* maxChars: 1000,
|
|
92
|
-
* chunkOverlap: 100
|
|
93
|
-
* },
|
|
94
|
-
* images: {
|
|
95
|
-
* extractImages: true,
|
|
96
|
-
* targetDpi: 150
|
|
97
|
-
* }
|
|
98
|
-
* };
|
|
99
|
-
*
|
|
100
|
-
* const result = await extractBytes(bytes, 'application/pdf', config);
|
|
101
|
-
* ```
|
|
102
|
-
*/
|
|
103
|
-
|
|
104
|
-
declare function initWasm(): Promise<void>;
|
|
105
|
-
/**
|
|
106
|
-
* Check if WASM module is initialized
|
|
107
|
-
*
|
|
108
|
-
* @returns True if WASM module is initialized, false otherwise
|
|
109
|
-
*
|
|
110
|
-
* @example
|
|
111
|
-
* ```typescript
|
|
112
|
-
* if (!isInitialized()) {
|
|
113
|
-
* await initWasm();
|
|
114
|
-
* }
|
|
115
|
-
* ```
|
|
116
|
-
*/
|
|
117
|
-
declare function isInitialized(): boolean;
|
|
118
|
-
/**
|
|
119
|
-
* Get WASM module version
|
|
120
|
-
*
|
|
121
|
-
* @throws {Error} If WASM module is not initialized
|
|
122
|
-
* @returns The version string of the WASM module
|
|
123
|
-
*
|
|
124
|
-
* @example
|
|
125
|
-
* ```typescript
|
|
126
|
-
* const version = getVersion();
|
|
127
|
-
* console.log(`Using Kreuzberg ${version}`);
|
|
128
|
-
* ```
|
|
129
|
-
*/
|
|
130
|
-
declare function getVersion(): string;
|
|
131
|
-
/**
|
|
132
|
-
* Get initialization error if module failed to load
|
|
133
|
-
*
|
|
134
|
-
* @returns The error that occurred during initialization, or null if no error
|
|
135
|
-
*
|
|
136
|
-
* @internal
|
|
137
|
-
*/
|
|
138
|
-
declare function getInitializationError(): Error | null;
|
|
139
|
-
/**
|
|
140
|
-
* Extract content from bytes (document data)
|
|
141
|
-
*
|
|
142
|
-
* Extracts text, metadata, tables, images, and other content from document bytes.
|
|
143
|
-
* Automatically detects document type from MIME type and applies appropriate extraction logic.
|
|
144
|
-
*
|
|
145
|
-
* @param data - The document bytes to extract from
|
|
146
|
-
* @param mimeType - MIME type of the document (e.g., 'application/pdf', 'image/jpeg')
|
|
147
|
-
* @param config - Optional extraction configuration
|
|
148
|
-
* @returns Promise resolving to the extraction result
|
|
149
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
150
|
-
*
|
|
151
|
-
* @example Extract PDF
|
|
152
|
-
* ```typescript
|
|
153
|
-
* const bytes = new Uint8Array(buffer);
|
|
154
|
-
* const result = await extractBytes(bytes, 'application/pdf');
|
|
155
|
-
* console.log(result.content);
|
|
156
|
-
* console.log(result.tables);
|
|
157
|
-
* ```
|
|
158
|
-
*
|
|
159
|
-
* @example Extract with Configuration
|
|
160
|
-
* ```typescript
|
|
161
|
-
* const result = await extractBytes(bytes, 'application/pdf', {
|
|
162
|
-
* ocr: {
|
|
163
|
-
* backend: 'tesseract',
|
|
164
|
-
* language: 'deu' // German
|
|
165
|
-
* },
|
|
166
|
-
* images: {
|
|
167
|
-
* extractImages: true,
|
|
168
|
-
* targetDpi: 200
|
|
169
|
-
* }
|
|
170
|
-
* });
|
|
171
|
-
* ```
|
|
172
|
-
*
|
|
173
|
-
* @example Extract from File
|
|
174
|
-
* ```typescript
|
|
175
|
-
* const file = inputEvent.target.files[0];
|
|
176
|
-
* const bytes = await fileToUint8Array(file);
|
|
177
|
-
* const result = await extractBytes(bytes, file.type);
|
|
178
|
-
* ```
|
|
179
|
-
*/
|
|
180
|
-
declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfig | null): Promise<ExtractionResult>;
|
|
181
|
-
/**
|
|
182
|
-
* Extract content from a file on the file system
|
|
183
|
-
*
|
|
184
|
-
* Node.js and Deno specific function that reads a file from the file system
|
|
185
|
-
* and extracts content from it. Automatically detects MIME type if not provided.
|
|
186
|
-
*
|
|
187
|
-
* @param path - Path to the file to extract from
|
|
188
|
-
* @param mimeType - Optional MIME type of the file. If not provided, will attempt to detect
|
|
189
|
-
* @param config - Optional extraction configuration
|
|
190
|
-
* @returns Promise resolving to the extraction result
|
|
191
|
-
* @throws {Error} If WASM module is not initialized, file doesn't exist, or extraction fails
|
|
192
|
-
*
|
|
193
|
-
* @example Extract with auto-detection
|
|
194
|
-
* ```typescript
|
|
195
|
-
* const result = await extractFile('./document.pdf');
|
|
196
|
-
* console.log(result.content);
|
|
197
|
-
* ```
|
|
198
|
-
*
|
|
199
|
-
* @example Extract with explicit MIME type
|
|
200
|
-
* ```typescript
|
|
201
|
-
* const result = await extractFile('./document.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document');
|
|
202
|
-
* ```
|
|
203
|
-
*
|
|
204
|
-
* @example Extract from Node.js with config
|
|
205
|
-
* ```typescript
|
|
206
|
-
* import { extractFile } from '@kreuzberg/wasm';
|
|
207
|
-
* import { readFile } from 'fs/promises';
|
|
208
|
-
*
|
|
209
|
-
* const result = await extractFile('./report.xlsx', null, {
|
|
210
|
-
* chunking: {
|
|
211
|
-
* maxChars: 1000
|
|
212
|
-
* }
|
|
213
|
-
* });
|
|
214
|
-
* ```
|
|
215
|
-
*/
|
|
216
|
-
declare function extractFile(path: string, mimeType?: string | null, config?: ExtractionConfig | null): Promise<ExtractionResult>;
|
|
217
|
-
/**
|
|
218
|
-
* Extract content from a File or Blob (browser-friendly wrapper)
|
|
219
|
-
*
|
|
220
|
-
* Convenience function that wraps fileToUint8Array and extractBytes,
|
|
221
|
-
* providing a streamlined API for browser applications handling file inputs.
|
|
222
|
-
*
|
|
223
|
-
* @param file - The File or Blob to extract from
|
|
224
|
-
* @param mimeType - Optional MIME type. If not provided, uses file.type if available
|
|
225
|
-
* @param config - Optional extraction configuration
|
|
226
|
-
* @returns Promise resolving to the extraction result
|
|
227
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
228
|
-
*
|
|
229
|
-
* @example Simple file extraction
|
|
230
|
-
* ```typescript
|
|
231
|
-
* const fileInput = document.getElementById('file');
|
|
232
|
-
* fileInput.addEventListener('change', async (e) => {
|
|
233
|
-
* const file = e.target.files?.[0];
|
|
234
|
-
* if (file) {
|
|
235
|
-
* const result = await extractFromFile(file);
|
|
236
|
-
* console.log(result.content);
|
|
237
|
-
* }
|
|
238
|
-
* });
|
|
239
|
-
* ```
|
|
240
|
-
*
|
|
241
|
-
* @example With configuration
|
|
242
|
-
* ```typescript
|
|
243
|
-
* const result = await extractFromFile(file, file.type, {
|
|
244
|
-
* chunking: { maxChars: 1000 },
|
|
245
|
-
* images: { extractImages: true }
|
|
246
|
-
* });
|
|
247
|
-
* ```
|
|
248
|
-
*/
|
|
249
|
-
declare function extractFromFile(file: File | Blob, mimeType?: string | null, config?: ExtractionConfig | null): Promise<ExtractionResult>;
|
|
250
|
-
/**
|
|
251
|
-
* Extract content from bytes synchronously
|
|
252
|
-
*
|
|
253
|
-
* Synchronous version of extractBytes. Performs extraction without async operations.
|
|
254
|
-
* Note: Some extraction features may still be async internally, but the wrapper is synchronous.
|
|
255
|
-
*
|
|
256
|
-
* @param data - The document bytes to extract from
|
|
257
|
-
* @param mimeType - MIME type of the document
|
|
258
|
-
* @param config - Optional extraction configuration
|
|
259
|
-
* @returns The extraction result
|
|
260
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
261
|
-
*
|
|
262
|
-
* @example
|
|
263
|
-
* ```typescript
|
|
264
|
-
* const bytes = new Uint8Array(buffer);
|
|
265
|
-
* const result = extractBytesSync(bytes, 'application/pdf');
|
|
266
|
-
* console.log(result.content);
|
|
267
|
-
* ```
|
|
268
|
-
*/
|
|
269
|
-
declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfig | null): ExtractionResult;
|
|
270
|
-
/**
|
|
271
|
-
* Batch extract content from multiple byte arrays asynchronously
|
|
272
|
-
*
|
|
273
|
-
* Extracts content from multiple documents in a single batch operation,
|
|
274
|
-
* allowing for more efficient processing of multiple files.
|
|
275
|
-
*
|
|
276
|
-
* @param files - Array of objects containing data (Uint8Array) and mimeType (string)
|
|
277
|
-
* @param config - Optional extraction configuration applied to all files
|
|
278
|
-
* @returns Promise resolving to array of extraction results
|
|
279
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
280
|
-
*
|
|
281
|
-
* @example
|
|
282
|
-
* ```typescript
|
|
283
|
-
* const files = [
|
|
284
|
-
* { data: pdfBytes, mimeType: 'application/pdf' },
|
|
285
|
-
* { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
|
|
286
|
-
* ];
|
|
287
|
-
* const results = await batchExtractBytes(files);
|
|
288
|
-
* results.forEach((result) => console.log(result.content));
|
|
289
|
-
* ```
|
|
290
|
-
*/
|
|
291
|
-
declare function batchExtractBytes(files: Array<{
|
|
292
|
-
data: Uint8Array;
|
|
293
|
-
mimeType: string;
|
|
294
|
-
}>, config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
|
|
295
|
-
/**
|
|
296
|
-
* Batch extract content from multiple byte arrays synchronously
|
|
297
|
-
*
|
|
298
|
-
* Synchronous version of batchExtractBytes. Extracts content from multiple documents
|
|
299
|
-
* in a single batch operation without async operations.
|
|
300
|
-
*
|
|
301
|
-
* @param files - Array of objects containing data (Uint8Array) and mimeType (string)
|
|
302
|
-
* @param config - Optional extraction configuration applied to all files
|
|
303
|
-
* @returns Array of extraction results
|
|
304
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
305
|
-
*
|
|
306
|
-
* @example
|
|
307
|
-
* ```typescript
|
|
308
|
-
* const files = [
|
|
309
|
-
* { data: pdfBytes, mimeType: 'application/pdf' },
|
|
310
|
-
* { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
|
|
311
|
-
* ];
|
|
312
|
-
* const results = batchExtractBytesSync(files);
|
|
313
|
-
* results.forEach((result) => console.log(result.content));
|
|
314
|
-
* ```
|
|
315
|
-
*/
|
|
316
|
-
declare function batchExtractBytesSync(files: Array<{
|
|
317
|
-
data: Uint8Array;
|
|
318
|
-
mimeType: string;
|
|
319
|
-
}>, config?: ExtractionConfig | null): ExtractionResult[];
|
|
320
|
-
/**
|
|
321
|
-
* Batch extract content from multiple File objects asynchronously
|
|
322
|
-
*
|
|
323
|
-
* Convenience function that converts File objects to Uint8Array and calls batchExtractBytes.
|
|
324
|
-
* Automatically uses the file.type as MIME type if available.
|
|
325
|
-
*
|
|
326
|
-
* @param files - Array of File objects to extract from
|
|
327
|
-
* @param config - Optional extraction configuration applied to all files
|
|
328
|
-
* @returns Promise resolving to array of extraction results
|
|
329
|
-
* @throws {Error} If WASM module is not initialized, files cannot be read, or extraction fails
|
|
330
|
-
*
|
|
331
|
-
* @example
|
|
332
|
-
* ```typescript
|
|
333
|
-
* const fileInput = document.getElementById('files');
|
|
334
|
-
* const files = Array.from(fileInput.files ?? []);
|
|
335
|
-
* const results = await batchExtractFiles(files);
|
|
336
|
-
* results.forEach((result, index) => {
|
|
337
|
-
* console.log(`File ${index}: ${result.content.substring(0, 50)}...`);
|
|
338
|
-
* });
|
|
339
|
-
* ```
|
|
340
|
-
*/
|
|
341
|
-
declare function batchExtractFiles(files: File[], config?: ExtractionConfig | null): Promise<ExtractionResult[]>;
|
|
342
|
-
/**
|
|
343
|
-
* Enable OCR functionality with tesseract-wasm backend
|
|
344
|
-
*
|
|
345
|
-
* Convenience function that automatically initializes and registers the Tesseract WASM backend.
|
|
346
|
-
* This is the recommended approach for enabling OCR in WASM-based applications.
|
|
347
|
-
*
|
|
348
|
-
* ## Browser Requirement
|
|
349
|
-
*
|
|
350
|
-
* This function requires a browser environment with support for:
|
|
351
|
-
* - WebWorkers (for Tesseract processing)
|
|
352
|
-
* - createImageBitmap (for image conversion)
|
|
353
|
-
* - Blob API
|
|
354
|
-
*
|
|
355
|
-
* ## Network Requirement
|
|
356
|
-
*
|
|
357
|
-
* Training data will be loaded from jsDelivr CDN on first use of each language.
|
|
358
|
-
* Ensure network access to cdn.jsdelivr.net is available.
|
|
359
|
-
*
|
|
360
|
-
* @throws {Error} If not in browser environment or tesseract-wasm is not available
|
|
361
|
-
*
|
|
362
|
-
* @example Basic Usage
|
|
363
|
-
* ```typescript
|
|
364
|
-
* import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
365
|
-
*
|
|
366
|
-
* async function main() {
|
|
367
|
-
* // Initialize WASM module
|
|
368
|
-
* await initWasm();
|
|
369
|
-
*
|
|
370
|
-
* // Enable OCR with tesseract-wasm
|
|
371
|
-
* await enableOcr();
|
|
372
|
-
*
|
|
373
|
-
* // Now you can use OCR in extraction
|
|
374
|
-
* const imageBytes = new Uint8Array(buffer);
|
|
375
|
-
* const result = await extractBytes(imageBytes, 'image/png', {
|
|
376
|
-
* ocr: { backend: 'tesseract-wasm', language: 'eng' }
|
|
377
|
-
* });
|
|
378
|
-
*
|
|
379
|
-
* console.log(result.content); // Extracted text
|
|
380
|
-
* }
|
|
381
|
-
*
|
|
382
|
-
* main().catch(console.error);
|
|
383
|
-
* ```
|
|
384
|
-
*
|
|
385
|
-
* @example With Progress Tracking
|
|
386
|
-
* ```typescript
|
|
387
|
-
* import { enableOcr, TesseractWasmBackend } from '@kreuzberg/wasm';
|
|
388
|
-
*
|
|
389
|
-
* async function setupOcrWithProgress() {
|
|
390
|
-
* const backend = new TesseractWasmBackend();
|
|
391
|
-
* backend.setProgressCallback((progress) => {
|
|
392
|
-
* console.log(`OCR Progress: ${progress}%`);
|
|
393
|
-
* updateProgressBar(progress);
|
|
394
|
-
* });
|
|
395
|
-
*
|
|
396
|
-
* await backend.initialize();
|
|
397
|
-
* registerOcrBackend(backend);
|
|
398
|
-
* }
|
|
399
|
-
*
|
|
400
|
-
* setupOcrWithProgress().catch(console.error);
|
|
401
|
-
* ```
|
|
402
|
-
*
|
|
403
|
-
* @example Multiple Languages
|
|
404
|
-
* ```typescript
|
|
405
|
-
* import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
406
|
-
*
|
|
407
|
-
* await initWasm();
|
|
408
|
-
* await enableOcr();
|
|
409
|
-
*
|
|
410
|
-
* // Extract English text
|
|
411
|
-
* const englishResult = await extractBytes(engImageBytes, 'image/png', {
|
|
412
|
-
* ocr: { backend: 'tesseract-wasm', language: 'eng' }
|
|
413
|
-
* });
|
|
414
|
-
*
|
|
415
|
-
* // Extract German text - model is cached after first use
|
|
416
|
-
* const germanResult = await extractBytes(deImageBytes, 'image/png', {
|
|
417
|
-
* ocr: { backend: 'tesseract-wasm', language: 'deu' }
|
|
418
|
-
* });
|
|
419
|
-
* ```
|
|
420
|
-
*/
|
|
421
|
-
declare function enableOcr(): Promise<void>;
|
|
422
|
-
|
|
423
|
-
export { batchExtractBytes, batchExtractBytesSync, batchExtractFiles, enableOcr, extractBytes, extractBytesSync, extractFile, extractFromFile, getInitializationError, getVersion, initWasm, isInitialized };
|
package/dist/ocr/registry.cjs
DELETED
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __defProp = Object.defineProperty;
|
|
3
|
-
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
-
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
-
var __export = (target, all) => {
|
|
7
|
-
for (var name in all)
|
|
8
|
-
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
-
};
|
|
10
|
-
var __copyProps = (to, from, except, desc) => {
|
|
11
|
-
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
-
for (let key of __getOwnPropNames(from))
|
|
13
|
-
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
-
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
-
}
|
|
16
|
-
return to;
|
|
17
|
-
};
|
|
18
|
-
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
-
|
|
20
|
-
// typescript/ocr/registry.ts
|
|
21
|
-
var registry_exports = {};
|
|
22
|
-
__export(registry_exports, {
|
|
23
|
-
clearOcrBackends: () => clearOcrBackends,
|
|
24
|
-
getOcrBackend: () => getOcrBackend,
|
|
25
|
-
listOcrBackends: () => listOcrBackends,
|
|
26
|
-
registerOcrBackend: () => registerOcrBackend,
|
|
27
|
-
unregisterOcrBackend: () => unregisterOcrBackend
|
|
28
|
-
});
|
|
29
|
-
module.exports = __toCommonJS(registry_exports);
|
|
30
|
-
var ocrBackendRegistry = /* @__PURE__ */ new Map();
|
|
31
|
-
function registerOcrBackend(backend) {
|
|
32
|
-
if (!backend) {
|
|
33
|
-
throw new Error("Backend cannot be null or undefined");
|
|
34
|
-
}
|
|
35
|
-
if (typeof backend.name !== "function") {
|
|
36
|
-
throw new Error("Backend must implement name() method");
|
|
37
|
-
}
|
|
38
|
-
if (typeof backend.supportedLanguages !== "function") {
|
|
39
|
-
throw new Error("Backend must implement supportedLanguages() method");
|
|
40
|
-
}
|
|
41
|
-
if (typeof backend.processImage !== "function") {
|
|
42
|
-
throw new Error("Backend must implement processImage() method");
|
|
43
|
-
}
|
|
44
|
-
const backendName = backend.name();
|
|
45
|
-
if (!backendName || typeof backendName !== "string") {
|
|
46
|
-
throw new Error("Backend name must be a non-empty string");
|
|
47
|
-
}
|
|
48
|
-
if (ocrBackendRegistry.has(backendName)) {
|
|
49
|
-
console.warn(`OCR backend "${backendName}" is already registered and will be replaced`);
|
|
50
|
-
}
|
|
51
|
-
ocrBackendRegistry.set(backendName, backend);
|
|
52
|
-
}
|
|
53
|
-
function getOcrBackend(name) {
|
|
54
|
-
return ocrBackendRegistry.get(name);
|
|
55
|
-
}
|
|
56
|
-
function listOcrBackends() {
|
|
57
|
-
return Array.from(ocrBackendRegistry.keys());
|
|
58
|
-
}
|
|
59
|
-
async function unregisterOcrBackend(name) {
|
|
60
|
-
const backend = ocrBackendRegistry.get(name);
|
|
61
|
-
if (!backend) {
|
|
62
|
-
throw new Error(
|
|
63
|
-
`OCR backend "${name}" is not registered. Available backends: ${Array.from(ocrBackendRegistry.keys()).join(", ")}`
|
|
64
|
-
);
|
|
65
|
-
}
|
|
66
|
-
if (typeof backend.shutdown === "function") {
|
|
67
|
-
try {
|
|
68
|
-
await backend.shutdown();
|
|
69
|
-
} catch (error) {
|
|
70
|
-
console.warn(
|
|
71
|
-
`Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
|
|
72
|
-
);
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
ocrBackendRegistry.delete(name);
|
|
76
|
-
}
|
|
77
|
-
async function clearOcrBackends() {
|
|
78
|
-
const backends = Array.from(ocrBackendRegistry.entries());
|
|
79
|
-
for (const [name, backend] of backends) {
|
|
80
|
-
if (typeof backend.shutdown === "function") {
|
|
81
|
-
try {
|
|
82
|
-
await backend.shutdown();
|
|
83
|
-
} catch (error) {
|
|
84
|
-
console.warn(
|
|
85
|
-
`Error shutting down OCR backend "${name}": ${error instanceof Error ? error.message : String(error)}`
|
|
86
|
-
);
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
}
|
|
90
|
-
ocrBackendRegistry.clear();
|
|
91
|
-
}
|
|
92
|
-
//# sourceMappingURL=registry.cjs.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../../typescript/ocr/registry.ts"],"sourcesContent":["/**\n * OCR Backend Registry\n *\n * Provides a registry for OCR backends in the WASM environment.\n * This enables auto-registration and management of OCR backends.\n *\n * Note: The WASM package provides a lightweight registry in the browser.\n * For more advanced features like Rust integration, use @kreuzberg/node or @kreuzberg/deno.\n *\n * @example\n * ```typescript\n * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';\n * import { enableOcr } from '@kreuzberg/wasm';\n *\n * // Simple auto-registration\n * await enableOcr();\n * ```\n */\n\nimport type { OcrBackendProtocol } from \"../types.js\";\n\n/** Global registry of OCR backends */\nconst ocrBackendRegistry = new Map<string, OcrBackendProtocol>();\n\n/**\n * Register an OCR backend\n *\n * Registers an OCR backend with the WASM extraction pipeline.\n * If a backend with the same name is already registered, it will be replaced.\n *\n * @param backend - OCR backend implementing OcrBackendProtocol\n * @throws {Error} If backend validation fails\n *\n * @example\n * ```typescript\n * import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';\n * import { registerOcrBackend } from '@kreuzberg/wasm/ocr/registry';\n *\n * const backend = new TesseractWasmBackend();\n * await backend.initialize();\n * registerOcrBackend(backend);\n * ```\n */\nexport function registerOcrBackend(backend: OcrBackendProtocol): void {\n\t// Validate backend\n\tif (!backend) {\n\t\tthrow new Error(\"Backend cannot be null or undefined\");\n\t}\n\n\tif (typeof backend.name !== \"function\") {\n\t\tthrow new Error(\"Backend must implement name() method\");\n\t}\n\n\tif (typeof backend.supportedLanguages !== \"function\") {\n\t\tthrow new Error(\"Backend must implement supportedLanguages() method\");\n\t}\n\n\tif (typeof backend.processImage !== \"function\") {\n\t\tthrow new Error(\"Backend must 
implement processImage() method\");\n\t}\n\n\tconst backendName = backend.name();\n\n\tif (!backendName || typeof backendName !== \"string\") {\n\t\tthrow new Error(\"Backend name must be a non-empty string\");\n\t}\n\n\t// Check for duplicate registration (allow overwriting with warning)\n\tif (ocrBackendRegistry.has(backendName)) {\n\t\tconsole.warn(`OCR backend \"${backendName}\" is already registered and will be replaced`);\n\t}\n\n\t// Register the backend\n\tocrBackendRegistry.set(backendName, backend);\n}\n\n/**\n * Get a registered OCR backend by name\n *\n * @param name - Backend name\n * @returns The OCR backend or undefined if not found\n *\n * @example\n * ```typescript\n * import { getOcrBackend } from '@kreuzberg/wasm/ocr/registry';\n *\n * const backend = getOcrBackend('tesseract-wasm');\n * if (backend) {\n * console.log('Available languages:', backend.supportedLanguages());\n * }\n * ```\n */\nexport function getOcrBackend(name: string): OcrBackendProtocol | undefined {\n\treturn ocrBackendRegistry.get(name);\n}\n\n/**\n * List all registered OCR backends\n *\n * @returns Array of registered backend names\n *\n * @example\n * ```typescript\n * import { listOcrBackends } from '@kreuzberg/wasm/ocr/registry';\n *\n * const backends = listOcrBackends();\n * console.log('Available OCR backends:', backends);\n * ```\n */\nexport function listOcrBackends(): string[] {\n\treturn Array.from(ocrBackendRegistry.keys());\n}\n\n/**\n * Unregister an OCR backend\n *\n * @param name - Backend name to unregister\n * @throws {Error} If backend is not found\n *\n * @example\n * ```typescript\n * import { unregisterOcrBackend } from '@kreuzberg/wasm/ocr/registry';\n *\n * unregisterOcrBackend('tesseract-wasm');\n * ```\n */\nexport async function unregisterOcrBackend(name: string): Promise<void> {\n\tconst backend = ocrBackendRegistry.get(name);\n\n\tif (!backend) {\n\t\tthrow new Error(\n\t\t\t`OCR backend \"${name}\" is not registered. 
Available backends: ${Array.from(ocrBackendRegistry.keys()).join(\", \")}`,\n\t\t);\n\t}\n\n\t// Call shutdown if available\n\tif (typeof backend.shutdown === \"function\") {\n\t\ttry {\n\t\t\tawait backend.shutdown();\n\t\t} catch (error) {\n\t\t\tconsole.warn(\n\t\t\t\t`Error shutting down OCR backend \"${name}\": ${error instanceof Error ? error.message : String(error)}`,\n\t\t\t);\n\t\t}\n\t}\n\n\tocrBackendRegistry.delete(name);\n}\n\n/**\n * Clear all registered OCR backends\n *\n * Unregisters all OCR backends and calls their shutdown methods.\n *\n * @example\n * ```typescript\n * import { clearOcrBackends } from '@kreuzberg/wasm/ocr/registry';\n *\n * // Clean up all backends when shutting down\n * await clearOcrBackends();\n * ```\n */\nexport async function clearOcrBackends(): Promise<void> {\n\tconst backends = Array.from(ocrBackendRegistry.entries());\n\n\tfor (const [name, backend] of backends) {\n\t\tif (typeof backend.shutdown === \"function\") {\n\t\t\ttry {\n\t\t\t\tawait backend.shutdown();\n\t\t\t} catch (error) {\n\t\t\t\tconsole.warn(\n\t\t\t\t\t`Error shutting down OCR backend \"${name}\": ${error instanceof Error ? 
error.message : String(error)}`,\n\t\t\t\t);\n\t\t\t}\n\t\t}\n\t}\n\n\tocrBackendRegistry.clear();\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAsBA,IAAM,qBAAqB,oBAAI,IAAgC;AAqBxD,SAAS,mBAAmB,SAAmC;AAErE,MAAI,CAAC,SAAS;AACb,UAAM,IAAI,MAAM,qCAAqC;AAAA,EACtD;AAEA,MAAI,OAAO,QAAQ,SAAS,YAAY;AACvC,UAAM,IAAI,MAAM,sCAAsC;AAAA,EACvD;AAEA,MAAI,OAAO,QAAQ,uBAAuB,YAAY;AACrD,UAAM,IAAI,MAAM,oDAAoD;AAAA,EACrE;AAEA,MAAI,OAAO,QAAQ,iBAAiB,YAAY;AAC/C,UAAM,IAAI,MAAM,8CAA8C;AAAA,EAC/D;AAEA,QAAM,cAAc,QAAQ,KAAK;AAEjC,MAAI,CAAC,eAAe,OAAO,gBAAgB,UAAU;AACpD,UAAM,IAAI,MAAM,yCAAyC;AAAA,EAC1D;AAGA,MAAI,mBAAmB,IAAI,WAAW,GAAG;AACxC,YAAQ,KAAK,gBAAgB,WAAW,8CAA8C;AAAA,EACvF;AAGA,qBAAmB,IAAI,aAAa,OAAO;AAC5C;AAkBO,SAAS,cAAc,MAA8C;AAC3E,SAAO,mBAAmB,IAAI,IAAI;AACnC;AAeO,SAAS,kBAA4B;AAC3C,SAAO,MAAM,KAAK,mBAAmB,KAAK,CAAC;AAC5C;AAeA,eAAsB,qBAAqB,MAA6B;AACvE,QAAM,UAAU,mBAAmB,IAAI,IAAI;AAE3C,MAAI,CAAC,SAAS;AACb,UAAM,IAAI;AAAA,MACT,gBAAgB,IAAI,4CAA4C,MAAM,KAAK,mBAAmB,KAAK,CAAC,EAAE,KAAK,IAAI,CAAC;AAAA,IACjH;AAAA,EACD;AAGA,MAAI,OAAO,QAAQ,aAAa,YAAY;AAC3C,QAAI;AACH,YAAM,QAAQ,SAAS;AAAA,IACxB,SAAS,OAAO;AACf,cAAQ;AAAA,QACP,oCAAoC,IAAI,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AAAA,MACrG;AAAA,IACD;AAAA,EACD;AAEA,qBAAmB,OAAO,IAAI;AAC/B;AAeA,eAAsB,mBAAkC;AACvD,QAAM,WAAW,MAAM,KAAK,mBAAmB,QAAQ,CAAC;AAExD,aAAW,CAAC,MAAM,OAAO,KAAK,UAAU;AACvC,QAAI,OAAO,QAAQ,aAAa,YAAY;AAC3C,UAAI;AACH,cAAM,QAAQ,SAAS;AAAA,MACxB,SAAS,OAAO;AACf,gBAAQ;AAAA,UACP,oCAAoC,IAAI,MAAM,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC;AAAA,QACrG;AAAA,MACD;AAAA,IACD;AAAA,EACD;AAEA,qBAAmB,MAAM;AAC1B;","names":[]}
|
package/dist/ocr/registry.d.cts
DELETED
|
@@ -1,102 +0,0 @@
|
|
|
1
|
-
import { O as OcrBackendProtocol } from '../types-CKjcIYcX.cjs';
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* OCR Backend Registry
|
|
5
|
-
*
|
|
6
|
-
* Provides a registry for OCR backends in the WASM environment.
|
|
7
|
-
* This enables auto-registration and management of OCR backends.
|
|
8
|
-
*
|
|
9
|
-
* Note: The WASM package provides a lightweight registry in the browser.
|
|
10
|
-
* For more advanced features like Rust integration, use @kreuzberg/node or @kreuzberg/deno.
|
|
11
|
-
*
|
|
12
|
-
* @example
|
|
13
|
-
* ```typescript
|
|
14
|
-
* import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';
|
|
15
|
-
* import { enableOcr } from '@kreuzberg/wasm';
|
|
16
|
-
*
|
|
17
|
-
* // Simple auto-registration
|
|
18
|
-
* await enableOcr();
|
|
19
|
-
* ```
|
|
20
|
-
*/
|
|
21
|
-
|
|
22
|
-
/**
|
|
23
|
-
* Register an OCR backend
|
|
24
|
-
*
|
|
25
|
-
* Registers an OCR backend with the WASM extraction pipeline.
|
|
26
|
-
* If a backend with the same name is already registered, it will be replaced.
|
|
27
|
-
*
|
|
28
|
-
* @param backend - OCR backend implementing OcrBackendProtocol
|
|
29
|
-
* @throws {Error} If backend validation fails
|
|
30
|
-
*
|
|
31
|
-
* @example
|
|
32
|
-
* ```typescript
|
|
33
|
-
* import { TesseractWasmBackend } from '@kreuzberg/wasm/ocr/tesseract-wasm-backend';
|
|
34
|
-
* import { registerOcrBackend } from '@kreuzberg/wasm/ocr/registry';
|
|
35
|
-
*
|
|
36
|
-
* const backend = new TesseractWasmBackend();
|
|
37
|
-
* await backend.initialize();
|
|
38
|
-
* registerOcrBackend(backend);
|
|
39
|
-
* ```
|
|
40
|
-
*/
|
|
41
|
-
declare function registerOcrBackend(backend: OcrBackendProtocol): void;
|
|
42
|
-
/**
|
|
43
|
-
* Get a registered OCR backend by name
|
|
44
|
-
*
|
|
45
|
-
* @param name - Backend name
|
|
46
|
-
* @returns The OCR backend or undefined if not found
|
|
47
|
-
*
|
|
48
|
-
* @example
|
|
49
|
-
* ```typescript
|
|
50
|
-
* import { getOcrBackend } from '@kreuzberg/wasm/ocr/registry';
|
|
51
|
-
*
|
|
52
|
-
* const backend = getOcrBackend('tesseract-wasm');
|
|
53
|
-
* if (backend) {
|
|
54
|
-
* console.log('Available languages:', backend.supportedLanguages());
|
|
55
|
-
* }
|
|
56
|
-
* ```
|
|
57
|
-
*/
|
|
58
|
-
declare function getOcrBackend(name: string): OcrBackendProtocol | undefined;
|
|
59
|
-
/**
|
|
60
|
-
* List all registered OCR backends
|
|
61
|
-
*
|
|
62
|
-
* @returns Array of registered backend names
|
|
63
|
-
*
|
|
64
|
-
* @example
|
|
65
|
-
* ```typescript
|
|
66
|
-
* import { listOcrBackends } from '@kreuzberg/wasm/ocr/registry';
|
|
67
|
-
*
|
|
68
|
-
* const backends = listOcrBackends();
|
|
69
|
-
* console.log('Available OCR backends:', backends);
|
|
70
|
-
* ```
|
|
71
|
-
*/
|
|
72
|
-
declare function listOcrBackends(): string[];
|
|
73
|
-
/**
|
|
74
|
-
* Unregister an OCR backend, awaiting its shutdown method first if it defines one
|
|
75
|
-
*
|
|
76
|
-
* @param name - Backend name to unregister
|
|
77
|
-
* @throws {Error} If backend is not found
|
|
78
|
-
*
|
|
79
|
-
* @example
|
|
80
|
-
* ```typescript
|
|
81
|
-
* import { unregisterOcrBackend } from '@kreuzberg/wasm/ocr/registry';
|
|
82
|
-
*
|
|
83
|
-
* await unregisterOcrBackend('tesseract-wasm');
|
|
84
|
-
* ```
|
|
85
|
-
*/
|
|
86
|
-
declare function unregisterOcrBackend(name: string): Promise<void>;
|
|
87
|
-
/**
|
|
88
|
-
* Clear all registered OCR backends
|
|
89
|
-
*
|
|
90
|
-
* Unregisters all OCR backends and calls their shutdown methods.
|
|
91
|
-
*
|
|
92
|
-
* @example
|
|
93
|
-
* ```typescript
|
|
94
|
-
* import { clearOcrBackends } from '@kreuzberg/wasm/ocr/registry';
|
|
95
|
-
*
|
|
96
|
-
* // Clean up all backends when shutting down
|
|
97
|
-
* await clearOcrBackends();
|
|
98
|
-
* ```
|
|
99
|
-
*/
|
|
100
|
-
declare function clearOcrBackends(): Promise<void>;
|
|
101
|
-
|
|
102
|
-
export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend };
|