@kreuzberg/wasm 4.3.7 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +40 -8
- package/dist/adapters/wasm-adapter.d.ts.map +1 -1
- package/dist/adapters/wasm-adapter.js +15 -7
- package/dist/adapters/wasm-adapter.js.map +1 -1
- package/dist/index.js +232 -45
- package/dist/index.js.map +1 -1
- package/dist/initialization/pdfium-loader.d.ts +22 -3
- package/dist/initialization/pdfium-loader.d.ts.map +1 -1
- package/dist/initialization/state.d.ts +4 -0
- package/dist/initialization/state.d.ts.map +1 -1
- package/dist/ocr/enabler.d.ts +15 -60
- package/dist/ocr/enabler.d.ts.map +1 -1
- package/dist/pdfium.esm.wasm +0 -0
- package/dist/pdfium.js +10 -73
- package/dist/pkg/README.md +40 -8
- package/dist/pkg/kreuzberg_wasm.d.ts +80 -76
- package/dist/pkg/kreuzberg_wasm.js +465 -285
- package/dist/pkg/kreuzberg_wasm_bg.js +377 -173
- package/dist/pkg/kreuzberg_wasm_bg.wasm +0 -0
- package/dist/pkg/kreuzberg_wasm_bg.wasm.d.ts +18 -9
- package/dist/types.d.ts +15 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +11 -11
- package/dist/pkg/package.json +0 -31
package/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -57,6 +57,8 @@
|
|
|
57
57
|
|
|
58
58
|
Extract text, tables, images, and metadata from 75+ file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Deno, and Cloudflare Workers with portable deployment and multi-threading support.
|
|
59
59
|
|
|
60
|
+
> **Full Feature Parity** — The WASM package supports all extraction capabilities at full parity with native bindings: PDF (via PDFium), Excel/spreadsheets (via Calamine), archives (ZIP, TAR, 7z, GZIP), and OCR (via built-in Tesseract-WASM). No external dependencies required.
|
|
61
|
+
|
|
60
62
|
|
|
61
63
|
## Installation
|
|
62
64
|
|
|
@@ -95,7 +97,7 @@ yarn add @kreuzberg/wasm
|
|
|
95
97
|
### System Requirements
|
|
96
98
|
|
|
97
99
|
- Modern browser with WebAssembly support, or Deno 1.0+, or Cloudflare Workers
|
|
98
|
-
-
|
|
100
|
+
- OCR is built-in via Tesseract-WASM (enable at runtime with `enableOcr()`)
|
|
99
101
|
|
|
100
102
|
|
|
101
103
|
|
|
@@ -174,6 +176,40 @@ extractWithOcr().catch(console.error);
|
|
|
174
176
|
See [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
|
|
175
177
|
|
|
176
178
|
|
|
179
|
+
#### Excel/Spreadsheet Extraction
|
|
180
|
+
|
|
181
|
+
Extract structured data from Excel files directly in the browser or server-side runtimes:
|
|
182
|
+
|
|
183
|
+
```ts
|
|
184
|
+
import { extractBytes, initWasm } from "@kreuzberg/wasm";
|
|
185
|
+
|
|
186
|
+
async function extractSpreadsheet() {
|
|
187
|
+
await initWasm();
|
|
188
|
+
|
|
189
|
+
const bytes = new Uint8Array(
|
|
190
|
+
await fetch("report.xlsx").then((r) => r.arrayBuffer()),
|
|
191
|
+
);
|
|
192
|
+
|
|
193
|
+
const result = await extractBytes(
|
|
194
|
+
bytes,
|
|
195
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
196
|
+
);
|
|
197
|
+
|
|
198
|
+
console.log("Spreadsheet content:");
|
|
199
|
+
console.log(result.content);
|
|
200
|
+
|
|
201
|
+
if (result.tables && result.tables.length > 0) {
|
|
202
|
+
result.tables.forEach((table, index) => {
|
|
203
|
+
console.log(`\nSheet ${index + 1}:`);
|
|
204
|
+
console.log(table.markdown);
|
|
205
|
+
});
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
extractSpreadsheet().catch(console.error);
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
|
|
177
213
|
|
|
178
214
|
#### Processing Multiple Files
|
|
179
215
|
|
|
@@ -318,14 +354,10 @@ extractDocuments(fileBytes, mimes)
|
|
|
318
354
|
- **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
|
|
319
355
|
- **Table Extraction** - Parse tables with structure and cell content preservation
|
|
320
356
|
- **Image Extraction** - Extract embedded images and render page previews
|
|
321
|
-
- **OCR Support** -
|
|
322
|
-
|
|
357
|
+
- **OCR Support** - Built-in Tesseract-WASM for scanned documents and images
|
|
358
|
+
- **Full Feature Parity** - All extraction capabilities at parity with native bindings: PDF, Excel, archives, OCR, and 75+ formats
|
|
323
359
|
- **Async/Await** - Non-blocking document processing with concurrent operations
|
|
324
|
-
|
|
325
|
-
|
|
326
360
|
- **Plugin System** - Extensible post-processing for custom text transformation
|
|
327
|
-
|
|
328
|
-
|
|
329
361
|
- **Batch Processing** - Efficiently process multiple documents in parallel
|
|
330
362
|
- **Memory Efficient** - Stream large files without loading entirely into memory
|
|
331
363
|
- **Language Detection** - Detect and support multiple languages in documents
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"wasm-adapter.d.ts","sourceRoot":"","sources":["../../typescript/adapters/wasm-adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAEH,OAAO,KAAK,EAMX,gBAAgB,EAChB,gBAAgB,
|
|
1
|
+
{"version":3,"file":"wasm-adapter.d.ts","sourceRoot":"","sources":["../../typescript/adapters/wasm-adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAEH,OAAO,KAAK,EAMX,gBAAgB,EAChB,gBAAgB,EAOhB,MAAM,aAAa,CAAC;AAoCrB;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAsB,gBAAgB,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI,GAAG,OAAO,CAAC,UAAU,CAAC,CAa7E;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,gBAAgB,GAAG,IAAI,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAsCnF;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,OAAO,GAAG,gBAAgB,CAwOvE;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,GAAG,KAAK,CASpE;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,IAAI,gBAAgB,CAajF"}
|
|
@@ -26,7 +26,7 @@ function configToJS(config) {
|
|
|
26
26
|
if (!config) {
|
|
27
27
|
return {};
|
|
28
28
|
}
|
|
29
|
-
const
|
|
29
|
+
const toSnakeCase = (str) => str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
30
30
|
const normalizeValue = (value) => {
|
|
31
31
|
if (value === null || value === void 0) {
|
|
32
32
|
return null;
|
|
@@ -40,17 +40,18 @@ function configToJS(config) {
|
|
|
40
40
|
for (const [key, val] of Object.entries(obj)) {
|
|
41
41
|
const normalizedVal = normalizeValue(val);
|
|
42
42
|
if (normalizedVal !== null && normalizedVal !== void 0) {
|
|
43
|
-
normalized2[key] = normalizedVal;
|
|
43
|
+
normalized2[toSnakeCase(key)] = normalizedVal;
|
|
44
44
|
}
|
|
45
45
|
}
|
|
46
46
|
return Object.keys(normalized2).length > 0 ? normalized2 : null;
|
|
47
47
|
}
|
|
48
48
|
return value;
|
|
49
49
|
};
|
|
50
|
+
const normalized = {};
|
|
50
51
|
for (const [key, value] of Object.entries(config)) {
|
|
51
52
|
const normalizedValue = normalizeValue(value);
|
|
52
53
|
if (normalizedValue !== null && normalizedValue !== void 0) {
|
|
53
|
-
normalized[key] = normalizedValue;
|
|
54
|
+
normalized[toSnakeCase(key)] = normalizedValue;
|
|
54
55
|
}
|
|
55
56
|
}
|
|
56
57
|
return normalized;
|
|
@@ -153,8 +154,13 @@ function jsToExtractionResult(jsValue) {
|
|
|
153
154
|
throw new Error("Invalid image structure");
|
|
154
155
|
}
|
|
155
156
|
const img = image;
|
|
156
|
-
|
|
157
|
-
|
|
157
|
+
let imageData;
|
|
158
|
+
if (img.data instanceof Uint8Array) {
|
|
159
|
+
imageData = img.data;
|
|
160
|
+
} else if (Array.isArray(img.data)) {
|
|
161
|
+
imageData = new Uint8Array(img.data);
|
|
162
|
+
} else {
|
|
163
|
+
throw new Error("Invalid image: data must be Uint8Array or array");
|
|
158
164
|
}
|
|
159
165
|
if (typeof img.format !== "string") {
|
|
160
166
|
throw new Error("Invalid image: missing format");
|
|
@@ -189,7 +195,7 @@ function jsToExtractionResult(jsValue) {
|
|
|
189
195
|
throw new Error("Invalid image: description must be a string or null");
|
|
190
196
|
}
|
|
191
197
|
return {
|
|
192
|
-
data:
|
|
198
|
+
data: imageData,
|
|
193
199
|
format: img.format,
|
|
194
200
|
imageIndex,
|
|
195
201
|
pageNumber: pageNumber ?? null,
|
|
@@ -217,6 +223,7 @@ function jsToExtractionResult(jsValue) {
|
|
|
217
223
|
const ocrElements = result.ocrElements ?? result.ocr_elements ?? null;
|
|
218
224
|
const document = result.document ?? null;
|
|
219
225
|
const pages = result.pages ?? null;
|
|
226
|
+
const annotations = result.annotations ?? null;
|
|
220
227
|
return {
|
|
221
228
|
content: result.content,
|
|
222
229
|
mimeType,
|
|
@@ -231,7 +238,8 @@ function jsToExtractionResult(jsValue) {
|
|
|
231
238
|
processingWarnings,
|
|
232
239
|
elements,
|
|
233
240
|
ocrElements,
|
|
234
|
-
document
|
|
241
|
+
document,
|
|
242
|
+
annotations
|
|
235
243
|
};
|
|
236
244
|
}
|
|
237
245
|
function wrapWasmError(error, context) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../typescript/adapters/wasm-adapter.ts"],"sourcesContent":["/**\n * WASM Type Adapter\n *\n * This module provides type adapters for converting between JavaScript/TypeScript\n * types and WASM-compatible types, handling File/Blob conversions, config normalization,\n * and result parsing.\n *\n * @example File Conversion\n * ```typescript\n * import { fileToUint8Array } from '@kreuzberg/wasm/adapters/wasm-adapter';\n *\n * const file = event.target.files[0];\n * const bytes = await fileToUint8Array(file);\n * const result = await extractBytes(bytes, file.type);\n * ```\n *\n * @example Config Normalization\n * ```typescript\n * import { configToJS } from '@kreuzberg/wasm/adapters/wasm-adapter';\n *\n * const config = {\n * ocr: { backend: 'tesseract', language: 'eng' },\n * chunking: { maxChars: 1000 }\n * };\n * const normalized = configToJS(config);\n * ```\n */\n\nimport type {\n\tChunk,\n\tDocumentStructure,\n\tElement,\n\tExtractedImage,\n\tExtractedKeyword,\n\tExtractionConfig,\n\tExtractionResult,\n\tMetadata,\n\tOcrElement,\n\tPageContent,\n\tProcessingWarning,\n\tTable,\n} from \"../types.js\";\n\n/**\n * Maximum file size for processing (512 MB)\n *\n * @internal\n */\nconst MAX_FILE_SIZE = 512 * 1024 * 1024;\n\n/**\n * Type predicate to validate numeric value or null\n *\n * @internal\n */\nfunction isNumberOrNull(value: unknown): value is number | null {\n\treturn typeof value === \"number\" || value === null || value === undefined;\n}\n\n/**\n * Type predicate to validate string value or null\n *\n * @internal\n */\nfunction isStringOrNull(value: unknown): value is string | null {\n\treturn typeof value === \"string\" || value === null || value === undefined;\n}\n\n/**\n * Type predicate to validate boolean value\n *\n * @internal\n */\nfunction isBoolean(value: unknown): value is boolean {\n\treturn typeof value === \"boolean\" || value === undefined;\n}\n\n/**\n * Convert a File or Blob to Uint8Array\n *\n * Handles both 
browser File API and server-side Blob-like objects,\n * providing a unified interface for reading binary data.\n *\n * @param file - The File or Blob to convert\n * @returns Promise resolving to the byte array\n * @throws {Error} If the file cannot be read or exceeds size limit\n *\n * @example\n * ```typescript\n * const file = document.getElementById('input').files[0];\n * const bytes = await fileToUint8Array(file);\n * const result = await extractBytes(bytes, 'application/pdf');\n * ```\n */\nexport async function fileToUint8Array(file: File | Blob): Promise<Uint8Array> {\n\ttry {\n\t\tif (file.size > MAX_FILE_SIZE) {\n\t\t\tthrow new Error(\n\t\t\t\t`File size (${file.size} bytes) exceeds maximum (${MAX_FILE_SIZE} bytes). Maximum file size is 512 MB.`,\n\t\t\t);\n\t\t}\n\n\t\tconst arrayBuffer = await file.arrayBuffer();\n\t\treturn new Uint8Array(arrayBuffer);\n\t} catch (error) {\n\t\tthrow new Error(`Failed to read file: ${error instanceof Error ? error.message : String(error)}`);\n\t}\n}\n\n/**\n * Normalize ExtractionConfig for WASM processing\n *\n * Converts TypeScript configuration objects to a WASM-compatible format,\n * handling null values, undefined properties, and nested structures.\n *\n * @param config - The extraction configuration or null\n * @returns Normalized configuration object suitable for WASM\n *\n * @example\n * ```typescript\n * const config: ExtractionConfig = {\n * ocr: { backend: 'tesseract' },\n * chunking: { maxChars: 1000 }\n * };\n * const wasmConfig = configToJS(config);\n * ```\n */\nexport function configToJS(config: ExtractionConfig | null): Record<string, unknown> {\n\tif (!config) {\n\t\treturn {};\n\t}\n\n\tconst normalized: Record<string, unknown> = {};\n\n\tconst normalizeValue = (value: unknown): unknown => {\n\t\tif (value === null || value === undefined) {\n\t\t\treturn null;\n\t\t}\n\t\tif (typeof value === \"object\") {\n\t\t\tif (Array.isArray(value)) {\n\t\t\t\treturn 
value.map(normalizeValue);\n\t\t\t}\n\t\t\tconst obj = value as Record<string, unknown>;\n\t\t\tconst normalized: Record<string, unknown> = {};\n\t\t\tfor (const [key, val] of Object.entries(obj)) {\n\t\t\t\tconst normalizedVal = normalizeValue(val);\n\t\t\t\tif (normalizedVal !== null && normalizedVal !== undefined) {\n\t\t\t\t\tnormalized[key] = normalizedVal;\n\t\t\t\t}\n\t\t\t}\n\t\t\treturn Object.keys(normalized).length > 0 ? normalized : null;\n\t\t}\n\t\treturn value;\n\t};\n\n\tfor (const [key, value] of Object.entries(config)) {\n\t\tconst normalizedValue = normalizeValue(value);\n\t\tif (normalizedValue !== null && normalizedValue !== undefined) {\n\t\t\tnormalized[key] = normalizedValue;\n\t\t}\n\t}\n\n\treturn normalized;\n}\n\n/**\n * Parse WASM extraction result and convert to TypeScript type\n *\n * Handles conversion of WASM-returned objects to proper ExtractionResult types,\n * including proper array conversions and type assertions for tables, chunks, and images.\n *\n * @param jsValue - The raw WASM result value\n * @returns Properly typed ExtractionResult\n * @throws {Error} If the result structure is invalid\n *\n * @example\n * ```typescript\n * const wasmResult = await wasmExtract(bytes, mimeType, config);\n * const result = jsToExtractionResult(wasmResult);\n * console.log(result.content);\n * ```\n */\nexport function jsToExtractionResult(jsValue: unknown): ExtractionResult {\n\tif (!jsValue || typeof jsValue !== \"object\") {\n\t\tthrow new Error(\"Invalid extraction result: value is not an object\");\n\t}\n\n\tconst result = jsValue as Record<string, unknown>;\n\tconst mimeType =\n\t\ttypeof result.mimeType === \"string\"\n\t\t\t? result.mimeType\n\t\t\t: typeof result.mime_type === \"string\"\n\t\t\t\t? 
result.mime_type\n\t\t\t\t: null;\n\n\tif (typeof result.content !== \"string\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid content\");\n\t}\n\tif (typeof mimeType !== \"string\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid mimeType\");\n\t}\n\tif (!result.metadata || typeof result.metadata !== \"object\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid metadata\");\n\t}\n\n\tconst tables: Table[] = [];\n\tif (Array.isArray(result.tables)) {\n\t\tfor (const table of result.tables) {\n\t\t\tif (table && typeof table === \"object\") {\n\t\t\t\tconst t = table as Record<string, unknown>;\n\t\t\t\tconst pageNumber =\n\t\t\t\t\ttypeof t.pageNumber === \"number\" ? t.pageNumber : typeof t.page_number === \"number\" ? t.page_number : null;\n\t\t\t\tif (\n\t\t\t\t\tArray.isArray(t.cells) &&\n\t\t\t\t\tt.cells.every((row) => Array.isArray(row) && row.every((cell) => typeof cell === \"string\")) &&\n\t\t\t\t\ttypeof t.markdown === \"string\" &&\n\t\t\t\t\tpageNumber !== null\n\t\t\t\t) {\n\t\t\t\t\ttables.push({\n\t\t\t\t\t\tcells: t.cells as string[][],\n\t\t\t\t\t\tmarkdown: t.markdown,\n\t\t\t\t\t\tpageNumber,\n\t\t\t\t\t});\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\tconst chunks: Chunk[] | null = Array.isArray(result.chunks)\n\t\t? 
result.chunks.map((chunk) => {\n\t\t\t\tif (!chunk || typeof chunk !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk structure\");\n\t\t\t\t}\n\t\t\t\tconst c = chunk as Record<string, unknown>;\n\t\t\t\tif (typeof c.content !== \"string\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk: missing content\");\n\t\t\t\t}\n\t\t\t\tif (!c.metadata || typeof c.metadata !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk: missing metadata\");\n\t\t\t\t}\n\t\t\t\tconst metadata = c.metadata as Record<string, unknown>;\n\n\t\t\t\tlet embedding: number[] | null = null;\n\t\t\t\tif (Array.isArray(c.embedding)) {\n\t\t\t\t\tif (!c.embedding.every((item) => typeof item === \"number\")) {\n\t\t\t\t\t\tthrow new Error(\"Invalid chunk: embedding must contain only numbers\");\n\t\t\t\t\t}\n\t\t\t\t\tembedding = c.embedding;\n\t\t\t\t}\n\n\t\t\t\t// Coerce numeric values - handle BigInt, strings, and numbers\n\t\t\t\tconst coerceToNumber = (value: unknown, fieldName: string): number => {\n\t\t\t\t\tif (typeof value === \"number\") {\n\t\t\t\t\t\treturn value;\n\t\t\t\t\t}\n\t\t\t\t\tif (typeof value === \"bigint\") {\n\t\t\t\t\t\treturn Number(value);\n\t\t\t\t\t}\n\t\t\t\t\tif (typeof value === \"string\") {\n\t\t\t\t\t\tconst parsed = parseInt(value, 10);\n\t\t\t\t\t\tif (Number.isNaN(parsed)) {\n\t\t\t\t\t\t\tthrow new Error(`Invalid chunk metadata: ${fieldName} must be a valid number, got \"${value}\"`);\n\t\t\t\t\t\t}\n\t\t\t\t\t\treturn parsed;\n\t\t\t\t\t}\n\t\t\t\t\tthrow new Error(`Invalid chunk metadata: ${fieldName} must be a number, got ${typeof value}`);\n\t\t\t\t};\n\n\t\t\t\t// The Rust code uses snake_case field names (byte_start, byte_end, etc)\n\t\t\t\t// but TypeScript expects camelCase (charStart, charEnd, etc)\n\t\t\t\t// For now, treat byte offsets as character offsets since the content is UTF-8\n\t\t\t\tconst charStart = coerceToNumber(\n\t\t\t\t\tmetadata.charStart ?? metadata.char_start ?? metadata.byteStart ?? 
metadata.byte_start,\n\t\t\t\t\t\"charStart\",\n\t\t\t\t);\n\t\t\t\tconst charEnd = coerceToNumber(\n\t\t\t\t\tmetadata.charEnd ?? metadata.char_end ?? metadata.byteEnd ?? metadata.byte_end,\n\t\t\t\t\t\"charEnd\",\n\t\t\t\t);\n\t\t\t\tconst chunkIndex = coerceToNumber(metadata.chunkIndex ?? metadata.chunk_index, \"chunkIndex\");\n\t\t\t\tconst totalChunks = coerceToNumber(metadata.totalChunks ?? metadata.total_chunks, \"totalChunks\");\n\n\t\t\t\tlet tokenCount: number | null = null;\n\t\t\t\tconst tokenCountValue = metadata.tokenCount ?? metadata.token_count;\n\t\t\t\tif (tokenCountValue !== null && tokenCountValue !== undefined) {\n\t\t\t\t\ttokenCount = coerceToNumber(tokenCountValue, \"tokenCount\");\n\t\t\t\t}\n\n\t\t\t\treturn {\n\t\t\t\t\tcontent: c.content,\n\t\t\t\t\tembedding,\n\t\t\t\t\tmetadata: {\n\t\t\t\t\t\tcharStart,\n\t\t\t\t\t\tcharEnd,\n\t\t\t\t\t\ttokenCount,\n\t\t\t\t\t\tchunkIndex,\n\t\t\t\t\t\ttotalChunks,\n\t\t\t\t\t},\n\t\t\t\t};\n\t\t\t})\n\t\t: null;\n\n\tconst images: ExtractedImage[] | null = Array.isArray(result.images)\n\t\t? result.images.map((image) => {\n\t\t\t\tif (!image || typeof image !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid image structure\");\n\t\t\t\t}\n\t\t\t\tconst img = image as Record<string, unknown>;\n\t\t\t\tif (!(img.data instanceof Uint8Array)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: data must be Uint8Array\");\n\t\t\t\t}\n\t\t\t\tif (typeof img.format !== \"string\") {\n\t\t\t\t\tthrow new Error(\"Invalid image: missing format\");\n\t\t\t\t}\n\n\t\t\t\t// Support both camelCase and snake_case field names (Rust serde uses snake_case)\n\t\t\t\tconst imageIndex = img.imageIndex ?? img.image_index;\n\t\t\t\tconst pageNumber = img.pageNumber ?? img.page_number;\n\t\t\t\tconst bitsPerComponent = img.bitsPerComponent ?? img.bits_per_component;\n\t\t\t\tconst isMask = img.isMask ?? img.is_mask;\n\t\t\t\tconst ocrResult = img.ocrResult ?? 
img.ocr_result;\n\n\t\t\t\tif (typeof imageIndex !== \"number\") {\n\t\t\t\t\tthrow new Error(\"Invalid image: imageIndex must be a number\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(pageNumber)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: pageNumber must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(img.width)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: width must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(img.height)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: height must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(bitsPerComponent)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: bitsPerComponent must be a number or null\");\n\t\t\t\t}\n\n\t\t\t\tif (!isBoolean(isMask)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: isMask must be a boolean\");\n\t\t\t\t}\n\n\t\t\t\tif (!isStringOrNull(img.colorspace)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: colorspace must be a string or null\");\n\t\t\t\t}\n\t\t\t\tif (!isStringOrNull(img.description)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: description must be a string or null\");\n\t\t\t\t}\n\n\t\t\t\treturn {\n\t\t\t\t\tdata: img.data,\n\t\t\t\t\tformat: img.format,\n\t\t\t\t\timageIndex: imageIndex,\n\t\t\t\t\tpageNumber: pageNumber ?? null,\n\t\t\t\t\twidth: (img.width as number) ?? null,\n\t\t\t\t\theight: (img.height as number) ?? null,\n\t\t\t\t\tcolorspace: (img.colorspace as string) ?? null,\n\t\t\t\t\tbitsPerComponent: bitsPerComponent ?? null,\n\t\t\t\t\tisMask: isMask ?? false,\n\t\t\t\t\tdescription: (img.description as string) ?? null,\n\t\t\t\t\tocrResult: ocrResult ? jsToExtractionResult(ocrResult) : null,\n\t\t\t\t};\n\t\t\t})\n\t\t: null;\n\n\tlet detectedLanguages: string[] | null = null;\n\tconst detectedLanguagesRaw = Array.isArray(result.detectedLanguages)\n\t\t? 
result.detectedLanguages\n\t\t: result.detected_languages;\n\tif (Array.isArray(detectedLanguagesRaw)) {\n\t\tif (!detectedLanguagesRaw.every((lang) => typeof lang === \"string\")) {\n\t\t\tthrow new Error(\"Invalid result: detectedLanguages must contain only strings\");\n\t\t}\n\t\tdetectedLanguages = detectedLanguagesRaw;\n\t}\n\n\tconst extractedKeywords = (result.extractedKeywords ?? result.extracted_keywords ?? null) as\n\t\t| ExtractedKeyword[]\n\t\t| null;\n\tconst qualityScore =\n\t\ttypeof (result.qualityScore ?? result.quality_score) === \"number\"\n\t\t\t? ((result.qualityScore ?? result.quality_score) as number)\n\t\t\t: null;\n\tconst processingWarnings = (result.processingWarnings ?? result.processing_warnings ?? null) as\n\t\t| ProcessingWarning[]\n\t\t| null;\n\tconst elements = (result.elements ?? null) as Element[] | null;\n\tconst ocrElements = (result.ocrElements ?? result.ocr_elements ?? null) as OcrElement[] | null;\n\tconst document = (result.document ?? null) as DocumentStructure | null;\n\tconst pages = (result.pages ?? null) as PageContent[] | null;\n\n\treturn {\n\t\tcontent: result.content,\n\t\tmimeType,\n\t\tmetadata: (result.metadata ?? 
{}) as Metadata,\n\t\ttables,\n\t\tdetectedLanguages,\n\t\tchunks,\n\t\timages,\n\t\tpages,\n\t\textractedKeywords,\n\t\tqualityScore,\n\t\tprocessingWarnings,\n\t\telements,\n\t\tocrElements,\n\t\tdocument,\n\t};\n}\n\n/**\n * Wrap and format WASM errors with context\n *\n * Converts WASM error messages to JavaScript Error objects with proper context\n * and stack trace information when available.\n *\n * @param error - The error from WASM\n * @param context - Additional context about what operation failed\n * @returns A formatted Error object\n *\n * @internal\n *\n * @example\n * ```typescript\n * try {\n * await wasmExtract(bytes, mimeType);\n * } catch (error) {\n * throw wrapWasmError(error, 'extracting document');\n * }\n * ```\n */\nexport function wrapWasmError(error: unknown, context: string): Error {\n\tif (error instanceof Error) {\n\t\treturn new Error(`Error ${context}: ${error.message}`, {\n\t\t\tcause: error,\n\t\t});\n\t}\n\n\tconst message = String(error);\n\treturn new Error(`Error ${context}: ${message}`);\n}\n\n/**\n * Validate that a WASM-returned value conforms to ExtractionResult structure\n *\n * Performs structural validation without full type checking,\n * useful for runtime validation of WASM output.\n *\n * @param value - The value to validate\n * @returns True if value appears to be a valid ExtractionResult\n *\n * @internal\n */\nexport function isValidExtractionResult(value: unknown): value is ExtractionResult {\n\tif (!value || typeof value !== \"object\") {\n\t\treturn false;\n\t}\n\n\tconst obj = value as Record<string, unknown>;\n\treturn (\n\t\ttypeof obj.content === \"string\" &&\n\t\t(typeof obj.mimeType === \"string\" || typeof obj.mime_type === \"string\") &&\n\t\tobj.metadata !== null &&\n\t\ttypeof obj.metadata === \"object\" 
&&\n\t\tArray.isArray(obj.tables)\n\t);\n}\n"],"mappings":";AAgDA,IAAM,gBAAgB,MAAM,OAAO;AAOnC,SAAS,eAAe,OAAwC;AAC/D,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,UAAU;AACjE;AAOA,SAAS,eAAe,OAAwC;AAC/D,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,UAAU;AACjE;AAOA,SAAS,UAAU,OAAkC;AACpD,SAAO,OAAO,UAAU,aAAa,UAAU;AAChD;AAmBA,eAAsB,iBAAiB,MAAwC;AAC9E,MAAI;AACH,QAAI,KAAK,OAAO,eAAe;AAC9B,YAAM,IAAI;AAAA,QACT,cAAc,KAAK,IAAI,4BAA4B,aAAa;AAAA,MACjE;AAAA,IACD;AAEA,UAAM,cAAc,MAAM,KAAK,YAAY;AAC3C,WAAO,IAAI,WAAW,WAAW;AAAA,EAClC,SAAS,OAAO;AACf,UAAM,IAAI,MAAM,wBAAwB,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC,EAAE;AAAA,EACjG;AACD;AAoBO,SAAS,WAAW,QAA0D;AACpF,MAAI,CAAC,QAAQ;AACZ,WAAO,CAAC;AAAA,EACT;AAEA,QAAM,aAAsC,CAAC;AAE7C,QAAM,iBAAiB,CAAC,UAA4B;AACnD,QAAI,UAAU,QAAQ,UAAU,QAAW;AAC1C,aAAO;AAAA,IACR;AACA,QAAI,OAAO,UAAU,UAAU;AAC9B,UAAI,MAAM,QAAQ,KAAK,GAAG;AACzB,eAAO,MAAM,IAAI,cAAc;AAAA,MAChC;AACA,YAAM,MAAM;AACZ,YAAMA,cAAsC,CAAC;AAC7C,iBAAW,CAAC,KAAK,GAAG,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC7C,cAAM,gBAAgB,eAAe,GAAG;AACxC,YAAI,kBAAkB,QAAQ,kBAAkB,QAAW;AAC1D,UAAAA,YAAW,GAAG,IAAI;AAAA,QACnB;AAAA,MACD;AACA,aAAO,OAAO,KAAKA,WAAU,EAAE,SAAS,IAAIA,cAAa;AAAA,IAC1D;AACA,WAAO;AAAA,EACR;AAEA,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,MAAM,GAAG;AAClD,UAAM,kBAAkB,eAAe,KAAK;AAC5C,QAAI,oBAAoB,QAAQ,oBAAoB,QAAW;AAC9D,iBAAW,GAAG,IAAI;AAAA,IACnB;AAAA,EACD;AAEA,SAAO;AACR;AAmBO,SAAS,qBAAqB,SAAoC;AACxE,MAAI,CAAC,WAAW,OAAO,YAAY,UAAU;AAC5C,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACpE;AAEA,QAAM,SAAS;AACf,QAAM,WACL,OAAO,OAAO,aAAa,WACxB,OAAO,WACP,OAAO,OAAO,cAAc,WAC3B,OAAO,YACP;AAEL,MAAI,OAAO,OAAO,YAAY,UAAU;AACvC,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACxE;AACA,MAAI,OAAO,aAAa,UAAU;AACjC,UAAM,IAAI,MAAM,wDAAwD;AAAA,EACzE;AACA,MAAI,CAAC,OAAO,YAAY,OAAO,OAAO,aAAa,UAAU;AAC5D,UAAM,IAAI,MAAM,wDAAwD;AAAA,EACzE;AAEA,QAAM,SAAkB,CAAC;AACzB,MAAI,MAAM,QAAQ,OAAO,MAAM,GAAG;AACjC,eAAW,SAAS,OAAO,QAAQ;AAClC,UAAI,SAAS,OAAO,UAAU,UAAU;AACvC,cAAM,IAAI;AACV,cAAM,aACL,OAAO,EAAE,eAAe,WAAW,EAAE,aAAa,OAAO,EAAE,gBAAgB,WAAW,EAAE,cAAc;AACvG,YACC,MAAM,QAAQ,EAAE,KAAK,KACrB,EAAE,MAAM,MAAM,CAAC,QAAQ,MAAM,QAAQ,GAAG,KAAK,IAAI,MAAM,C
AAC,SAAS,OAAO,SAAS,QAAQ,CAAC,KAC1F,OAAO,EAAE,aAAa,YACtB,eAAe,MACd;AACD,iBAAO,KAAK;AAAA,YACX,OAAO,EAAE;AAAA,YACT,UAAU,EAAE;AAAA,YACZ;AAAA,UACD,CAAC;AAAA,QACF;AAAA,MACD;AAAA,IACD;AAAA,EACD;AAEA,QAAM,SAAyB,MAAM,QAAQ,OAAO,MAAM,IACvD,OAAO,OAAO,IAAI,CAAC,UAAU;AAC7B,QAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,YAAM,IAAI,MAAM,yBAAyB;AAAA,IAC1C;AACA,UAAM,IAAI;AACV,QAAI,OAAO,EAAE,YAAY,UAAU;AAClC,YAAM,IAAI,MAAM,gCAAgC;AAAA,IACjD;AACA,QAAI,CAAC,EAAE,YAAY,OAAO,EAAE,aAAa,UAAU;AAClD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IAClD;AACA,UAAM,WAAW,EAAE;AAEnB,QAAI,YAA6B;AACjC,QAAI,MAAM,QAAQ,EAAE,SAAS,GAAG;AAC/B,UAAI,CAAC,EAAE,UAAU,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,GAAG;AAC3D,cAAM,IAAI,MAAM,oDAAoD;AAAA,MACrE;AACA,kBAAY,EAAE;AAAA,IACf;AAGA,UAAM,iBAAiB,CAAC,OAAgB,cAA8B;AACrE,UAAI,OAAO,UAAU,UAAU;AAC9B,eAAO;AAAA,MACR;AACA,UAAI,OAAO,UAAU,UAAU;AAC9B,eAAO,OAAO,KAAK;AAAA,MACpB;AACA,UAAI,OAAO,UAAU,UAAU;AAC9B,cAAM,SAAS,SAAS,OAAO,EAAE;AACjC,YAAI,OAAO,MAAM,MAAM,GAAG;AACzB,gBAAM,IAAI,MAAM,2BAA2B,SAAS,iCAAiC,KAAK,GAAG;AAAA,QAC9F;AACA,eAAO;AAAA,MACR;AACA,YAAM,IAAI,MAAM,2BAA2B,SAAS,0BAA0B,OAAO,KAAK,EAAE;AAAA,IAC7F;AAKA,UAAM,YAAY;AAAA,MACjB,SAAS,aAAa,SAAS,cAAc,SAAS,aAAa,SAAS;AAAA,MAC5E;AAAA,IACD;AACA,UAAM,UAAU;AAAA,MACf,SAAS,WAAW,SAAS,YAAY,SAAS,WAAW,SAAS;AAAA,MACtE;AAAA,IACD;AACA,UAAM,aAAa,eAAe,SAAS,cAAc,SAAS,aAAa,YAAY;AAC3F,UAAM,cAAc,eAAe,SAAS,eAAe,SAAS,cAAc,aAAa;AAE/F,QAAI,aAA4B;AAChC,UAAM,kBAAkB,SAAS,cAAc,SAAS;AACxD,QAAI,oBAAoB,QAAQ,oBAAoB,QAAW;AAC9D,mBAAa,eAAe,iBAAiB,YAAY;AAAA,IAC1D;AAEA,WAAO;AAAA,MACN,SAAS,EAAE;AAAA,MACX;AAAA,MACA,UAAU;AAAA,QACT;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,MACD;AAAA,IACD;AAAA,EACD,CAAC,IACA;AAEH,QAAM,SAAkC,MAAM,QAAQ,OAAO,MAAM,IAChE,OAAO,OAAO,IAAI,CAAC,UAAU;AAC7B,QAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,YAAM,IAAI,MAAM,yBAAyB;AAAA,IAC1C;AACA,UAAM,MAAM;AACZ,QAAI,EAAE,IAAI,gBAAgB,aAAa;AACtC,YAAM,IAAI,MAAM,wCAAwC;AAAA,IACzD;AACA,QAAI,OAAO,IAAI,WAAW,UAAU;AACnC,YAAM,IAAI,MAAM,+BAA+B;AAAA,IAChD;AAGA,UAAM,aAAa,IAAI,cAAc,IAAI;AACzC,UAAM,aAAa,IAAI,cAAc,IAAI;AACzC,UAAM,mBAAmB,IAAI,oBAAoB,IAAI;AACrD,UAAM,SAAS
,IAAI,UAAU,IAAI;AACjC,UAAM,YAAY,IAAI,aAAa,IAAI;AAEvC,QAAI,OAAO,eAAe,UAAU;AACnC,YAAM,IAAI,MAAM,4CAA4C;AAAA,IAC7D;AACA,QAAI,CAAC,eAAe,UAAU,GAAG;AAChC,YAAM,IAAI,MAAM,oDAAoD;AAAA,IACrE;AACA,QAAI,CAAC,eAAe,IAAI,KAAK,GAAG;AAC/B,YAAM,IAAI,MAAM,+CAA+C;AAAA,IAChE;AACA,QAAI,CAAC,eAAe,IAAI,MAAM,GAAG;AAChC,YAAM,IAAI,MAAM,gDAAgD;AAAA,IACjE;AACA,QAAI,CAAC,eAAe,gBAAgB,GAAG;AACtC,YAAM,IAAI,MAAM,0DAA0D;AAAA,IAC3E;AAEA,QAAI,CAAC,UAAU,MAAM,GAAG;AACvB,YAAM,IAAI,MAAM,yCAAyC;AAAA,IAC1D;AAEA,QAAI,CAAC,eAAe,IAAI,UAAU,GAAG;AACpC,YAAM,IAAI,MAAM,oDAAoD;AAAA,IACrE;AACA,QAAI,CAAC,eAAe,IAAI,WAAW,GAAG;AACrC,YAAM,IAAI,MAAM,qDAAqD;AAAA,IACtE;AAEA,WAAO;AAAA,MACN,MAAM,IAAI;AAAA,MACV,QAAQ,IAAI;AAAA,MACZ;AAAA,MACA,YAAY,cAAc;AAAA,MAC1B,OAAQ,IAAI,SAAoB;AAAA,MAChC,QAAS,IAAI,UAAqB;AAAA,MAClC,YAAa,IAAI,cAAyB;AAAA,MAC1C,kBAAkB,oBAAoB;AAAA,MACtC,QAAQ,UAAU;AAAA,MAClB,aAAc,IAAI,eAA0B;AAAA,MAC5C,WAAW,YAAY,qBAAqB,SAAS,IAAI;AAAA,IAC1D;AAAA,EACD,CAAC,IACA;AAEH,MAAI,oBAAqC;AACzC,QAAM,uBAAuB,MAAM,QAAQ,OAAO,iBAAiB,IAChE,OAAO,oBACP,OAAO;AACV,MAAI,MAAM,QAAQ,oBAAoB,GAAG;AACxC,QAAI,CAAC,qBAAqB,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,GAAG;AACpE,YAAM,IAAI,MAAM,6DAA6D;AAAA,IAC9E;AACA,wBAAoB;AAAA,EACrB;AAEA,QAAM,oBAAqB,OAAO,qBAAqB,OAAO,sBAAsB;AAGpF,QAAM,eACL,QAAQ,OAAO,gBAAgB,OAAO,mBAAmB,WACpD,OAAO,gBAAgB,OAAO,gBAChC;AACJ,QAAM,qBAAsB,OAAO,sBAAsB,OAAO,uBAAuB;AAGvF,QAAM,WAAY,OAAO,YAAY;AACrC,QAAM,cAAe,OAAO,eAAe,OAAO,gBAAgB;AAClE,QAAM,WAAY,OAAO,YAAY;AACrC,QAAM,QAAS,OAAO,SAAS;AAE/B,SAAO;AAAA,IACN,SAAS,OAAO;AAAA,IAChB;AAAA,IACA,UAAW,OAAO,YAAY,CAAC;AAAA,IAC/B;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACD;AACD;AAuBO,SAAS,cAAc,OAAgB,SAAwB;AACrE,MAAI,iBAAiB,OAAO;AAC3B,WAAO,IAAI,MAAM,SAAS,OAAO,KAAK,MAAM,OAAO,IAAI;AAAA,MACtD,OAAO;AAAA,IACR,CAAC;AAAA,EACF;AAEA,QAAM,UAAU,OAAO,KAAK;AAC5B,SAAO,IAAI,MAAM,SAAS,OAAO,KAAK,OAAO,EAAE;AAChD;AAaO,SAAS,wBAAwB,OAA2C;AAClF,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,WAAO;AAAA,EACR;AAEA,QAAM,MAAM;AACZ,SACC,OAAO,IAAI,YAAY,aACtB,OAAO,IAAI,aAAa,YAAY,OAAO,IAAI,cA
Ac,aAC9D,IAAI,aAAa,QACjB,OAAO,IAAI,aAAa,YACxB,MAAM,QAAQ,IAAI,MAAM;AAE1B;","names":["normalized"]}
|
|
1
|
+
{"version":3,"sources":["../../typescript/adapters/wasm-adapter.ts"],"sourcesContent":["/**\n * WASM Type Adapter\n *\n * This module provides type adapters for converting between JavaScript/TypeScript\n * types and WASM-compatible types, handling File/Blob conversions, config normalization,\n * and result parsing.\n *\n * @example File Conversion\n * ```typescript\n * import { fileToUint8Array } from '@kreuzberg/wasm/adapters/wasm-adapter';\n *\n * const file = event.target.files[0];\n * const bytes = await fileToUint8Array(file);\n * const result = await extractBytes(bytes, file.type);\n * ```\n *\n * @example Config Normalization\n * ```typescript\n * import { configToJS } from '@kreuzberg/wasm/adapters/wasm-adapter';\n *\n * const config = {\n * ocr: { backend: 'tesseract', language: 'eng' },\n * chunking: { maxChars: 1000 }\n * };\n * const normalized = configToJS(config);\n * ```\n */\n\nimport type {\n\tChunk,\n\tDocumentStructure,\n\tElement,\n\tExtractedImage,\n\tExtractedKeyword,\n\tExtractionConfig,\n\tExtractionResult,\n\tMetadata,\n\tOcrElement,\n\tPageContent,\n\tPdfAnnotation,\n\tProcessingWarning,\n\tTable,\n} from \"../types.js\";\n\n/**\n * Maximum file size for processing (512 MB)\n *\n * @internal\n */\nconst MAX_FILE_SIZE = 512 * 1024 * 1024;\n\n/**\n * Type predicate to validate numeric value or null\n *\n * @internal\n */\nfunction isNumberOrNull(value: unknown): value is number | null {\n\treturn typeof value === \"number\" || value === null || value === undefined;\n}\n\n/**\n * Type predicate to validate string value or null\n *\n * @internal\n */\nfunction isStringOrNull(value: unknown): value is string | null {\n\treturn typeof value === \"string\" || value === null || value === undefined;\n}\n\n/**\n * Type predicate to validate boolean value\n *\n * @internal\n */\nfunction isBoolean(value: unknown): value is boolean {\n\treturn typeof value === \"boolean\" || value === undefined;\n}\n\n/**\n * Convert a File or Blob to Uint8Array\n *\n 
* Handles both browser File API and server-side Blob-like objects,\n * providing a unified interface for reading binary data.\n *\n * @param file - The File or Blob to convert\n * @returns Promise resolving to the byte array\n * @throws {Error} If the file cannot be read or exceeds size limit\n *\n * @example\n * ```typescript\n * const file = document.getElementById('input').files[0];\n * const bytes = await fileToUint8Array(file);\n * const result = await extractBytes(bytes, 'application/pdf');\n * ```\n */\nexport async function fileToUint8Array(file: File | Blob): Promise<Uint8Array> {\n\ttry {\n\t\tif (file.size > MAX_FILE_SIZE) {\n\t\t\tthrow new Error(\n\t\t\t\t`File size (${file.size} bytes) exceeds maximum (${MAX_FILE_SIZE} bytes). Maximum file size is 512 MB.`,\n\t\t\t);\n\t\t}\n\n\t\tconst arrayBuffer = await file.arrayBuffer();\n\t\treturn new Uint8Array(arrayBuffer);\n\t} catch (error) {\n\t\tthrow new Error(`Failed to read file: ${error instanceof Error ? error.message : String(error)}`);\n\t}\n}\n\n/**\n * Normalize ExtractionConfig for WASM processing\n *\n * Converts TypeScript configuration objects to a WASM-compatible format,\n * handling null values, undefined properties, and nested structures.\n *\n * @param config - The extraction configuration or null\n * @returns Normalized configuration object suitable for WASM\n *\n * @example\n * ```typescript\n * const config: ExtractionConfig = {\n * ocr: { backend: 'tesseract' },\n * chunking: { maxChars: 1000 }\n * };\n * const wasmConfig = configToJS(config);\n * ```\n */\nexport function configToJS(config: ExtractionConfig | null): Record<string, unknown> {\n\tif (!config) {\n\t\treturn {};\n\t}\n\n\t// Convert camelCase key to snake_case to match Rust serde field names.\n\tconst toSnakeCase = (str: string): string => str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);\n\n\tconst normalizeValue = (value: unknown): unknown => {\n\t\tif (value === null || value === undefined) 
{\n\t\t\treturn null;\n\t\t}\n\t\tif (typeof value === \"object\") {\n\t\t\tif (Array.isArray(value)) {\n\t\t\t\treturn value.map(normalizeValue);\n\t\t\t}\n\t\t\tconst obj = value as Record<string, unknown>;\n\t\t\tconst normalized: Record<string, unknown> = {};\n\t\t\tfor (const [key, val] of Object.entries(obj)) {\n\t\t\t\tconst normalizedVal = normalizeValue(val);\n\t\t\t\tif (normalizedVal !== null && normalizedVal !== undefined) {\n\t\t\t\t\tnormalized[toSnakeCase(key)] = normalizedVal;\n\t\t\t\t}\n\t\t\t}\n\t\t\treturn Object.keys(normalized).length > 0 ? normalized : null;\n\t\t}\n\t\treturn value;\n\t};\n\n\tconst normalized: Record<string, unknown> = {};\n\tfor (const [key, value] of Object.entries(config)) {\n\t\tconst normalizedValue = normalizeValue(value);\n\t\tif (normalizedValue !== null && normalizedValue !== undefined) {\n\t\t\tnormalized[toSnakeCase(key)] = normalizedValue;\n\t\t}\n\t}\n\n\treturn normalized;\n}\n\n/**\n * Parse WASM extraction result and convert to TypeScript type\n *\n * Handles conversion of WASM-returned objects to proper ExtractionResult types,\n * including proper array conversions and type assertions for tables, chunks, and images.\n *\n * @param jsValue - The raw WASM result value\n * @returns Properly typed ExtractionResult\n * @throws {Error} If the result structure is invalid\n *\n * @example\n * ```typescript\n * const wasmResult = await wasmExtract(bytes, mimeType, config);\n * const result = jsToExtractionResult(wasmResult);\n * console.log(result.content);\n * ```\n */\nexport function jsToExtractionResult(jsValue: unknown): ExtractionResult {\n\tif (!jsValue || typeof jsValue !== \"object\") {\n\t\tthrow new Error(\"Invalid extraction result: value is not an object\");\n\t}\n\n\tconst result = jsValue as Record<string, unknown>;\n\tconst mimeType =\n\t\ttypeof result.mimeType === \"string\"\n\t\t\t? result.mimeType\n\t\t\t: typeof result.mime_type === \"string\"\n\t\t\t\t? 
result.mime_type\n\t\t\t\t: null;\n\n\tif (typeof result.content !== \"string\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid content\");\n\t}\n\tif (typeof mimeType !== \"string\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid mimeType\");\n\t}\n\tif (!result.metadata || typeof result.metadata !== \"object\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid metadata\");\n\t}\n\n\tconst tables: Table[] = [];\n\tif (Array.isArray(result.tables)) {\n\t\tfor (const table of result.tables) {\n\t\t\tif (table && typeof table === \"object\") {\n\t\t\t\tconst t = table as Record<string, unknown>;\n\t\t\t\tconst pageNumber =\n\t\t\t\t\ttypeof t.pageNumber === \"number\" ? t.pageNumber : typeof t.page_number === \"number\" ? t.page_number : null;\n\t\t\t\tif (\n\t\t\t\t\tArray.isArray(t.cells) &&\n\t\t\t\t\tt.cells.every((row) => Array.isArray(row) && row.every((cell) => typeof cell === \"string\")) &&\n\t\t\t\t\ttypeof t.markdown === \"string\" &&\n\t\t\t\t\tpageNumber !== null\n\t\t\t\t) {\n\t\t\t\t\ttables.push({\n\t\t\t\t\t\tcells: t.cells as string[][],\n\t\t\t\t\t\tmarkdown: t.markdown,\n\t\t\t\t\t\tpageNumber,\n\t\t\t\t\t});\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\tconst chunks: Chunk[] | null = Array.isArray(result.chunks)\n\t\t? 
result.chunks.map((chunk) => {\n\t\t\t\tif (!chunk || typeof chunk !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk structure\");\n\t\t\t\t}\n\t\t\t\tconst c = chunk as Record<string, unknown>;\n\t\t\t\tif (typeof c.content !== \"string\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk: missing content\");\n\t\t\t\t}\n\t\t\t\tif (!c.metadata || typeof c.metadata !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk: missing metadata\");\n\t\t\t\t}\n\t\t\t\tconst metadata = c.metadata as Record<string, unknown>;\n\n\t\t\t\tlet embedding: number[] | null = null;\n\t\t\t\tif (Array.isArray(c.embedding)) {\n\t\t\t\t\tif (!c.embedding.every((item) => typeof item === \"number\")) {\n\t\t\t\t\t\tthrow new Error(\"Invalid chunk: embedding must contain only numbers\");\n\t\t\t\t\t}\n\t\t\t\t\tembedding = c.embedding;\n\t\t\t\t}\n\n\t\t\t\t// Coerce numeric values - handle BigInt, strings, and numbers\n\t\t\t\tconst coerceToNumber = (value: unknown, fieldName: string): number => {\n\t\t\t\t\tif (typeof value === \"number\") {\n\t\t\t\t\t\treturn value;\n\t\t\t\t\t}\n\t\t\t\t\tif (typeof value === \"bigint\") {\n\t\t\t\t\t\treturn Number(value);\n\t\t\t\t\t}\n\t\t\t\t\tif (typeof value === \"string\") {\n\t\t\t\t\t\tconst parsed = parseInt(value, 10);\n\t\t\t\t\t\tif (Number.isNaN(parsed)) {\n\t\t\t\t\t\t\tthrow new Error(`Invalid chunk metadata: ${fieldName} must be a valid number, got \"${value}\"`);\n\t\t\t\t\t\t}\n\t\t\t\t\t\treturn parsed;\n\t\t\t\t\t}\n\t\t\t\t\tthrow new Error(`Invalid chunk metadata: ${fieldName} must be a number, got ${typeof value}`);\n\t\t\t\t};\n\n\t\t\t\t// The Rust code uses snake_case field names (byte_start, byte_end, etc)\n\t\t\t\t// but TypeScript expects camelCase (charStart, charEnd, etc)\n\t\t\t\t// For now, treat byte offsets as character offsets since the content is UTF-8\n\t\t\t\tconst charStart = coerceToNumber(\n\t\t\t\t\tmetadata.charStart ?? metadata.char_start ?? metadata.byteStart ?? 
metadata.byte_start,\n\t\t\t\t\t\"charStart\",\n\t\t\t\t);\n\t\t\t\tconst charEnd = coerceToNumber(\n\t\t\t\t\tmetadata.charEnd ?? metadata.char_end ?? metadata.byteEnd ?? metadata.byte_end,\n\t\t\t\t\t\"charEnd\",\n\t\t\t\t);\n\t\t\t\tconst chunkIndex = coerceToNumber(metadata.chunkIndex ?? metadata.chunk_index, \"chunkIndex\");\n\t\t\t\tconst totalChunks = coerceToNumber(metadata.totalChunks ?? metadata.total_chunks, \"totalChunks\");\n\n\t\t\t\tlet tokenCount: number | null = null;\n\t\t\t\tconst tokenCountValue = metadata.tokenCount ?? metadata.token_count;\n\t\t\t\tif (tokenCountValue !== null && tokenCountValue !== undefined) {\n\t\t\t\t\ttokenCount = coerceToNumber(tokenCountValue, \"tokenCount\");\n\t\t\t\t}\n\n\t\t\t\treturn {\n\t\t\t\t\tcontent: c.content,\n\t\t\t\t\tembedding,\n\t\t\t\t\tmetadata: {\n\t\t\t\t\t\tcharStart,\n\t\t\t\t\t\tcharEnd,\n\t\t\t\t\t\ttokenCount,\n\t\t\t\t\t\tchunkIndex,\n\t\t\t\t\t\ttotalChunks,\n\t\t\t\t\t},\n\t\t\t\t};\n\t\t\t})\n\t\t: null;\n\n\tconst images: ExtractedImage[] | null = Array.isArray(result.images)\n\t\t? result.images.map((image) => {\n\t\t\t\tif (!image || typeof image !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid image structure\");\n\t\t\t\t}\n\t\t\t\tconst img = image as Record<string, unknown>;\n\t\t\t\tlet imageData: Uint8Array;\n\t\t\t\tif (img.data instanceof Uint8Array) {\n\t\t\t\t\timageData = img.data;\n\t\t\t\t} else if (Array.isArray(img.data)) {\n\t\t\t\t\timageData = new Uint8Array(img.data as number[]);\n\t\t\t\t} else {\n\t\t\t\t\tthrow new Error(\"Invalid image: data must be Uint8Array or array\");\n\t\t\t\t}\n\t\t\t\tif (typeof img.format !== \"string\") {\n\t\t\t\t\tthrow new Error(\"Invalid image: missing format\");\n\t\t\t\t}\n\n\t\t\t\t// Support both camelCase and snake_case field names (Rust serde uses snake_case)\n\t\t\t\tconst imageIndex = img.imageIndex ?? img.image_index;\n\t\t\t\tconst pageNumber = img.pageNumber ?? 
img.page_number;\n\t\t\t\tconst bitsPerComponent = img.bitsPerComponent ?? img.bits_per_component;\n\t\t\t\tconst isMask = img.isMask ?? img.is_mask;\n\t\t\t\tconst ocrResult = img.ocrResult ?? img.ocr_result;\n\n\t\t\t\tif (typeof imageIndex !== \"number\") {\n\t\t\t\t\tthrow new Error(\"Invalid image: imageIndex must be a number\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(pageNumber)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: pageNumber must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(img.width)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: width must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(img.height)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: height must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(bitsPerComponent)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: bitsPerComponent must be a number or null\");\n\t\t\t\t}\n\n\t\t\t\tif (!isBoolean(isMask)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: isMask must be a boolean\");\n\t\t\t\t}\n\n\t\t\t\tif (!isStringOrNull(img.colorspace)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: colorspace must be a string or null\");\n\t\t\t\t}\n\t\t\t\tif (!isStringOrNull(img.description)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: description must be a string or null\");\n\t\t\t\t}\n\n\t\t\t\treturn {\n\t\t\t\t\tdata: imageData,\n\t\t\t\t\tformat: img.format,\n\t\t\t\t\timageIndex: imageIndex,\n\t\t\t\t\tpageNumber: pageNumber ?? null,\n\t\t\t\t\twidth: (img.width as number) ?? null,\n\t\t\t\t\theight: (img.height as number) ?? null,\n\t\t\t\t\tcolorspace: (img.colorspace as string) ?? null,\n\t\t\t\t\tbitsPerComponent: bitsPerComponent ?? null,\n\t\t\t\t\tisMask: isMask ?? false,\n\t\t\t\t\tdescription: (img.description as string) ?? null,\n\t\t\t\t\tocrResult: ocrResult ? 
jsToExtractionResult(ocrResult) : null,\n\t\t\t\t};\n\t\t\t})\n\t\t: null;\n\n\tlet detectedLanguages: string[] | null = null;\n\tconst detectedLanguagesRaw = Array.isArray(result.detectedLanguages)\n\t\t? result.detectedLanguages\n\t\t: result.detected_languages;\n\tif (Array.isArray(detectedLanguagesRaw)) {\n\t\tif (!detectedLanguagesRaw.every((lang) => typeof lang === \"string\")) {\n\t\t\tthrow new Error(\"Invalid result: detectedLanguages must contain only strings\");\n\t\t}\n\t\tdetectedLanguages = detectedLanguagesRaw;\n\t}\n\n\tconst extractedKeywords = (result.extractedKeywords ?? result.extracted_keywords ?? null) as\n\t\t| ExtractedKeyword[]\n\t\t| null;\n\tconst qualityScore =\n\t\ttypeof (result.qualityScore ?? result.quality_score) === \"number\"\n\t\t\t? ((result.qualityScore ?? result.quality_score) as number)\n\t\t\t: null;\n\tconst processingWarnings = (result.processingWarnings ?? result.processing_warnings ?? null) as\n\t\t| ProcessingWarning[]\n\t\t| null;\n\tconst elements = (result.elements ?? null) as Element[] | null;\n\tconst ocrElements = (result.ocrElements ?? result.ocr_elements ?? null) as OcrElement[] | null;\n\tconst document = (result.document ?? null) as DocumentStructure | null;\n\tconst pages = (result.pages ?? null) as PageContent[] | null;\n\tconst annotations = (result.annotations ?? null) as PdfAnnotation[] | null;\n\n\treturn {\n\t\tcontent: result.content,\n\t\tmimeType,\n\t\tmetadata: (result.metadata ?? 
{}) as Metadata,\n\t\ttables,\n\t\tdetectedLanguages,\n\t\tchunks,\n\t\timages,\n\t\tpages,\n\t\textractedKeywords,\n\t\tqualityScore,\n\t\tprocessingWarnings,\n\t\telements,\n\t\tocrElements,\n\t\tdocument,\n\t\tannotations,\n\t};\n}\n\n/**\n * Wrap and format WASM errors with context\n *\n * Converts WASM error messages to JavaScript Error objects with proper context\n * and stack trace information when available.\n *\n * @param error - The error from WASM\n * @param context - Additional context about what operation failed\n * @returns A formatted Error object\n *\n * @internal\n *\n * @example\n * ```typescript\n * try {\n * await wasmExtract(bytes, mimeType);\n * } catch (error) {\n * throw wrapWasmError(error, 'extracting document');\n * }\n * ```\n */\nexport function wrapWasmError(error: unknown, context: string): Error {\n\tif (error instanceof Error) {\n\t\treturn new Error(`Error ${context}: ${error.message}`, {\n\t\t\tcause: error,\n\t\t});\n\t}\n\n\tconst message = String(error);\n\treturn new Error(`Error ${context}: ${message}`);\n}\n\n/**\n * Validate that a WASM-returned value conforms to ExtractionResult structure\n *\n * Performs structural validation without full type checking,\n * useful for runtime validation of WASM output.\n *\n * @param value - The value to validate\n * @returns True if value appears to be a valid ExtractionResult\n *\n * @internal\n */\nexport function isValidExtractionResult(value: unknown): value is ExtractionResult {\n\tif (!value || typeof value !== \"object\") {\n\t\treturn false;\n\t}\n\n\tconst obj = value as Record<string, unknown>;\n\treturn (\n\t\ttypeof obj.content === \"string\" &&\n\t\t(typeof obj.mimeType === \"string\" || typeof obj.mime_type === \"string\") &&\n\t\tobj.metadata !== null &&\n\t\ttypeof obj.metadata === \"object\" 
&&\n\t\tArray.isArray(obj.tables)\n\t);\n}\n"],"mappings":";AAiDA,IAAM,gBAAgB,MAAM,OAAO;AAOnC,SAAS,eAAe,OAAwC;AAC/D,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,UAAU;AACjE;AAOA,SAAS,eAAe,OAAwC;AAC/D,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,UAAU;AACjE;AAOA,SAAS,UAAU,OAAkC;AACpD,SAAO,OAAO,UAAU,aAAa,UAAU;AAChD;AAmBA,eAAsB,iBAAiB,MAAwC;AAC9E,MAAI;AACH,QAAI,KAAK,OAAO,eAAe;AAC9B,YAAM,IAAI;AAAA,QACT,cAAc,KAAK,IAAI,4BAA4B,aAAa;AAAA,MACjE;AAAA,IACD;AAEA,UAAM,cAAc,MAAM,KAAK,YAAY;AAC3C,WAAO,IAAI,WAAW,WAAW;AAAA,EAClC,SAAS,OAAO;AACf,UAAM,IAAI,MAAM,wBAAwB,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC,EAAE;AAAA,EACjG;AACD;AAoBO,SAAS,WAAW,QAA0D;AACpF,MAAI,CAAC,QAAQ;AACZ,WAAO,CAAC;AAAA,EACT;AAGA,QAAM,cAAc,CAAC,QAAwB,IAAI,QAAQ,UAAU,CAAC,WAAW,IAAI,OAAO,YAAY,CAAC,EAAE;AAEzG,QAAM,iBAAiB,CAAC,UAA4B;AACnD,QAAI,UAAU,QAAQ,UAAU,QAAW;AAC1C,aAAO;AAAA,IACR;AACA,QAAI,OAAO,UAAU,UAAU;AAC9B,UAAI,MAAM,QAAQ,KAAK,GAAG;AACzB,eAAO,MAAM,IAAI,cAAc;AAAA,MAChC;AACA,YAAM,MAAM;AACZ,YAAMA,cAAsC,CAAC;AAC7C,iBAAW,CAAC,KAAK,GAAG,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC7C,cAAM,gBAAgB,eAAe,GAAG;AACxC,YAAI,kBAAkB,QAAQ,kBAAkB,QAAW;AAC1D,UAAAA,YAAW,YAAY,GAAG,CAAC,IAAI;AAAA,QAChC;AAAA,MACD;AACA,aAAO,OAAO,KAAKA,WAAU,EAAE,SAAS,IAAIA,cAAa;AAAA,IAC1D;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAsC,CAAC;AAC7C,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,MAAM,GAAG;AAClD,UAAM,kBAAkB,eAAe,KAAK;AAC5C,QAAI,oBAAoB,QAAQ,oBAAoB,QAAW;AAC9D,iBAAW,YAAY,GAAG,CAAC,IAAI;AAAA,IAChC;AAAA,EACD;AAEA,SAAO;AACR;AAmBO,SAAS,qBAAqB,SAAoC;AACxE,MAAI,CAAC,WAAW,OAAO,YAAY,UAAU;AAC5C,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACpE;AAEA,QAAM,SAAS;AACf,QAAM,WACL,OAAO,OAAO,aAAa,WACxB,OAAO,WACP,OAAO,OAAO,cAAc,WAC3B,OAAO,YACP;AAEL,MAAI,OAAO,OAAO,YAAY,UAAU;AACvC,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACxE;AACA,MAAI,OAAO,aAAa,UAAU;AACjC,UAAM,IAAI,MAAM,wDAAwD;AAAA,EACzE;AACA,MAAI,CAAC,OAAO,YAAY,OAAO,OAAO,aAAa,UAAU;AAC5D,UAAM,IAAI,MAAM,wDAAwD;AAAA,EACzE;AAEA,QAAM,SAAkB,CAAC;AACzB,MAAI,MAAM,QAAQ,OAAO,MAAM,GAAG;AACjC,eAAW,SAAS,OAAO,QAAQ;AAClC,UAAI,SAAS,OAAO,UAAU,UAAU;AACvC,cAAM,IAAI;AACV,cAAM,aACL,OAAO,EAAE,eAAe,WAAW,EAAE,aAAa,OAAO,EAAE,gBAAgB,WAAW,EAAE,c
AAc;AACvG,YACC,MAAM,QAAQ,EAAE,KAAK,KACrB,EAAE,MAAM,MAAM,CAAC,QAAQ,MAAM,QAAQ,GAAG,KAAK,IAAI,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,CAAC,KAC1F,OAAO,EAAE,aAAa,YACtB,eAAe,MACd;AACD,iBAAO,KAAK;AAAA,YACX,OAAO,EAAE;AAAA,YACT,UAAU,EAAE;AAAA,YACZ;AAAA,UACD,CAAC;AAAA,QACF;AAAA,MACD;AAAA,IACD;AAAA,EACD;AAEA,QAAM,SAAyB,MAAM,QAAQ,OAAO,MAAM,IACvD,OAAO,OAAO,IAAI,CAAC,UAAU;AAC7B,QAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,YAAM,IAAI,MAAM,yBAAyB;AAAA,IAC1C;AACA,UAAM,IAAI;AACV,QAAI,OAAO,EAAE,YAAY,UAAU;AAClC,YAAM,IAAI,MAAM,gCAAgC;AAAA,IACjD;AACA,QAAI,CAAC,EAAE,YAAY,OAAO,EAAE,aAAa,UAAU;AAClD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IAClD;AACA,UAAM,WAAW,EAAE;AAEnB,QAAI,YAA6B;AACjC,QAAI,MAAM,QAAQ,EAAE,SAAS,GAAG;AAC/B,UAAI,CAAC,EAAE,UAAU,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,GAAG;AAC3D,cAAM,IAAI,MAAM,oDAAoD;AAAA,MACrE;AACA,kBAAY,EAAE;AAAA,IACf;AAGA,UAAM,iBAAiB,CAAC,OAAgB,cAA8B;AACrE,UAAI,OAAO,UAAU,UAAU;AAC9B,eAAO;AAAA,MACR;AACA,UAAI,OAAO,UAAU,UAAU;AAC9B,eAAO,OAAO,KAAK;AAAA,MACpB;AACA,UAAI,OAAO,UAAU,UAAU;AAC9B,cAAM,SAAS,SAAS,OAAO,EAAE;AACjC,YAAI,OAAO,MAAM,MAAM,GAAG;AACzB,gBAAM,IAAI,MAAM,2BAA2B,SAAS,iCAAiC,KAAK,GAAG;AAAA,QAC9F;AACA,eAAO;AAAA,MACR;AACA,YAAM,IAAI,MAAM,2BAA2B,SAAS,0BAA0B,OAAO,KAAK,EAAE;AAAA,IAC7F;AAKA,UAAM,YAAY;AAAA,MACjB,SAAS,aAAa,SAAS,cAAc,SAAS,aAAa,SAAS;AAAA,MAC5E;AAAA,IACD;AACA,UAAM,UAAU;AAAA,MACf,SAAS,WAAW,SAAS,YAAY,SAAS,WAAW,SAAS;AAAA,MACtE;AAAA,IACD;AACA,UAAM,aAAa,eAAe,SAAS,cAAc,SAAS,aAAa,YAAY;AAC3F,UAAM,cAAc,eAAe,SAAS,eAAe,SAAS,cAAc,aAAa;AAE/F,QAAI,aAA4B;AAChC,UAAM,kBAAkB,SAAS,cAAc,SAAS;AACxD,QAAI,oBAAoB,QAAQ,oBAAoB,QAAW;AAC9D,mBAAa,eAAe,iBAAiB,YAAY;AAAA,IAC1D;AAEA,WAAO;AAAA,MACN,SAAS,EAAE;AAAA,MACX;AAAA,MACA,UAAU;AAAA,QACT;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,MACD;AAAA,IACD;AAAA,EACD,CAAC,IACA;AAEH,QAAM,SAAkC,MAAM,QAAQ,OAAO,MAAM,IAChE,OAAO,OAAO,IAAI,CAAC,UAAU;AAC7B,QAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,YAAM,IAAI,MAAM,yBAAyB;AAAA,IAC1C;AACA,UAAM,MAAM;AACZ,QAAI;AACJ,QAAI,IAAI,gBAAgB,YAAY;AACnC,kBAAY,IAAI;AAAA,IACjB,WAAW,MAAM,QAAQ,IAAI,IAAI,GAAG;AACnC,kBAAY,IAAI,WAAW,IAAI,IAAgB;AAAA,IAChD,OAAO;AACN,YAAM,
IAAI,MAAM,iDAAiD;AAAA,IAClE;AACA,QAAI,OAAO,IAAI,WAAW,UAAU;AACnC,YAAM,IAAI,MAAM,+BAA+B;AAAA,IAChD;AAGA,UAAM,aAAa,IAAI,cAAc,IAAI;AACzC,UAAM,aAAa,IAAI,cAAc,IAAI;AACzC,UAAM,mBAAmB,IAAI,oBAAoB,IAAI;AACrD,UAAM,SAAS,IAAI,UAAU,IAAI;AACjC,UAAM,YAAY,IAAI,aAAa,IAAI;AAEvC,QAAI,OAAO,eAAe,UAAU;AACnC,YAAM,IAAI,MAAM,4CAA4C;AAAA,IAC7D;AACA,QAAI,CAAC,eAAe,UAAU,GAAG;AAChC,YAAM,IAAI,MAAM,oDAAoD;AAAA,IACrE;AACA,QAAI,CAAC,eAAe,IAAI,KAAK,GAAG;AAC/B,YAAM,IAAI,MAAM,+CAA+C;AAAA,IAChE;AACA,QAAI,CAAC,eAAe,IAAI,MAAM,GAAG;AAChC,YAAM,IAAI,MAAM,gDAAgD;AAAA,IACjE;AACA,QAAI,CAAC,eAAe,gBAAgB,GAAG;AACtC,YAAM,IAAI,MAAM,0DAA0D;AAAA,IAC3E;AAEA,QAAI,CAAC,UAAU,MAAM,GAAG;AACvB,YAAM,IAAI,MAAM,yCAAyC;AAAA,IAC1D;AAEA,QAAI,CAAC,eAAe,IAAI,UAAU,GAAG;AACpC,YAAM,IAAI,MAAM,oDAAoD;AAAA,IACrE;AACA,QAAI,CAAC,eAAe,IAAI,WAAW,GAAG;AACrC,YAAM,IAAI,MAAM,qDAAqD;AAAA,IACtE;AAEA,WAAO;AAAA,MACN,MAAM;AAAA,MACN,QAAQ,IAAI;AAAA,MACZ;AAAA,MACA,YAAY,cAAc;AAAA,MAC1B,OAAQ,IAAI,SAAoB;AAAA,MAChC,QAAS,IAAI,UAAqB;AAAA,MAClC,YAAa,IAAI,cAAyB;AAAA,MAC1C,kBAAkB,oBAAoB;AAAA,MACtC,QAAQ,UAAU;AAAA,MAClB,aAAc,IAAI,eAA0B;AAAA,MAC5C,WAAW,YAAY,qBAAqB,SAAS,IAAI;AAAA,IAC1D;AAAA,EACD,CAAC,IACA;AAEH,MAAI,oBAAqC;AACzC,QAAM,uBAAuB,MAAM,QAAQ,OAAO,iBAAiB,IAChE,OAAO,oBACP,OAAO;AACV,MAAI,MAAM,QAAQ,oBAAoB,GAAG;AACxC,QAAI,CAAC,qBAAqB,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,GAAG;AACpE,YAAM,IAAI,MAAM,6DAA6D;AAAA,IAC9E;AACA,wBAAoB;AAAA,EACrB;AAEA,QAAM,oBAAqB,OAAO,qBAAqB,OAAO,sBAAsB;AAGpF,QAAM,eACL,QAAQ,OAAO,gBAAgB,OAAO,mBAAmB,WACpD,OAAO,gBAAgB,OAAO,gBAChC;AACJ,QAAM,qBAAsB,OAAO,sBAAsB,OAAO,uBAAuB;AAGvF,QAAM,WAAY,OAAO,YAAY;AACrC,QAAM,cAAe,OAAO,eAAe,OAAO,gBAAgB;AAClE,QAAM,WAAY,OAAO,YAAY;AACrC,QAAM,QAAS,OAAO,SAAS;AAC/B,QAAM,cAAe,OAAO,eAAe;AAE3C,SAAO;AAAA,IACN,SAAS,OAAO;AAAA,IAChB;AAAA,IACA,UAAW,OAAO,YAAY,CAAC;AAAA,IAC/B;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACD;AACD;AAuBO,SAAS,cAAc,OAAgB,SAAwB;AACrE,MAAI,iBAAiB,OAAO;AAC3B,WAAO,IAAI,MAAM,SAAS,OAAO,KAAK,MAAM,OAAO,IAAI;AAAA,MACtD,OAAO;AAAA,IACR,CAAC;AAAA,E
ACF;AAEA,QAAM,UAAU,OAAO,KAAK;AAC5B,SAAO,IAAI,MAAM,SAAS,OAAO,KAAK,OAAO,EAAE;AAChD;AAaO,SAAS,wBAAwB,OAA2C;AAClF,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,WAAO;AAAA,EACR;AAEA,QAAM,MAAM;AACZ,SACC,OAAO,IAAI,YAAY,aACtB,OAAO,IAAI,aAAa,YAAY,OAAO,IAAI,cAAc,aAC9D,IAAI,aAAa,QACjB,OAAO,IAAI,aAAa,YACxB,MAAM,QAAQ,IAAI,MAAM;AAE1B;","names":["normalized"]}
|
package/dist/index.js
CHANGED
|
@@ -1,29 +1,3 @@
|
|
|
1
|
-
var __defProp = Object.defineProperty;
|
|
2
|
-
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
3
|
-
var __esm = (fn, res) => function __init() {
|
|
4
|
-
return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
|
|
5
|
-
};
|
|
6
|
-
var __export = (target, all) => {
|
|
7
|
-
for (var name in all)
|
|
8
|
-
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
-
};
|
|
10
|
-
|
|
11
|
-
// typescript/pdfium.js
|
|
12
|
-
var pdfium_exports = {};
|
|
13
|
-
__export(pdfium_exports, {
|
|
14
|
-
default: () => initPdfium
|
|
15
|
-
});
|
|
16
|
-
async function initPdfium() {
|
|
17
|
-
return {
|
|
18
|
-
// Dummy implementation for testing
|
|
19
|
-
};
|
|
20
|
-
}
|
|
21
|
-
var init_pdfium = __esm({
|
|
22
|
-
"typescript/pdfium.js"() {
|
|
23
|
-
"use strict";
|
|
24
|
-
}
|
|
25
|
-
});
|
|
26
|
-
|
|
27
1
|
// typescript/runtime.ts
|
|
28
2
|
function detectRuntime() {
|
|
29
3
|
const globalCaches = globalThis.caches;
|
|
@@ -174,16 +148,72 @@ function getRuntimeInfo() {
|
|
|
174
148
|
}
|
|
175
149
|
|
|
176
150
|
// typescript/initialization/pdfium-loader.ts
|
|
151
|
+
async function loadPdfiumForNode() {
|
|
152
|
+
try {
|
|
153
|
+
const fs = await import(
|
|
154
|
+
/* @vite-ignore */
|
|
155
|
+
"fs/promises"
|
|
156
|
+
);
|
|
157
|
+
const path = await import(
|
|
158
|
+
/* @vite-ignore */
|
|
159
|
+
"path"
|
|
160
|
+
);
|
|
161
|
+
const url = await import(
|
|
162
|
+
/* @vite-ignore */
|
|
163
|
+
"url"
|
|
164
|
+
);
|
|
165
|
+
const __dirname = path.dirname(url.fileURLToPath(import.meta.url));
|
|
166
|
+
const envPath = process.env.KREUZBERG_PDFIUM_PATH;
|
|
167
|
+
const candidates = [];
|
|
168
|
+
if (envPath) {
|
|
169
|
+
candidates.push(path.join(envPath, "pdfium.js"));
|
|
170
|
+
candidates.push(envPath);
|
|
171
|
+
}
|
|
172
|
+
candidates.push(
|
|
173
|
+
path.join(__dirname, "..", "pdfium.js"),
|
|
174
|
+
// dist/pdfium.js
|
|
175
|
+
path.join(__dirname, "pdfium.js"),
|
|
176
|
+
// dist/initialization/pdfium.js
|
|
177
|
+
path.join(__dirname, "..", "..", "pdfium.js")
|
|
178
|
+
// package root pdfium.js
|
|
179
|
+
);
|
|
180
|
+
for (const candidate of candidates) {
|
|
181
|
+
try {
|
|
182
|
+
await fs.access(candidate);
|
|
183
|
+
const moduleUrl = url.pathToFileURL(candidate).href;
|
|
184
|
+
return await import(
|
|
185
|
+
/* @vite-ignore */
|
|
186
|
+
moduleUrl
|
|
187
|
+
);
|
|
188
|
+
} catch {
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
return null;
|
|
192
|
+
} catch {
|
|
193
|
+
return null;
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
async function loadPdfiumModule() {
|
|
197
|
+
if (isNode()) {
|
|
198
|
+
return loadPdfiumForNode();
|
|
199
|
+
}
|
|
200
|
+
try {
|
|
201
|
+
return await import("./pdfium.js");
|
|
202
|
+
} catch {
|
|
203
|
+
return null;
|
|
204
|
+
}
|
|
205
|
+
}
|
|
177
206
|
async function initializePdfiumAsync(wasmModule) {
|
|
178
207
|
if (!wasmModule || typeof wasmModule.initialize_pdfium_render !== "function") {
|
|
179
208
|
return;
|
|
180
209
|
}
|
|
181
|
-
if (!isBrowser()) {
|
|
182
|
-
console.debug("PDFium initialization skipped (non-browser environment)");
|
|
183
|
-
return;
|
|
184
|
-
}
|
|
185
210
|
try {
|
|
186
|
-
const pdfiumModule = await
|
|
211
|
+
const pdfiumModule = await loadPdfiumModule();
|
|
212
|
+
if (!pdfiumModule) {
|
|
213
|
+
console.debug("PDFium module not found, PDF extraction will not be available");
|
|
214
|
+
console.debug("To enable PDF support, provide pdfium.js via KREUZBERG_PDFIUM_PATH or manual initialization");
|
|
215
|
+
return;
|
|
216
|
+
}
|
|
187
217
|
const pdfium = typeof pdfiumModule.default === "function" ? await pdfiumModule.default() : pdfiumModule;
|
|
188
218
|
const success = wasmModule.initialize_pdfium_render(pdfium, wasmModule, false);
|
|
189
219
|
if (!success) {
|
|
@@ -222,7 +252,7 @@ function configToJS(config) {
|
|
|
222
252
|
if (!config) {
|
|
223
253
|
return {};
|
|
224
254
|
}
|
|
225
|
-
const
|
|
255
|
+
const toSnakeCase = (str) => str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
|
|
226
256
|
const normalizeValue = (value) => {
|
|
227
257
|
if (value === null || value === void 0) {
|
|
228
258
|
return null;
|
|
@@ -236,17 +266,18 @@ function configToJS(config) {
|
|
|
236
266
|
for (const [key, val] of Object.entries(obj)) {
|
|
237
267
|
const normalizedVal = normalizeValue(val);
|
|
238
268
|
if (normalizedVal !== null && normalizedVal !== void 0) {
|
|
239
|
-
normalized2[key] = normalizedVal;
|
|
269
|
+
normalized2[toSnakeCase(key)] = normalizedVal;
|
|
240
270
|
}
|
|
241
271
|
}
|
|
242
272
|
return Object.keys(normalized2).length > 0 ? normalized2 : null;
|
|
243
273
|
}
|
|
244
274
|
return value;
|
|
245
275
|
};
|
|
276
|
+
const normalized = {};
|
|
246
277
|
for (const [key, value] of Object.entries(config)) {
|
|
247
278
|
const normalizedValue = normalizeValue(value);
|
|
248
279
|
if (normalizedValue !== null && normalizedValue !== void 0) {
|
|
249
|
-
normalized[key] = normalizedValue;
|
|
280
|
+
normalized[toSnakeCase(key)] = normalizedValue;
|
|
250
281
|
}
|
|
251
282
|
}
|
|
252
283
|
return normalized;
|
|
@@ -349,8 +380,13 @@ function jsToExtractionResult(jsValue) {
|
|
|
349
380
|
throw new Error("Invalid image structure");
|
|
350
381
|
}
|
|
351
382
|
const img = image;
|
|
352
|
-
|
|
353
|
-
|
|
383
|
+
let imageData;
|
|
384
|
+
if (img.data instanceof Uint8Array) {
|
|
385
|
+
imageData = img.data;
|
|
386
|
+
} else if (Array.isArray(img.data)) {
|
|
387
|
+
imageData = new Uint8Array(img.data);
|
|
388
|
+
} else {
|
|
389
|
+
throw new Error("Invalid image: data must be Uint8Array or array");
|
|
354
390
|
}
|
|
355
391
|
if (typeof img.format !== "string") {
|
|
356
392
|
throw new Error("Invalid image: missing format");
|
|
@@ -385,7 +421,7 @@ function jsToExtractionResult(jsValue) {
|
|
|
385
421
|
throw new Error("Invalid image: description must be a string or null");
|
|
386
422
|
}
|
|
387
423
|
return {
|
|
388
|
-
data:
|
|
424
|
+
data: imageData,
|
|
389
425
|
format: img.format,
|
|
390
426
|
imageIndex,
|
|
391
427
|
pageNumber: pageNumber ?? null,
|
|
@@ -413,6 +449,7 @@ function jsToExtractionResult(jsValue) {
|
|
|
413
449
|
const ocrElements = result.ocrElements ?? result.ocr_elements ?? null;
|
|
414
450
|
const document2 = result.document ?? null;
|
|
415
451
|
const pages = result.pages ?? null;
|
|
452
|
+
const annotations = result.annotations ?? null;
|
|
416
453
|
return {
|
|
417
454
|
content: result.content,
|
|
418
455
|
mimeType,
|
|
@@ -427,7 +464,8 @@ function jsToExtractionResult(jsValue) {
|
|
|
427
464
|
processingWarnings,
|
|
428
465
|
elements,
|
|
429
466
|
ocrElements,
|
|
430
|
-
document: document2
|
|
467
|
+
document: document2,
|
|
468
|
+
annotations
|
|
431
469
|
};
|
|
432
470
|
}
|
|
433
471
|
function wrapWasmError(error, context) {
|
|
@@ -1261,24 +1299,173 @@ var TesseractWasmBackend = class {
|
|
|
1261
1299
|
};
|
|
1262
1300
|
|
|
1263
1301
|
// typescript/ocr/enabler.ts
|
|
1302
|
+
var TESSDATA_CDN_BASE = "https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/main";
|
|
1303
|
+
var NativeWasmOcrBackend = class {
|
|
1304
|
+
tessdataCache = /* @__PURE__ */ new Map();
|
|
1305
|
+
tessdataCdnBase = TESSDATA_CDN_BASE;
|
|
1306
|
+
progressCallback = null;
|
|
1307
|
+
name() {
|
|
1308
|
+
return "kreuzberg-tesseract";
|
|
1309
|
+
}
|
|
1310
|
+
supportedLanguages() {
|
|
1311
|
+
return [
|
|
1312
|
+
"eng",
|
|
1313
|
+
"deu",
|
|
1314
|
+
"fra",
|
|
1315
|
+
"spa",
|
|
1316
|
+
"ita",
|
|
1317
|
+
"por",
|
|
1318
|
+
"nld",
|
|
1319
|
+
"rus",
|
|
1320
|
+
"jpn",
|
|
1321
|
+
"kor",
|
|
1322
|
+
"chi_sim",
|
|
1323
|
+
"chi_tra",
|
|
1324
|
+
"pol",
|
|
1325
|
+
"tur",
|
|
1326
|
+
"swe",
|
|
1327
|
+
"dan",
|
|
1328
|
+
"fin",
|
|
1329
|
+
"nor",
|
|
1330
|
+
"ces",
|
|
1331
|
+
"slk",
|
|
1332
|
+
"ron",
|
|
1333
|
+
"hun",
|
|
1334
|
+
"hrv",
|
|
1335
|
+
"srp",
|
|
1336
|
+
"bul",
|
|
1337
|
+
"ukr",
|
|
1338
|
+
"ell",
|
|
1339
|
+
"ara",
|
|
1340
|
+
"heb",
|
|
1341
|
+
"hin",
|
|
1342
|
+
"tha",
|
|
1343
|
+
"vie",
|
|
1344
|
+
"mkd",
|
|
1345
|
+
"ben",
|
|
1346
|
+
"tam",
|
|
1347
|
+
"tel",
|
|
1348
|
+
"kan",
|
|
1349
|
+
"mal",
|
|
1350
|
+
"mya",
|
|
1351
|
+
"khm",
|
|
1352
|
+
"lao",
|
|
1353
|
+
"sin"
|
|
1354
|
+
];
|
|
1355
|
+
}
|
|
1356
|
+
async initialize() {
|
|
1357
|
+
const wasm2 = getWasmModule();
|
|
1358
|
+
if (!wasm2?.ocrIsAvailable || !wasm2.ocrIsAvailable()) {
|
|
1359
|
+
throw new Error(
|
|
1360
|
+
"Native WASM OCR is not available. Build with the 'ocr-wasm' feature to enable kreuzberg-tesseract."
|
|
1361
|
+
);
|
|
1362
|
+
}
|
|
1363
|
+
}
|
|
1364
|
+
async shutdown() {
|
|
1365
|
+
this.tessdataCache.clear();
|
|
1366
|
+
this.progressCallback = null;
|
|
1367
|
+
}
|
|
1368
|
+
setProgressCallback(callback) {
|
|
1369
|
+
this.progressCallback = callback;
|
|
1370
|
+
}
|
|
1371
|
+
async processImage(imageBytes, language) {
|
|
1372
|
+
const wasm2 = getWasmModule();
|
|
1373
|
+
if (!wasm2?.ocrRecognize) {
|
|
1374
|
+
throw new Error("Native WASM OCR function not available");
|
|
1375
|
+
}
|
|
1376
|
+
const normalizedLang = language.toLowerCase();
|
|
1377
|
+
this.reportProgress(10);
|
|
1378
|
+
const tessdata = await this.getTessdata(normalizedLang);
|
|
1379
|
+
this.reportProgress(40);
|
|
1380
|
+
let imageData;
|
|
1381
|
+
if (typeof imageBytes === "string") {
|
|
1382
|
+
const binaryString = atob(imageBytes);
|
|
1383
|
+
imageData = new Uint8Array(binaryString.length);
|
|
1384
|
+
for (let i = 0; i < binaryString.length; i++) {
|
|
1385
|
+
imageData[i] = binaryString.charCodeAt(i);
|
|
1386
|
+
}
|
|
1387
|
+
} else {
|
|
1388
|
+
imageData = imageBytes;
|
|
1389
|
+
}
|
|
1390
|
+
this.reportProgress(50);
|
|
1391
|
+
const text = wasm2.ocrRecognize(imageData, tessdata, normalizedLang);
|
|
1392
|
+
this.reportProgress(90);
|
|
1393
|
+
return {
|
|
1394
|
+
content: text,
|
|
1395
|
+
mime_type: "text/plain",
|
|
1396
|
+
metadata: { language: normalizedLang },
|
|
1397
|
+
tables: []
|
|
1398
|
+
};
|
|
1399
|
+
}
|
|
1400
|
+
async getTessdata(language) {
|
|
1401
|
+
const cached = this.tessdataCache.get(language);
|
|
1402
|
+
if (cached) {
|
|
1403
|
+
return cached;
|
|
1404
|
+
}
|
|
1405
|
+
const url = `${this.tessdataCdnBase}/${language}.traineddata`;
|
|
1406
|
+
const response = await fetch(url);
|
|
1407
|
+
if (!response.ok) {
|
|
1408
|
+
throw new Error(`Failed to download tessdata for "${language}" from ${url}: ${response.status}`);
|
|
1409
|
+
}
|
|
1410
|
+
const data = new Uint8Array(await response.arrayBuffer());
|
|
1411
|
+
this.tessdataCache.set(language, data);
|
|
1412
|
+
return data;
|
|
1413
|
+
}
|
|
1414
|
+
reportProgress(progress) {
|
|
1415
|
+
if (this.progressCallback) {
|
|
1416
|
+
try {
|
|
1417
|
+
this.progressCallback(Math.min(100, Math.max(0, progress)));
|
|
1418
|
+
} catch {
|
|
1419
|
+
}
|
|
1420
|
+
}
|
|
1421
|
+
}
|
|
1422
|
+
};
|
|
1264
1423
|
/**
 * Enables OCR by creating, initializing and registering the first available backend.
 *
 * Selection order:
 *   1. `NativeWasmOcrBackend` when the WASM module exposes `ocrIsAvailable()`
 *      and it returns a truthy value.
 *   2. `TesseractWasmBackend` when `isBrowser()` is truthy.
 *
 * The chosen backend is initialized, passed to `registerOcrBackend`, and then
 * passed to `registerBackendInRustRegistry` together with the WASM module.
 *
 * @returns {Promise<void>}
 * @throws {Error} "WASM module not initialized. Call initWasm() first." when
 *   the module has not been initialized.
 * @throws {Error} "Failed to enable OCR: <message>" for any failure during
 *   selection, initialization or registration (including the case where no
 *   backend is available). The original error is attached as `cause`.
 */
async function enableOcr() {
  if (!isInitialized2()) {
    throw new Error("WASM module not initialized. Call initWasm() first.");
  }
  try {
    const wasm2 = getWasmModule();
    let backend;
    if (wasm2?.ocrIsAvailable?.()) {
      backend = new NativeWasmOcrBackend();
    } else if (isBrowser()) {
      backend = new TesseractWasmBackend();
    } else {
      throw new Error(
        "No OCR backend available. Build with the 'ocr-wasm' feature to enable native Tesseract OCR in all environments, or use a browser environment with the tesseract-wasm npm package."
      );
    }
    // Both backends follow the same initialize -> register sequence.
    await backend.initialize();
    registerOcrBackend(backend);
    registerBackendInRustRegistry(wasm2, backend);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    // Keep the original error (and its stack) reachable for debugging.
    throw new Error(`Failed to enable OCR: ${message}`, { cause: error });
  }
}
|
|
1451
|
+
/**
 * Best-effort registration of an OCR backend through the WASM module's
 * `register_ocr_backend` export (presumably the Rust-side registry, per the
 * function name).
 *
 * Does nothing when `wasm2` is nullish or has no `register_ocr_backend`.
 * Otherwise it hands that function an adapter object exposing:
 *   - `name()`: always returns "tesseract";
 *   - `supportedLanguages()`: the backend's list, or `["eng"]` when the
 *     backend has no such method or it returns null/undefined;
 *   - `processImage(imageBase64, language)`: the backend's result, passed
 *     through when it is a string and JSON-stringified otherwise.
 *
 * Errors thrown by the registration call itself are swallowed.
 *
 * @param {object|null|undefined} wasm2 - WASM module object.
 * @param {object} backend - OCR backend providing `processImage` and,
 *   optionally, `supportedLanguages`.
 * @returns {void}
 */
function registerBackendInRustRegistry(wasm2, backend) {
  const register = wasm2?.register_ocr_backend;
  if (!register) {
    return;
  }
  const adapter = {
    name() {
      return "tesseract";
    },
    supportedLanguages() {
      const languages = backend.supportedLanguages?.();
      return languages ?? ["eng"];
    },
    async processImage(imageBase64, language) {
      const output = await backend.processImage(imageBase64, language);
      if (typeof output === "string") {
        return output;
      }
      return JSON.stringify(output);
    }
  };
  try {
    register(adapter);
  } catch {
    // Registration is optional; a failure here must not break OCR setup.
  }
}
|
|
1282
1469
|
|
|
1283
1470
|
// typescript/plugin-registry.ts
|
|
1284
1471
|
var postProcessors = /* @__PURE__ */ new Map();
|