@kreuzberg/wasm 4.4.5 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.5" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -33,6 +33,9 @@
33
33
  <a href="https://rubygems.org/gems/kreuzberg">
34
34
  <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
35
35
  </a>
36
+ <a href="https://kreuzberg-dev.r-universe.dev/kreuzberg">
37
+ <img src="https://img.shields.io/badge/R-kreuzberg-007ec6" alt="R">
38
+ </a>
36
39
  <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
37
40
  <img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
38
41
  </a>
@@ -44,6 +47,9 @@
44
47
  <a href="https://docs.kreuzberg.dev">
45
48
  <img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
46
49
  </a>
50
+ <a href="https://huggingface.co/Kreuzberg">
51
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
52
+ </a>
47
53
  </div>
48
54
 
49
55
  <img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
@@ -55,9 +61,7 @@
55
61
  </div>
56
62
 
57
63
 
58
- Extract text, tables, images, and metadata from 75+ file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Deno, and Cloudflare Workers with portable deployment and multi-threading support.
59
-
60
- > **Full Feature Parity** — The WASM package supports all extraction capabilities at full parity with native bindings: PDF (via PDFium), Excel/spreadsheets (via Calamine), archives (ZIP, TAR, 7z, GZIP), and OCR (via built-in Tesseract-WASM). No external dependencies required.
64
+ Extract text, tables, images, and metadata from 88+ file formats including PDF, Office documents, and images. WebAssembly bindings for browsers, Deno, and Cloudflare Workers with portable deployment and multi-threading support.
61
65
 
62
66
 
63
67
  ## Installation
@@ -97,7 +101,7 @@ yarn add @kreuzberg/wasm
97
101
  ### System Requirements
98
102
 
99
103
  - Modern browser with WebAssembly support, or Deno 1.0+, or Cloudflare Workers
100
- - OCR is built-in via Tesseract-WASM (enable at runtime with `enableOcr()`)
104
+ - Optional: [Tesseract WASM](https://github.com/naptha/tesseract.js) for OCR functionality
101
105
 
102
106
 
103
107
 
@@ -176,40 +180,6 @@ extractWithOcr().catch(console.error);
176
180
  See [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
177
181
 
178
182
 
179
- #### Excel/Spreadsheet Extraction
180
-
181
- Extract structured data from Excel files directly in the browser or server-side runtimes:
182
-
183
- ```ts
184
- import { extractBytes, initWasm } from "@kreuzberg/wasm";
185
-
186
- async function extractSpreadsheet() {
187
- await initWasm();
188
-
189
- const bytes = new Uint8Array(
190
- await fetch("report.xlsx").then((r) => r.arrayBuffer()),
191
- );
192
-
193
- const result = await extractBytes(
194
- bytes,
195
- "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
196
- );
197
-
198
- console.log("Spreadsheet content:");
199
- console.log(result.content);
200
-
201
- if (result.tables && result.tables.length > 0) {
202
- result.tables.forEach((table, index) => {
203
- console.log(`\nSheet ${index + 1}:`);
204
- console.log(table.markdown);
205
- });
206
- }
207
- }
208
-
209
- extractSpreadsheet().catch(console.error);
210
- ```
211
-
212
-
213
183
 
214
184
  #### Processing Multiple Files
215
185
 
@@ -301,19 +271,21 @@ extractDocuments(fileBytes, mimes)
301
271
 
302
272
  ## Features
303
273
 
304
- ### Supported File Formats (75+)
274
+ ### Supported File Formats (88+)
305
275
 
306
- 75+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
276
+ 88+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
307
277
 
308
278
  #### Office Documents
309
279
 
310
280
  | Category | Formats | Capabilities |
311
281
  |----------|---------|--------------|
312
- | **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
313
- | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
314
- | **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
282
+ | **Word Processing** | `.docx`, `.docm`, `.dotx`, `.dotm`, `.dot`, `.odt` | Full text, tables, images, metadata, styles |
283
+ | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.xltx`, `.xlt`, `.ods` | Sheet data, formulas, cell metadata, charts |
284
+ | **Presentations** | `.pptx`, `.pptm`, `.ppsx`, `.potx`, `.potm`, `.pot`, `.ppt` | Slides, speaker notes, images, metadata |
315
285
  | **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
316
286
  | **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
287
+ | **Database** | `.dbf` | Table data extraction, field type support |
288
+ | **Hangul** | `.hwp`, `.hwpx` | Korean document format, text extraction |
317
289
 
318
290
  #### Images (OCR-Enabled)
319
291
 
@@ -354,10 +326,14 @@ extractDocuments(fileBytes, mimes)
354
326
  - **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
355
327
  - **Table Extraction** - Parse tables with structure and cell content preservation
356
328
  - **Image Extraction** - Extract embedded images and render page previews
357
- - **OCR Support** - Built-in Tesseract-WASM for scanned documents and images
358
- - **Full Feature Parity** - All extraction capabilities at parity with native bindings: PDF, Excel, archives, OCR, and 75+ formats
329
+ - **OCR Support** - Integrate multiple OCR backends for scanned documents
330
+
359
331
  - **Async/Await** - Non-blocking document processing with concurrent operations
332
+
333
+
360
334
  - **Plugin System** - Extensible post-processing for custom text transformation
335
+
336
+
361
337
  - **Batch Processing** - Efficiently process multiple documents in parallel
362
338
  - **Memory Efficient** - Stream large files without loading entirely into memory
363
339
  - **Language Detection** - Detect and support multiple languages in documents
@@ -1 +1 @@
1
- {"version":3,"file":"wasm-adapter.d.ts","sourceRoot":"","sources":["../../typescript/adapters/wasm-adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAEH,OAAO,KAAK,EAMX,gBAAgB,EAChB,gBAAgB,EAOhB,MAAM,aAAa,CAAC;AAoCrB;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAsB,gBAAgB,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI,GAAG,OAAO,CAAC,UAAU,CAAC,CAa7E;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,gBAAgB,GAAG,IAAI,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAsCnF;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,OAAO,GAAG,gBAAgB,CAwOvE;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,GAAG,KAAK,CASpE;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,IAAI,gBAAgB,CAajF"}
1
+ {"version":3,"file":"wasm-adapter.d.ts","sourceRoot":"","sources":["../../typescript/adapters/wasm-adapter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAEH,OAAO,KAAK,EAMX,gBAAgB,EAChB,gBAAgB,EAOhB,MAAM,aAAa,CAAC;AAoCrB;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAsB,gBAAgB,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI,GAAG,OAAO,CAAC,UAAU,CAAC,CAa7E;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,UAAU,CAAC,MAAM,EAAE,gBAAgB,GAAG,IAAI,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAsCnF;AAED;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,OAAO,GAAG,gBAAgB,CAiRvE;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,aAAa,CAAC,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,GAAG,KAAK,CASpE;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,uBAAuB,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,IAAI,gBAAgB,CAajF"}
@@ -137,15 +137,48 @@ function jsToExtractionResult(jsValue) {
137
137
  if (tokenCountValue !== null && tokenCountValue !== void 0) {
138
138
  tokenCount = coerceToNumber(tokenCountValue, "tokenCount");
139
139
  }
140
+ let firstPage = null;
141
+ const firstPageValue = metadata.firstPage ?? metadata.first_page;
142
+ if (firstPageValue !== null && firstPageValue !== void 0) {
143
+ firstPage = coerceToNumber(firstPageValue, "firstPage");
144
+ }
145
+ let lastPage = null;
146
+ const lastPageValue = metadata.lastPage ?? metadata.last_page;
147
+ if (lastPageValue !== null && lastPageValue !== void 0) {
148
+ lastPage = coerceToNumber(lastPageValue, "lastPage");
149
+ }
150
+ const rawHc = metadata["heading_context"] ?? metadata["headingContext"];
151
+ let headingContext = null;
152
+ if (rawHc && typeof rawHc === "object") {
153
+ const rawHeadings = rawHc["headings"];
154
+ if (Array.isArray(rawHeadings)) {
155
+ headingContext = {
156
+ headings: rawHeadings.map((h) => {
157
+ const heading = h;
158
+ return {
159
+ // biome-ignore lint/complexity/useLiteralKeys: dynamic property access from raw object
160
+ level: heading["level"] ?? 0,
161
+ // biome-ignore lint/complexity/useLiteralKeys: dynamic property access from raw object
162
+ text: heading["text"] ?? ""
163
+ };
164
+ })
165
+ };
166
+ }
167
+ }
140
168
  return {
141
169
  content: c.content,
142
170
  embedding,
143
171
  metadata: {
172
+ byteStart: charStart,
173
+ byteEnd: charEnd,
144
174
  charStart,
145
175
  charEnd,
146
176
  tokenCount,
147
177
  chunkIndex,
148
- totalChunks
178
+ totalChunks,
179
+ firstPage,
180
+ lastPage,
181
+ headingContext
149
182
  }
150
183
  };
151
184
  }) : null;
@@ -1 +1 @@
1
- {"version":3,"sources":["../../typescript/adapters/wasm-adapter.ts"],"sourcesContent":["/**\n * WASM Type Adapter\n *\n * This module provides type adapters for converting between JavaScript/TypeScript\n * types and WASM-compatible types, handling File/Blob conversions, config normalization,\n * and result parsing.\n *\n * @example File Conversion\n * ```typescript\n * import { fileToUint8Array } from '@kreuzberg/wasm/adapters/wasm-adapter';\n *\n * const file = event.target.files[0];\n * const bytes = await fileToUint8Array(file);\n * const result = await extractBytes(bytes, file.type);\n * ```\n *\n * @example Config Normalization\n * ```typescript\n * import { configToJS } from '@kreuzberg/wasm/adapters/wasm-adapter';\n *\n * const config = {\n * ocr: { backend: 'tesseract', language: 'eng' },\n * chunking: { maxChars: 1000 }\n * };\n * const normalized = configToJS(config);\n * ```\n */\n\nimport type {\n\tChunk,\n\tDocumentStructure,\n\tElement,\n\tExtractedImage,\n\tExtractedKeyword,\n\tExtractionConfig,\n\tExtractionResult,\n\tMetadata,\n\tOcrElement,\n\tPageContent,\n\tPdfAnnotation,\n\tProcessingWarning,\n\tTable,\n} from \"../types.js\";\n\n/**\n * Maximum file size for processing (512 MB)\n *\n * @internal\n */\nconst MAX_FILE_SIZE = 512 * 1024 * 1024;\n\n/**\n * Type predicate to validate numeric value or null\n *\n * @internal\n */\nfunction isNumberOrNull(value: unknown): value is number | null {\n\treturn typeof value === \"number\" || value === null || value === undefined;\n}\n\n/**\n * Type predicate to validate string value or null\n *\n * @internal\n */\nfunction isStringOrNull(value: unknown): value is string | null {\n\treturn typeof value === \"string\" || value === null || value === undefined;\n}\n\n/**\n * Type predicate to validate boolean value\n *\n * @internal\n */\nfunction isBoolean(value: unknown): value is boolean {\n\treturn typeof value === \"boolean\" || value === undefined;\n}\n\n/**\n * Convert a File or Blob to Uint8Array\n *\n * Handles both browser File API and server-side Blob-like objects,\n * providing a unified interface for reading binary data.\n *\n * @param file - The File or Blob to convert\n * @returns Promise resolving to the byte array\n * @throws {Error} If the file cannot be read or exceeds size limit\n *\n * @example\n * ```typescript\n * const file = document.getElementById('input').files[0];\n * const bytes = await fileToUint8Array(file);\n * const result = await extractBytes(bytes, 'application/pdf');\n * ```\n */\nexport async function fileToUint8Array(file: File | Blob): Promise<Uint8Array> {\n\ttry {\n\t\tif (file.size > MAX_FILE_SIZE) {\n\t\t\tthrow new Error(\n\t\t\t\t`File size (${file.size} bytes) exceeds maximum (${MAX_FILE_SIZE} bytes). Maximum file size is 512 MB.`,\n\t\t\t);\n\t\t}\n\n\t\tconst arrayBuffer = await file.arrayBuffer();\n\t\treturn new Uint8Array(arrayBuffer);\n\t} catch (error) {\n\t\tthrow new Error(`Failed to read file: ${error instanceof Error ? error.message : String(error)}`);\n\t}\n}\n\n/**\n * Normalize ExtractionConfig for WASM processing\n *\n * Converts TypeScript configuration objects to a WASM-compatible format,\n * handling null values, undefined properties, and nested structures.\n *\n * @param config - The extraction configuration or null\n * @returns Normalized configuration object suitable for WASM\n *\n * @example\n * ```typescript\n * const config: ExtractionConfig = {\n * ocr: { backend: 'tesseract' },\n * chunking: { maxChars: 1000 }\n * };\n * const wasmConfig = configToJS(config);\n * ```\n */\nexport function configToJS(config: ExtractionConfig | null): Record<string, unknown> {\n\tif (!config) {\n\t\treturn {};\n\t}\n\n\t// Convert camelCase key to snake_case to match Rust serde field names.\n\tconst toSnakeCase = (str: string): string => str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);\n\n\tconst normalizeValue = (value: unknown): unknown => {\n\t\tif (value === null || value === undefined) {\n\t\t\treturn null;\n\t\t}\n\t\tif (typeof value === \"object\") {\n\t\t\tif (Array.isArray(value)) {\n\t\t\t\treturn value.map(normalizeValue);\n\t\t\t}\n\t\t\tconst obj = value as Record<string, unknown>;\n\t\t\tconst normalized: Record<string, unknown> = {};\n\t\t\tfor (const [key, val] of Object.entries(obj)) {\n\t\t\t\tconst normalizedVal = normalizeValue(val);\n\t\t\t\tif (normalizedVal !== null && normalizedVal !== undefined) {\n\t\t\t\t\tnormalized[toSnakeCase(key)] = normalizedVal;\n\t\t\t\t}\n\t\t\t}\n\t\t\treturn Object.keys(normalized).length > 0 ? normalized : null;\n\t\t}\n\t\treturn value;\n\t};\n\n\tconst normalized: Record<string, unknown> = {};\n\tfor (const [key, value] of Object.entries(config)) {\n\t\tconst normalizedValue = normalizeValue(value);\n\t\tif (normalizedValue !== null && normalizedValue !== undefined) {\n\t\t\tnormalized[toSnakeCase(key)] = normalizedValue;\n\t\t}\n\t}\n\n\treturn normalized;\n}\n\n/**\n * Parse WASM extraction result and convert to TypeScript type\n *\n * Handles conversion of WASM-returned objects to proper ExtractionResult types,\n * including proper array conversions and type assertions for tables, chunks, and images.\n *\n * @param jsValue - The raw WASM result value\n * @returns Properly typed ExtractionResult\n * @throws {Error} If the result structure is invalid\n *\n * @example\n * ```typescript\n * const wasmResult = await wasmExtract(bytes, mimeType, config);\n * const result = jsToExtractionResult(wasmResult);\n * console.log(result.content);\n * ```\n */\nexport function jsToExtractionResult(jsValue: unknown): ExtractionResult {\n\tif (!jsValue || typeof jsValue !== \"object\") {\n\t\tthrow new Error(\"Invalid extraction result: value is not an object\");\n\t}\n\n\tconst result = jsValue as Record<string, unknown>;\n\tconst mimeType =\n\t\ttypeof result.mimeType === \"string\"\n\t\t\t? result.mimeType\n\t\t\t: typeof result.mime_type === \"string\"\n\t\t\t\t? result.mime_type\n\t\t\t\t: null;\n\n\tif (typeof result.content !== \"string\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid content\");\n\t}\n\tif (typeof mimeType !== \"string\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid mimeType\");\n\t}\n\tif (!result.metadata || typeof result.metadata !== \"object\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid metadata\");\n\t}\n\n\tconst tables: Table[] = [];\n\tif (Array.isArray(result.tables)) {\n\t\tfor (const table of result.tables) {\n\t\t\tif (table && typeof table === \"object\") {\n\t\t\t\tconst t = table as Record<string, unknown>;\n\t\t\t\tconst pageNumber =\n\t\t\t\t\ttypeof t.pageNumber === \"number\" ? t.pageNumber : typeof t.page_number === \"number\" ? t.page_number : null;\n\t\t\t\tif (\n\t\t\t\t\tArray.isArray(t.cells) &&\n\t\t\t\t\tt.cells.every((row) => Array.isArray(row) && row.every((cell) => typeof cell === \"string\")) &&\n\t\t\t\t\ttypeof t.markdown === \"string\" &&\n\t\t\t\t\tpageNumber !== null\n\t\t\t\t) {\n\t\t\t\t\ttables.push({\n\t\t\t\t\t\tcells: t.cells as string[][],\n\t\t\t\t\t\tmarkdown: t.markdown,\n\t\t\t\t\t\tpageNumber,\n\t\t\t\t\t});\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\tconst chunks: Chunk[] | null = Array.isArray(result.chunks)\n\t\t? result.chunks.map((chunk) => {\n\t\t\t\tif (!chunk || typeof chunk !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk structure\");\n\t\t\t\t}\n\t\t\t\tconst c = chunk as Record<string, unknown>;\n\t\t\t\tif (typeof c.content !== \"string\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk: missing content\");\n\t\t\t\t}\n\t\t\t\tif (!c.metadata || typeof c.metadata !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk: missing metadata\");\n\t\t\t\t}\n\t\t\t\tconst metadata = c.metadata as Record<string, unknown>;\n\n\t\t\t\tlet embedding: number[] | null = null;\n\t\t\t\tif (Array.isArray(c.embedding)) {\n\t\t\t\t\tif (!c.embedding.every((item) => typeof item === \"number\")) {\n\t\t\t\t\t\tthrow new Error(\"Invalid chunk: embedding must contain only numbers\");\n\t\t\t\t\t}\n\t\t\t\t\tembedding = c.embedding;\n\t\t\t\t}\n\n\t\t\t\t// Coerce numeric values - handle BigInt, strings, and numbers\n\t\t\t\tconst coerceToNumber = (value: unknown, fieldName: string): number => {\n\t\t\t\t\tif (typeof value === \"number\") {\n\t\t\t\t\t\treturn value;\n\t\t\t\t\t}\n\t\t\t\t\tif (typeof value === \"bigint\") {\n\t\t\t\t\t\treturn Number(value);\n\t\t\t\t\t}\n\t\t\t\t\tif (typeof value === \"string\") {\n\t\t\t\t\t\tconst parsed = parseInt(value, 10);\n\t\t\t\t\t\tif (Number.isNaN(parsed)) {\n\t\t\t\t\t\t\tthrow new Error(`Invalid chunk metadata: ${fieldName} must be a valid number, got \"${value}\"`);\n\t\t\t\t\t\t}\n\t\t\t\t\t\treturn parsed;\n\t\t\t\t\t}\n\t\t\t\t\tthrow new Error(`Invalid chunk metadata: ${fieldName} must be a number, got ${typeof value}`);\n\t\t\t\t};\n\n\t\t\t\t// The Rust code uses snake_case field names (byte_start, byte_end, etc)\n\t\t\t\t// but TypeScript expects camelCase (charStart, charEnd, etc)\n\t\t\t\t// For now, treat byte offsets as character offsets since the content is UTF-8\n\t\t\t\tconst charStart = coerceToNumber(\n\t\t\t\t\tmetadata.charStart ?? metadata.char_start ?? metadata.byteStart ?? metadata.byte_start,\n\t\t\t\t\t\"charStart\",\n\t\t\t\t);\n\t\t\t\tconst charEnd = coerceToNumber(\n\t\t\t\t\tmetadata.charEnd ?? metadata.char_end ?? metadata.byteEnd ?? metadata.byte_end,\n\t\t\t\t\t\"charEnd\",\n\t\t\t\t);\n\t\t\t\tconst chunkIndex = coerceToNumber(metadata.chunkIndex ?? metadata.chunk_index, \"chunkIndex\");\n\t\t\t\tconst totalChunks = coerceToNumber(metadata.totalChunks ?? metadata.total_chunks, \"totalChunks\");\n\n\t\t\t\tlet tokenCount: number | null = null;\n\t\t\t\tconst tokenCountValue = metadata.tokenCount ?? metadata.token_count;\n\t\t\t\tif (tokenCountValue !== null && tokenCountValue !== undefined) {\n\t\t\t\t\ttokenCount = coerceToNumber(tokenCountValue, \"tokenCount\");\n\t\t\t\t}\n\n\t\t\t\treturn {\n\t\t\t\t\tcontent: c.content,\n\t\t\t\t\tembedding,\n\t\t\t\t\tmetadata: {\n\t\t\t\t\t\tcharStart,\n\t\t\t\t\t\tcharEnd,\n\t\t\t\t\t\ttokenCount,\n\t\t\t\t\t\tchunkIndex,\n\t\t\t\t\t\ttotalChunks,\n\t\t\t\t\t},\n\t\t\t\t};\n\t\t\t})\n\t\t: null;\n\n\tconst images: ExtractedImage[] | null = Array.isArray(result.images)\n\t\t? result.images.map((image) => {\n\t\t\t\tif (!image || typeof image !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid image structure\");\n\t\t\t\t}\n\t\t\t\tconst img = image as Record<string, unknown>;\n\t\t\t\tlet imageData: Uint8Array;\n\t\t\t\tif (img.data instanceof Uint8Array) {\n\t\t\t\t\timageData = img.data;\n\t\t\t\t} else if (Array.isArray(img.data)) {\n\t\t\t\t\timageData = new Uint8Array(img.data as number[]);\n\t\t\t\t} else {\n\t\t\t\t\tthrow new Error(\"Invalid image: data must be Uint8Array or array\");\n\t\t\t\t}\n\t\t\t\tif (typeof img.format !== \"string\") {\n\t\t\t\t\tthrow new Error(\"Invalid image: missing format\");\n\t\t\t\t}\n\n\t\t\t\t// Support both camelCase and snake_case field names (Rust serde uses snake_case)\n\t\t\t\tconst imageIndex = img.imageIndex ?? img.image_index;\n\t\t\t\tconst pageNumber = img.pageNumber ?? img.page_number;\n\t\t\t\tconst bitsPerComponent = img.bitsPerComponent ?? img.bits_per_component;\n\t\t\t\tconst isMask = img.isMask ?? img.is_mask;\n\t\t\t\tconst ocrResult = img.ocrResult ?? img.ocr_result;\n\n\t\t\t\tif (typeof imageIndex !== \"number\") {\n\t\t\t\t\tthrow new Error(\"Invalid image: imageIndex must be a number\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(pageNumber)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: pageNumber must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(img.width)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: width must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(img.height)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: height must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(bitsPerComponent)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: bitsPerComponent must be a number or null\");\n\t\t\t\t}\n\n\t\t\t\tif (!isBoolean(isMask)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: isMask must be a boolean\");\n\t\t\t\t}\n\n\t\t\t\tif (!isStringOrNull(img.colorspace)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: colorspace must be a string or null\");\n\t\t\t\t}\n\t\t\t\tif (!isStringOrNull(img.description)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: description must be a string or null\");\n\t\t\t\t}\n\n\t\t\t\treturn {\n\t\t\t\t\tdata: imageData,\n\t\t\t\t\tformat: img.format,\n\t\t\t\t\timageIndex: imageIndex,\n\t\t\t\t\tpageNumber: pageNumber ?? null,\n\t\t\t\t\twidth: (img.width as number) ?? null,\n\t\t\t\t\theight: (img.height as number) ?? null,\n\t\t\t\t\tcolorspace: (img.colorspace as string) ?? null,\n\t\t\t\t\tbitsPerComponent: bitsPerComponent ?? null,\n\t\t\t\t\tisMask: isMask ?? false,\n\t\t\t\t\tdescription: (img.description as string) ?? null,\n\t\t\t\t\tocrResult: ocrResult ? jsToExtractionResult(ocrResult) : null,\n\t\t\t\t};\n\t\t\t})\n\t\t: null;\n\n\tlet detectedLanguages: string[] | null = null;\n\tconst detectedLanguagesRaw = Array.isArray(result.detectedLanguages)\n\t\t? result.detectedLanguages\n\t\t: result.detected_languages;\n\tif (Array.isArray(detectedLanguagesRaw)) {\n\t\tif (!detectedLanguagesRaw.every((lang) => typeof lang === \"string\")) {\n\t\t\tthrow new Error(\"Invalid result: detectedLanguages must contain only strings\");\n\t\t}\n\t\tdetectedLanguages = detectedLanguagesRaw;\n\t}\n\n\tconst extractedKeywords = (result.extractedKeywords ?? result.extracted_keywords ?? null) as\n\t\t| ExtractedKeyword[]\n\t\t| null;\n\tconst qualityScore =\n\t\ttypeof (result.qualityScore ?? result.quality_score) === \"number\"\n\t\t\t? ((result.qualityScore ?? result.quality_score) as number)\n\t\t\t: null;\n\tconst processingWarnings = (result.processingWarnings ?? result.processing_warnings ?? null) as\n\t\t| ProcessingWarning[]\n\t\t| null;\n\tconst elements = (result.elements ?? null) as Element[] | null;\n\tconst ocrElements = (result.ocrElements ?? result.ocr_elements ?? null) as OcrElement[] | null;\n\tconst document = (result.document ?? null) as DocumentStructure | null;\n\tconst pages = (result.pages ?? null) as PageContent[] | null;\n\tconst annotations = (result.annotations ?? null) as PdfAnnotation[] | null;\n\n\treturn {\n\t\tcontent: result.content,\n\t\tmimeType,\n\t\tmetadata: (result.metadata ?? {}) as Metadata,\n\t\ttables,\n\t\tdetectedLanguages,\n\t\tchunks,\n\t\timages,\n\t\tpages,\n\t\textractedKeywords,\n\t\tqualityScore,\n\t\tprocessingWarnings,\n\t\telements,\n\t\tocrElements,\n\t\tdocument,\n\t\tannotations,\n\t};\n}\n\n/**\n * Wrap and format WASM errors with context\n *\n * Converts WASM error messages to JavaScript Error objects with proper context\n * and stack trace information when available.\n *\n * @param error - The error from WASM\n * @param context - Additional context about what operation failed\n * @returns A formatted Error object\n *\n * @internal\n *\n * @example\n * ```typescript\n * try {\n * await wasmExtract(bytes, mimeType);\n * } catch (error) {\n * throw wrapWasmError(error, 'extracting document');\n * }\n * ```\n */\nexport function wrapWasmError(error: unknown, context: string): Error {\n\tif (error instanceof Error) {\n\t\treturn new Error(`Error ${context}: ${error.message}`, {\n\t\t\tcause: error,\n\t\t});\n\t}\n\n\tconst message = String(error);\n\treturn new Error(`Error ${context}: ${message}`);\n}\n\n/**\n * Validate that a WASM-returned value conforms to ExtractionResult structure\n *\n * Performs structural validation without full type checking,\n * useful for runtime validation of WASM output.\n *\n * @param value - The value to validate\n * @returns True if value appears to be a valid ExtractionResult\n *\n * @internal\n */\nexport function isValidExtractionResult(value: unknown): value is ExtractionResult {\n\tif (!value || typeof value !== \"object\") {\n\t\treturn false;\n\t}\n\n\tconst obj = value as Record<string, unknown>;\n\treturn (\n\t\ttypeof obj.content === \"string\" &&\n\t\t(typeof obj.mimeType === \"string\" || typeof obj.mime_type === \"string\") &&\n\t\tobj.metadata !== null &&\n\t\ttypeof obj.metadata === \"object\" &&\n\t\tArray.isArray(obj.tables)\n\t);\n}\n"],"mappings":";AAiDA,IAAM,gBAAgB,MAAM,OAAO;AAOnC,SAAS,eAAe,OAAwC;AAC/D,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,UAAU;AACjE;AAOA,SAAS,eAAe,OAAwC;AAC/D,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,UAAU;AACjE;AAOA,SAAS,UAAU,OAAkC;AACpD,SAAO,OAAO,UAAU,aAAa,UAAU;AAChD;AAmBA,eAAsB,iBAAiB,MAAwC;AAC9E,MAAI;AACH,QAAI,KAAK,OAAO,eAAe;AAC9B,YAAM,IAAI;AAAA,QACT,cAAc,KAAK,IAAI,4BAA4B,aAAa;AAAA,MACjE;AAAA,IACD;AAEA,UAAM,cAAc,MAAM,KAAK,YAAY;AAC3C,WAAO,IAAI,WAAW,WAAW;AAAA,EAClC,SAAS,OAAO;AACf,UAAM,IAAI,MAAM,wBAAwB,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC,EAAE;AAAA,EACjG;AACD;AAoBO,SAAS,WAAW,QAA0D;AACpF,MAAI,CAAC,QAAQ;AACZ,WAAO,CAAC;AAAA,EACT;AAGA,QAAM,cAAc,CAAC,QAAwB,IAAI,QAAQ,UAAU,CAAC,WAAW,IAAI,OAAO,YAAY,CAAC,EAAE;AAEzG,QAAM,iBAAiB,CAAC,UAA4B;AACnD,QAAI,UAAU,QAAQ,UAAU,QAAW;AAC1C,aAAO;AAAA,IACR;AACA,QAAI,OAAO,UAAU,UAAU;AAC9B,UAAI,MAAM,QAAQ,KAAK,GAAG;AACzB,eAAO,MAAM,IAAI,cAAc;AAAA,MAChC;AACA,YAAM,MAAM;AACZ,YAAMA,cAAsC,CAAC;AAC7C,iBAAW,CAAC,KAAK,GAAG,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC7C,cAAM,gBAAgB,eAAe,GAAG;AACxC,YAAI,kBAAkB,QAAQ,kBAAkB,QAAW;AAC1D,UAAAA,YAAW,YAAY,GAAG,CAAC,IAAI;AAAA,QAChC;AAAA,MACD;AACA,aAAO,OAAO,KAAKA,WAAU,EAAE,SAAS,IAAIA,cAAa;AAAA,IAC1D;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAsC,CAAC;AAC7C,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,MAAM,GAAG;AAClD,UAAM,kBAAkB,eAAe,KAAK;AAC5C,QAAI,oBAAoB,QAAQ,oBAAoB,QAAW;AAC9D,iBAAW,YAAY,GAAG,CAAC,IAAI;AAAA,IAChC;AAAA,EACD;AAEA,SAAO;AACR;AAmBO,SAAS,qBAAqB,SAAoC;AACxE,MAAI,CAAC,WAAW,OAAO,YAAY,UAAU;AAC5C,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACpE;AAEA,QAAM,SAAS;AACf,QAAM,WACL,OAAO,OAAO,aAAa,WACxB,OAAO,WACP,OAAO,OAAO,cAAc,WAC3B,OAAO,YACP;AAEL,MAAI,OAAO,OAAO,YAAY,UAAU;AACvC,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACxE;AACA,MAAI,OAAO,aAAa,UAAU;AACjC,UAAM,IAAI,MAAM,wDAAwD;AAAA,EACzE;AACA,MAAI,CAAC,OAAO,YAAY,OAAO,OAAO,aAAa,UAAU;AAC5D,UAAM,IAAI,MAAM,wDAAwD;AAAA,EACzE;AAEA,QAAM,SAAkB,CAAC;AACzB,MAAI,MAAM,QAAQ,OAAO,MAAM,GAAG;AACjC,eAAW,SAAS,OAAO,QAAQ;AAClC,UAAI,SAAS,OAAO,UAAU,UAAU;AACvC,cAAM,IAAI;AACV,cAAM,aACL,OAAO,EAAE,eAAe,WAAW,EAAE,aAAa,OAAO,EAAE,gBAAgB,WAAW,EAAE,cAAc;AACvG,YACC,MAAM,QAAQ,EAAE,KAAK,KACrB,EAAE,MAAM,MAAM,CAAC,QAAQ,MAAM,QAAQ,GAAG,KAAK,IAAI,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,CAAC,KAC1F,OAAO,EAAE,aAAa,YACtB,eAAe,MACd;AACD,iBAAO,KAAK;AAAA,YACX,OAAO,EAAE;AAAA,YACT,UAAU,EAAE;AAAA,YACZ;AAAA,UACD,CAAC;AAAA,QACF;AAAA,MACD;AAAA,IACD;AAAA,EACD;AAEA,QAAM,SAAyB,MAAM,QAAQ,OAAO,MAAM,IACvD,OAAO,OAAO,IAAI,CAAC,UAAU;AAC7B,QAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,YAAM,IAAI,MAAM,yBAAyB;AAAA,IAC1C;AACA,UAAM,IAAI;AACV,QAAI,OAAO,EAAE,YAAY,UAAU;AAClC,YAAM,IAAI,MAAM,gCAAgC;AAAA,IACjD;AACA,QAAI,CAAC,EAAE,YAAY,OAAO,EAAE,aAAa,UAAU;AAClD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IAClD;AACA,UAAM,WAAW,EAAE;AAEnB,QAAI,YAA6B;AACjC,QAAI,MAAM,QAAQ,EAAE,SAAS,GAAG;AAC/B,UAAI,CAAC,EAAE,UAAU,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,GAAG;AAC3D,cAAM,IAAI,MAAM,oDAAoD;AAAA,MACrE;AACA,kBAAY,EAAE;AAAA,IACf;AAGA,UAAM,iBAAiB,CAAC,OAAgB,cAA8B;AACrE,UAAI,OAAO,UAAU,UAAU;AAC9B,eAAO;AAAA,MACR;AACA,UAAI,OAAO,UAAU,UAAU;AAC9B,eAAO,OAAO,KAAK;AAAA,MACpB;AACA,UAAI,OAAO,UAAU,UAAU;AAC9B,cAAM,SAAS,SAAS,OAAO,EAAE;AACjC,YAAI,OAAO,MAAM,MAAM,GAAG;AACzB,gBAAM,IAAI,MAAM,2BAA2B,SAAS,iCAAiC,KAAK,GAAG;AAAA,QAC9F;AACA,eAAO;AAAA,MACR;AACA,YAAM,IAAI,MAAM,2BAA2B,SAAS,0BAA0B,OAAO,KAAK,EAAE;AAAA,IAC7F;AAKA,UAAM,YAAY;AAAA,MACjB,SAAS,aAAa,SAAS,cAAc,SAAS,aAAa,SAAS;AAAA,MAC5E;AAAA,IACD;AACA,UAAM,UAAU;AAAA,MACf,SAAS,WAAW,SAAS,YAAY,SAAS,WAAW,SAAS;AAAA,MACtE;AAAA,IACD;AACA,UAAM,aAAa,eAAe,SAAS,cAAc,SAAS,aAAa,YAAY;AAC3F,UAAM,cAAc,eAAe,SAAS,eAAe,SAAS,cAAc,aAAa;AAE/F,QAAI,aAA4B;AAChC,UAAM,kBAAkB,SAAS,cAAc,SAAS;AACxD,QAAI,oBAAoB,QAAQ,oBAAoB,QAAW;AAC9D,mBAAa,eAAe,iBAAiB,YAAY;AAAA,IAC1D;AAEA,WAAO;AAAA,MACN,SAAS,EAAE;AAAA,MACX;AAAA,MACA,UAAU;AAAA,QACT;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,MACD;AAAA,IACD;AAAA,EACD,CAAC,IACA;AAEH,QAAM,SAAkC,MAAM,QAAQ,OAAO,MAAM,IAChE,OAAO,OAAO,IAAI,CAAC,UAAU;AAC7B,QAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,YAAM,IAAI,MAAM,yBAAyB;AAAA,IAC1C;AACA,UAAM,MAAM;AACZ,QAAI;AACJ,QAAI,IAAI,gBAAgB,YAAY;AACnC,kBAAY,IAAI;AAAA,IACjB,WAAW,MAAM,QAAQ,IAAI,IAAI,GAAG;AACnC,kBAAY,IAAI,WAAW,IAAI,IAAgB;AAAA,IAChD,OAAO;AACN,YAAM,IAAI,MAAM,iDAAiD;AAAA,IAClE;AACA,QAAI,OAAO,IAAI,WAAW,UAAU;AACnC,YAAM,IAAI,MAAM,+BAA+B;AAAA,IAChD;AAGA,UAAM,aAAa,IAAI,cAAc,IAAI;AACzC,UAAM,aAAa,IAAI,cAAc,IAAI;AACzC,UAAM,mBAAmB,IAAI,oBAAoB,IAAI;AACrD,UAAM,SAAS,IAAI,UAAU,IAAI;AACjC,UAAM,YAAY,IAAI,aAAa,IAAI;AAEvC,QAAI,OAAO,eAAe,UAAU;AACnC,YAAM,IAAI,MAAM,4CAA4C;AAAA,IAC7D;AACA,QAAI,CAAC,eAAe,UAAU,GAAG;AAChC,YAAM,IAAI,MAAM,oDAAoD;AAAA,IACrE;AACA,QAAI,CAAC,eAAe,IAAI,KAAK,GAAG;AAC/B,YAAM,IAAI,MAAM,+CAA+C;AAAA,IAChE;AACA,QAAI,CAAC,eAAe,IAAI,MAAM,GAAG;AAChC,YAAM,IAAI,MAAM,gDAAgD;AAAA,IACjE;AACA,QAAI,CAAC,eAAe,gBAAgB,GAAG;AACtC,YAAM,IAAI,MAAM,0DAA0D;AAAA,IAC3E;AAEA,QAAI,CAAC,UAAU,MAAM,GAAG;AACvB,YAAM,IAAI,MAAM,yCAAyC;AAAA,IAC1D;AAEA,QAAI,CAAC,eAAe,IAAI,UAAU,GAAG;AACpC,YAAM,IAAI,MAAM,oDAAoD;AAAA,IACrE;AACA,QAAI,CAAC,eAAe,IAAI,WAAW,GAAG;AACrC,YAAM,IAAI,MAAM,qDAAqD;AAAA,IACtE;AAEA,WAAO;AAAA,MACN,MAAM;AAAA,MACN,QAAQ,IAAI;AAAA,MACZ;AAAA,MACA,YAAY,cAAc;AAAA,MAC1B,OAAQ,IAAI,SAAoB;AAAA,MAChC,QAAS,IAAI,UAAqB;AAAA,MAClC,YAAa,IAAI,cAAyB;AAAA,MAC1C,kBAAkB,oBAAoB;AAAA,MACtC,QAAQ,UAAU;AAAA,MAClB,aAAc,IAAI,eAA0B;AAAA,MAC5C,WAAW,YAAY,qBAAqB,SAAS,IAAI;AAAA,IAC1D;AAAA,EACD,CAAC,IACA;AAEH,MAAI,oBAAqC;AACzC,QAAM,uBAAuB,MAAM,QAAQ,OAAO,iBAAiB,IAChE,OAAO,oBACP,OAAO;AACV,MAAI,MAAM,QAAQ,oBAAoB,GAAG;AACxC,QAAI,CAAC,qBAAqB,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,GAAG;AACpE,YAAM,IAAI,MAAM,6DAA6D;AAAA,IAC9E;AACA,wBAAoB;AAAA,EACrB;AAEA,QAAM,oBAAqB,OAAO,qBAAqB,OAAO,sBAAsB;AAGpF,QAAM,eACL,QAAQ,OAAO,gBAAgB,OAAO,mBAAmB,WACpD,OAAO,gBAAgB,OAAO,gBAChC;AACJ,QAAM,qBAAsB,OAAO,sBAAsB,OAAO,uBAAuB;AAGvF,QAAM,WAAY,OAAO,YAAY;AACrC,QAAM,cAAe,OAAO,eAAe,OAAO,gBAAgB;AAClE,QAAM,WAAY,OAAO,YAAY;AACrC,QAAM,QAAS,OAAO,SAAS;AAC/B,QAAM,cAAe,OAAO,eAAe;AAE3C,SAAO;AAAA,IACN,SAAS,OAAO;AAAA,IAChB;AAAA,IACA,UAAW,OAAO,YAAY,CAAC;AAAA,IAC/B;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACD;AACD;AAuBO,SAAS,cAAc,OAAgB,SAAwB;AACrE,MAAI,iBAAiB,OAAO;AAC3B,WAAO,IAAI,MAAM,SAAS,OAAO,KAAK,MAAM,OAAO,IAAI;AAAA,MACtD,OAAO;AAAA,IACR,CAAC;AAAA,EACF;AAEA,QAAM,UAAU,OAAO,KAAK;AAC5B,SAAO,IAAI,MAAM,SAAS,OAAO,KAAK,OAAO,EAAE;AAChD;AAaO,SAAS,wBAAwB,OAA2C;AAClF,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,WAAO;AAAA,EACR;AAEA,QAAM,MAAM;AACZ,SACC,OAAO,IAAI,YAAY,aACtB,OAAO,IAAI,aAAa,YAAY,OAAO,IAAI,cAAc,aAC9D,IAAI,aAAa,QACjB,OAAO,IAAI,aAAa,YACxB,MAAM,QAAQ,IAAI,MAAM;AAE1B;","names":["normalized"]}
1
+ {"version":3,"sources":["../../typescript/adapters/wasm-adapter.ts"],"sourcesContent":["/**\n * WASM Type Adapter\n *\n * This module provides type adapters for converting between JavaScript/TypeScript\n * types and WASM-compatible types, handling File/Blob conversions, config normalization,\n * and result parsing.\n *\n * @example File Conversion\n * ```typescript\n * import { fileToUint8Array } from '@kreuzberg/wasm/adapters/wasm-adapter';\n *\n * const file = event.target.files[0];\n * const bytes = await fileToUint8Array(file);\n * const result = await extractBytes(bytes, file.type);\n * ```\n *\n * @example Config Normalization\n * ```typescript\n * import { configToJS } from '@kreuzberg/wasm/adapters/wasm-adapter';\n *\n * const config = {\n * ocr: { backend: 'tesseract', language: 'eng' },\n * chunking: { maxChars: 1000 }\n * };\n * const normalized = configToJS(config);\n * ```\n */\n\nimport type {\n\tChunk,\n\tDocumentStructure,\n\tElement,\n\tExtractedImage,\n\tExtractedKeyword,\n\tExtractionConfig,\n\tExtractionResult,\n\tMetadata,\n\tOcrElement,\n\tPageContent,\n\tPdfAnnotation,\n\tProcessingWarning,\n\tTable,\n} from \"../types.js\";\n\n/**\n * Maximum file size for processing (512 MB)\n *\n * @internal\n */\nconst MAX_FILE_SIZE = 512 * 1024 * 1024;\n\n/**\n * Type predicate to validate numeric value or null\n *\n * @internal\n */\nfunction isNumberOrNull(value: unknown): value is number | null {\n\treturn typeof value === \"number\" || value === null || value === undefined;\n}\n\n/**\n * Type predicate to validate string value or null\n *\n * @internal\n */\nfunction isStringOrNull(value: unknown): value is string | null {\n\treturn typeof value === \"string\" || value === null || value === undefined;\n}\n\n/**\n * Type predicate to validate boolean value\n *\n * @internal\n */\nfunction isBoolean(value: unknown): value is boolean {\n\treturn typeof value === \"boolean\" || value === undefined;\n}\n\n/**\n * Convert a File or Blob to Uint8Array\n *\n * Handles both browser File API and server-side Blob-like objects,\n * providing a unified interface for reading binary data.\n *\n * @param file - The File or Blob to convert\n * @returns Promise resolving to the byte array\n * @throws {Error} If the file cannot be read or exceeds size limit\n *\n * @example\n * ```typescript\n * const file = document.getElementById('input').files[0];\n * const bytes = await fileToUint8Array(file);\n * const result = await extractBytes(bytes, 'application/pdf');\n * ```\n */\nexport async function fileToUint8Array(file: File | Blob): Promise<Uint8Array> {\n\ttry {\n\t\tif (file.size > MAX_FILE_SIZE) {\n\t\t\tthrow new Error(\n\t\t\t\t`File size (${file.size} bytes) exceeds maximum (${MAX_FILE_SIZE} bytes). Maximum file size is 512 MB.`,\n\t\t\t);\n\t\t}\n\n\t\tconst arrayBuffer = await file.arrayBuffer();\n\t\treturn new Uint8Array(arrayBuffer);\n\t} catch (error) {\n\t\tthrow new Error(`Failed to read file: ${error instanceof Error ? error.message : String(error)}`);\n\t}\n}\n\n/**\n * Normalize ExtractionConfig for WASM processing\n *\n * Converts TypeScript configuration objects to a WASM-compatible format,\n * handling null values, undefined properties, and nested structures.\n *\n * @param config - The extraction configuration or null\n * @returns Normalized configuration object suitable for WASM\n *\n * @example\n * ```typescript\n * const config: ExtractionConfig = {\n * ocr: { backend: 'tesseract' },\n * chunking: { maxChars: 1000 }\n * };\n * const wasmConfig = configToJS(config);\n * ```\n */\nexport function configToJS(config: ExtractionConfig | null): Record<string, unknown> {\n\tif (!config) {\n\t\treturn {};\n\t}\n\n\t// Convert camelCase key to snake_case to match Rust serde field names.\n\tconst toSnakeCase = (str: string): string => str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);\n\n\tconst normalizeValue = (value: unknown): unknown => {\n\t\tif (value === null || value === undefined) {\n\t\t\treturn null;\n\t\t}\n\t\tif (typeof value === \"object\") {\n\t\t\tif (Array.isArray(value)) {\n\t\t\t\treturn value.map(normalizeValue);\n\t\t\t}\n\t\t\tconst obj = value as Record<string, unknown>;\n\t\t\tconst normalized: Record<string, unknown> = {};\n\t\t\tfor (const [key, val] of Object.entries(obj)) {\n\t\t\t\tconst normalizedVal = normalizeValue(val);\n\t\t\t\tif (normalizedVal !== null && normalizedVal !== undefined) {\n\t\t\t\t\tnormalized[toSnakeCase(key)] = normalizedVal;\n\t\t\t\t}\n\t\t\t}\n\t\t\treturn Object.keys(normalized).length > 0 ? normalized : null;\n\t\t}\n\t\treturn value;\n\t};\n\n\tconst normalized: Record<string, unknown> = {};\n\tfor (const [key, value] of Object.entries(config)) {\n\t\tconst normalizedValue = normalizeValue(value);\n\t\tif (normalizedValue !== null && normalizedValue !== undefined) {\n\t\t\tnormalized[toSnakeCase(key)] = normalizedValue;\n\t\t}\n\t}\n\n\treturn normalized;\n}\n\n/**\n * Parse WASM extraction result and convert to TypeScript type\n *\n * Handles conversion of WASM-returned objects to proper ExtractionResult types,\n * including proper array conversions and type assertions for tables, chunks, and images.\n *\n * @param jsValue - The raw WASM result value\n * @returns Properly typed ExtractionResult\n * @throws {Error} If the result structure is invalid\n *\n * @example\n * ```typescript\n * const wasmResult = await wasmExtract(bytes, mimeType, config);\n * const result = jsToExtractionResult(wasmResult);\n * console.log(result.content);\n * ```\n */\nexport function jsToExtractionResult(jsValue: unknown): ExtractionResult {\n\tif (!jsValue || typeof jsValue !== \"object\") {\n\t\tthrow new Error(\"Invalid extraction result: value is not an object\");\n\t}\n\n\tconst result = jsValue as Record<string, unknown>;\n\tconst mimeType =\n\t\ttypeof result.mimeType === \"string\"\n\t\t\t? result.mimeType\n\t\t\t: typeof result.mime_type === \"string\"\n\t\t\t\t? result.mime_type\n\t\t\t\t: null;\n\n\tif (typeof result.content !== \"string\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid content\");\n\t}\n\tif (typeof mimeType !== \"string\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid mimeType\");\n\t}\n\tif (!result.metadata || typeof result.metadata !== \"object\") {\n\t\tthrow new Error(\"Invalid extraction result: missing or invalid metadata\");\n\t}\n\n\tconst tables: Table[] = [];\n\tif (Array.isArray(result.tables)) {\n\t\tfor (const table of result.tables) {\n\t\t\tif (table && typeof table === \"object\") {\n\t\t\t\tconst t = table as Record<string, unknown>;\n\t\t\t\tconst pageNumber =\n\t\t\t\t\ttypeof t.pageNumber === \"number\" ? t.pageNumber : typeof t.page_number === \"number\" ? t.page_number : null;\n\t\t\t\tif (\n\t\t\t\t\tArray.isArray(t.cells) &&\n\t\t\t\t\tt.cells.every((row) => Array.isArray(row) && row.every((cell) => typeof cell === \"string\")) &&\n\t\t\t\t\ttypeof t.markdown === \"string\" &&\n\t\t\t\t\tpageNumber !== null\n\t\t\t\t) {\n\t\t\t\t\ttables.push({\n\t\t\t\t\t\tcells: t.cells as string[][],\n\t\t\t\t\t\tmarkdown: t.markdown,\n\t\t\t\t\t\tpageNumber,\n\t\t\t\t\t});\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\tconst chunks: Chunk[] | null = Array.isArray(result.chunks)\n\t\t? result.chunks.map((chunk) => {\n\t\t\t\tif (!chunk || typeof chunk !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk structure\");\n\t\t\t\t}\n\t\t\t\tconst c = chunk as Record<string, unknown>;\n\t\t\t\tif (typeof c.content !== \"string\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk: missing content\");\n\t\t\t\t}\n\t\t\t\tif (!c.metadata || typeof c.metadata !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid chunk: missing metadata\");\n\t\t\t\t}\n\t\t\t\tconst metadata = c.metadata as Record<string, unknown>;\n\n\t\t\t\tlet embedding: number[] | null = null;\n\t\t\t\tif (Array.isArray(c.embedding)) {\n\t\t\t\t\tif (!c.embedding.every((item) => typeof item === \"number\")) {\n\t\t\t\t\t\tthrow new Error(\"Invalid chunk: embedding must contain only numbers\");\n\t\t\t\t\t}\n\t\t\t\t\tembedding = c.embedding;\n\t\t\t\t}\n\n\t\t\t\t// Coerce numeric values - handle BigInt, strings, and numbers\n\t\t\t\tconst coerceToNumber = (value: unknown, fieldName: string): number => {\n\t\t\t\t\tif (typeof value === \"number\") {\n\t\t\t\t\t\treturn value;\n\t\t\t\t\t}\n\t\t\t\t\tif (typeof value === \"bigint\") {\n\t\t\t\t\t\treturn Number(value);\n\t\t\t\t\t}\n\t\t\t\t\tif (typeof value === \"string\") {\n\t\t\t\t\t\tconst parsed = parseInt(value, 10);\n\t\t\t\t\t\tif (Number.isNaN(parsed)) {\n\t\t\t\t\t\t\tthrow new Error(`Invalid chunk metadata: ${fieldName} must be a valid number, got \"${value}\"`);\n\t\t\t\t\t\t}\n\t\t\t\t\t\treturn parsed;\n\t\t\t\t\t}\n\t\t\t\t\tthrow new Error(`Invalid chunk metadata: ${fieldName} must be a number, got ${typeof value}`);\n\t\t\t\t};\n\n\t\t\t\t// The Rust code uses snake_case field names (byte_start, byte_end, etc)\n\t\t\t\t// but TypeScript expects camelCase (charStart, charEnd, etc)\n\t\t\t\t// For now, treat byte offsets as character offsets since the content is UTF-8\n\t\t\t\tconst charStart = coerceToNumber(\n\t\t\t\t\tmetadata.charStart ?? metadata.char_start ?? metadata.byteStart ?? metadata.byte_start,\n\t\t\t\t\t\"charStart\",\n\t\t\t\t);\n\t\t\t\tconst charEnd = coerceToNumber(\n\t\t\t\t\tmetadata.charEnd ?? metadata.char_end ?? metadata.byteEnd ?? metadata.byte_end,\n\t\t\t\t\t\"charEnd\",\n\t\t\t\t);\n\t\t\t\tconst chunkIndex = coerceToNumber(metadata.chunkIndex ?? metadata.chunk_index, \"chunkIndex\");\n\t\t\t\tconst totalChunks = coerceToNumber(metadata.totalChunks ?? metadata.total_chunks, \"totalChunks\");\n\n\t\t\t\tlet tokenCount: number | null = null;\n\t\t\t\tconst tokenCountValue = metadata.tokenCount ?? metadata.token_count;\n\t\t\t\tif (tokenCountValue !== null && tokenCountValue !== undefined) {\n\t\t\t\t\ttokenCount = coerceToNumber(tokenCountValue, \"tokenCount\");\n\t\t\t\t}\n\n\t\t\t\tlet firstPage: number | null = null;\n\t\t\t\tconst firstPageValue = metadata.firstPage ?? metadata.first_page;\n\t\t\t\tif (firstPageValue !== null && firstPageValue !== undefined) {\n\t\t\t\t\tfirstPage = coerceToNumber(firstPageValue, \"firstPage\");\n\t\t\t\t}\n\n\t\t\t\tlet lastPage: number | null = null;\n\t\t\t\tconst lastPageValue = metadata.lastPage ?? metadata.last_page;\n\t\t\t\tif (lastPageValue !== null && lastPageValue !== undefined) {\n\t\t\t\t\tlastPage = coerceToNumber(lastPageValue, \"lastPage\");\n\t\t\t\t}\n\n\t\t\t\t// biome-ignore lint/complexity/useLiteralKeys: dynamic property access from raw object\n\t\t\t\tconst rawHc = (metadata[\"heading_context\"] ?? metadata[\"headingContext\"]) as\n\t\t\t\t\t| Record<string, unknown>\n\t\t\t\t\t| null\n\t\t\t\t\t| undefined;\n\t\t\t\tlet headingContext: import(\"../types.js\").HeadingContext | null = null;\n\t\t\t\tif (rawHc && typeof rawHc === \"object\") {\n\t\t\t\t\t// biome-ignore lint/complexity/useLiteralKeys: dynamic property access from raw object\n\t\t\t\t\tconst rawHeadings = rawHc[\"headings\"];\n\t\t\t\t\tif (Array.isArray(rawHeadings)) {\n\t\t\t\t\t\theadingContext = {\n\t\t\t\t\t\t\theadings: rawHeadings.map((h: unknown) => {\n\t\t\t\t\t\t\t\tconst heading = h as Record<string, unknown>;\n\t\t\t\t\t\t\t\treturn {\n\t\t\t\t\t\t\t\t\t// biome-ignore lint/complexity/useLiteralKeys: dynamic property access from raw object\n\t\t\t\t\t\t\t\t\tlevel: (heading[\"level\"] as number) ?? 0,\n\t\t\t\t\t\t\t\t\t// biome-ignore lint/complexity/useLiteralKeys: dynamic property access from raw object\n\t\t\t\t\t\t\t\t\ttext: (heading[\"text\"] as string) ?? \"\",\n\t\t\t\t\t\t\t\t};\n\t\t\t\t\t\t\t}),\n\t\t\t\t\t\t};\n\t\t\t\t\t}\n\t\t\t\t}\n\n\t\t\t\treturn {\n\t\t\t\t\tcontent: c.content,\n\t\t\t\t\tembedding,\n\t\t\t\t\tmetadata: {\n\t\t\t\t\t\tbyteStart: charStart,\n\t\t\t\t\t\tbyteEnd: charEnd,\n\t\t\t\t\t\tcharStart,\n\t\t\t\t\t\tcharEnd,\n\t\t\t\t\t\ttokenCount,\n\t\t\t\t\t\tchunkIndex,\n\t\t\t\t\t\ttotalChunks,\n\t\t\t\t\t\tfirstPage,\n\t\t\t\t\t\tlastPage,\n\t\t\t\t\t\theadingContext,\n\t\t\t\t\t},\n\t\t\t\t};\n\t\t\t})\n\t\t: null;\n\n\tconst images: ExtractedImage[] | null = Array.isArray(result.images)\n\t\t? result.images.map((image) => {\n\t\t\t\tif (!image || typeof image !== \"object\") {\n\t\t\t\t\tthrow new Error(\"Invalid image structure\");\n\t\t\t\t}\n\t\t\t\tconst img = image as Record<string, unknown>;\n\t\t\t\tlet imageData: Uint8Array;\n\t\t\t\tif (img.data instanceof Uint8Array) {\n\t\t\t\t\timageData = img.data;\n\t\t\t\t} else if (Array.isArray(img.data)) {\n\t\t\t\t\timageData = new Uint8Array(img.data as number[]);\n\t\t\t\t} else {\n\t\t\t\t\tthrow new Error(\"Invalid image: data must be Uint8Array or array\");\n\t\t\t\t}\n\t\t\t\tif (typeof img.format !== \"string\") {\n\t\t\t\t\tthrow new Error(\"Invalid image: missing format\");\n\t\t\t\t}\n\n\t\t\t\t// Support both camelCase and snake_case field names (Rust serde uses snake_case)\n\t\t\t\tconst imageIndex = img.imageIndex ?? img.image_index;\n\t\t\t\tconst pageNumber = img.pageNumber ?? img.page_number;\n\t\t\t\tconst bitsPerComponent = img.bitsPerComponent ?? img.bits_per_component;\n\t\t\t\tconst isMask = img.isMask ?? img.is_mask;\n\t\t\t\tconst ocrResult = img.ocrResult ?? img.ocr_result;\n\n\t\t\t\tif (typeof imageIndex !== \"number\") {\n\t\t\t\t\tthrow new Error(\"Invalid image: imageIndex must be a number\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(pageNumber)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: pageNumber must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(img.width)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: width must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(img.height)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: height must be a number or null\");\n\t\t\t\t}\n\t\t\t\tif (!isNumberOrNull(bitsPerComponent)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: bitsPerComponent must be a number or null\");\n\t\t\t\t}\n\n\t\t\t\tif (!isBoolean(isMask)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: isMask must be a boolean\");\n\t\t\t\t}\n\n\t\t\t\tif (!isStringOrNull(img.colorspace)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: colorspace must be a string or null\");\n\t\t\t\t}\n\t\t\t\tif (!isStringOrNull(img.description)) {\n\t\t\t\t\tthrow new Error(\"Invalid image: description must be a string or null\");\n\t\t\t\t}\n\n\t\t\t\treturn {\n\t\t\t\t\tdata: imageData,\n\t\t\t\t\tformat: img.format,\n\t\t\t\t\timageIndex: imageIndex,\n\t\t\t\t\tpageNumber: pageNumber ?? null,\n\t\t\t\t\twidth: (img.width as number) ?? null,\n\t\t\t\t\theight: (img.height as number) ?? null,\n\t\t\t\t\tcolorspace: (img.colorspace as string) ?? null,\n\t\t\t\t\tbitsPerComponent: bitsPerComponent ?? null,\n\t\t\t\t\tisMask: isMask ?? false,\n\t\t\t\t\tdescription: (img.description as string) ?? null,\n\t\t\t\t\tocrResult: ocrResult ? jsToExtractionResult(ocrResult) : null,\n\t\t\t\t};\n\t\t\t})\n\t\t: null;\n\n\tlet detectedLanguages: string[] | null = null;\n\tconst detectedLanguagesRaw = Array.isArray(result.detectedLanguages)\n\t\t? result.detectedLanguages\n\t\t: result.detected_languages;\n\tif (Array.isArray(detectedLanguagesRaw)) {\n\t\tif (!detectedLanguagesRaw.every((lang) => typeof lang === \"string\")) {\n\t\t\tthrow new Error(\"Invalid result: detectedLanguages must contain only strings\");\n\t\t}\n\t\tdetectedLanguages = detectedLanguagesRaw;\n\t}\n\n\tconst extractedKeywords = (result.extractedKeywords ?? result.extracted_keywords ?? null) as\n\t\t| ExtractedKeyword[]\n\t\t| null;\n\tconst qualityScore =\n\t\ttypeof (result.qualityScore ?? result.quality_score) === \"number\"\n\t\t\t? ((result.qualityScore ?? result.quality_score) as number)\n\t\t\t: null;\n\tconst processingWarnings = (result.processingWarnings ?? result.processing_warnings ?? null) as\n\t\t| ProcessingWarning[]\n\t\t| null;\n\tconst elements = (result.elements ?? null) as Element[] | null;\n\tconst ocrElements = (result.ocrElements ?? result.ocr_elements ?? null) as OcrElement[] | null;\n\tconst document = (result.document ?? null) as DocumentStructure | null;\n\tconst pages = (result.pages ?? null) as PageContent[] | null;\n\tconst annotations = (result.annotations ?? null) as PdfAnnotation[] | null;\n\n\treturn {\n\t\tcontent: result.content,\n\t\tmimeType,\n\t\tmetadata: (result.metadata ?? {}) as Metadata,\n\t\ttables,\n\t\tdetectedLanguages,\n\t\tchunks,\n\t\timages,\n\t\tpages,\n\t\textractedKeywords,\n\t\tqualityScore,\n\t\tprocessingWarnings,\n\t\telements,\n\t\tocrElements,\n\t\tdocument,\n\t\tannotations,\n\t};\n}\n\n/**\n * Wrap and format WASM errors with context\n *\n * Converts WASM error messages to JavaScript Error objects with proper context\n * and stack trace information when available.\n *\n * @param error - The error from WASM\n * @param context - Additional context about what operation failed\n * @returns A formatted Error object\n *\n * @internal\n *\n * @example\n * ```typescript\n * try {\n * await wasmExtract(bytes, mimeType);\n * } catch (error) {\n * throw wrapWasmError(error, 'extracting document');\n * }\n * ```\n */\nexport function wrapWasmError(error: unknown, context: string): Error {\n\tif (error instanceof Error) {\n\t\treturn new Error(`Error ${context}: ${error.message}`, {\n\t\t\tcause: error,\n\t\t});\n\t}\n\n\tconst message = String(error);\n\treturn new Error(`Error ${context}: ${message}`);\n}\n\n/**\n * Validate that a WASM-returned value conforms to ExtractionResult structure\n *\n * Performs structural validation without full type checking,\n * useful for runtime validation of WASM output.\n *\n * @param value - The value to validate\n * @returns True if value appears to be a valid ExtractionResult\n *\n * @internal\n */\nexport function isValidExtractionResult(value: unknown): value is ExtractionResult {\n\tif (!value || typeof value !== \"object\") {\n\t\treturn false;\n\t}\n\n\tconst obj = value as Record<string, unknown>;\n\treturn (\n\t\ttypeof obj.content === \"string\" &&\n\t\t(typeof obj.mimeType === \"string\" || typeof obj.mime_type === \"string\") &&\n\t\tobj.metadata !== null &&\n\t\ttypeof obj.metadata === \"object\" &&\n\t\tArray.isArray(obj.tables)\n\t);\n}\n"],"mappings":";AAiDA,IAAM,gBAAgB,MAAM,OAAO;AAOnC,SAAS,eAAe,OAAwC;AAC/D,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,UAAU;AACjE;AAOA,SAAS,eAAe,OAAwC;AAC/D,SAAO,OAAO,UAAU,YAAY,UAAU,QAAQ,UAAU;AACjE;AAOA,SAAS,UAAU,OAAkC;AACpD,SAAO,OAAO,UAAU,aAAa,UAAU;AAChD;AAmBA,eAAsB,iBAAiB,MAAwC;AAC9E,MAAI;AACH,QAAI,KAAK,OAAO,eAAe;AAC9B,YAAM,IAAI;AAAA,QACT,cAAc,KAAK,IAAI,4BAA4B,aAAa;AAAA,MACjE;AAAA,IACD;AAEA,UAAM,cAAc,MAAM,KAAK,YAAY;AAC3C,WAAO,IAAI,WAAW,WAAW;AAAA,EAClC,SAAS,OAAO;AACf,UAAM,IAAI,MAAM,wBAAwB,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK,CAAC,EAAE;AAAA,EACjG;AACD;AAoBO,SAAS,WAAW,QAA0D;AACpF,MAAI,CAAC,QAAQ;AACZ,WAAO,CAAC;AAAA,EACT;AAGA,QAAM,cAAc,CAAC,QAAwB,IAAI,QAAQ,UAAU,CAAC,WAAW,IAAI,OAAO,YAAY,CAAC,EAAE;AAEzG,QAAM,iBAAiB,CAAC,UAA4B;AACnD,QAAI,UAAU,QAAQ,UAAU,QAAW;AAC1C,aAAO;AAAA,IACR;AACA,QAAI,OAAO,UAAU,UAAU;AAC9B,UAAI,MAAM,QAAQ,KAAK,GAAG;AACzB,eAAO,MAAM,IAAI,cAAc;AAAA,MAChC;AACA,YAAM,MAAM;AACZ,YAAMA,cAAsC,CAAC;AAC7C,iBAAW,CAAC,KAAK,GAAG,KAAK,OAAO,QAAQ,GAAG,GAAG;AAC7C,cAAM,gBAAgB,eAAe,GAAG;AACxC,YAAI,kBAAkB,QAAQ,kBAAkB,QAAW;AAC1D,UAAAA,YAAW,YAAY,GAAG,CAAC,IAAI;AAAA,QAChC;AAAA,MACD;AACA,aAAO,OAAO,KAAKA,WAAU,EAAE,SAAS,IAAIA,cAAa;AAAA,IAC1D;AACA,WAAO;AAAA,EACR;AAEA,QAAM,aAAsC,CAAC;AAC7C,aAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,MAAM,GAAG;AAClD,UAAM,kBAAkB,eAAe,KAAK;AAC5C,QAAI,oBAAoB,QAAQ,oBAAoB,QAAW;AAC9D,iBAAW,YAAY,GAAG,CAAC,IAAI;AAAA,IAChC;AAAA,EACD;AAEA,SAAO;AACR;AAmBO,SAAS,qBAAqB,SAAoC;AACxE,MAAI,CAAC,WAAW,OAAO,YAAY,UAAU;AAC5C,UAAM,IAAI,MAAM,mDAAmD;AAAA,EACpE;AAEA,QAAM,SAAS;AACf,QAAM,WACL,OAAO,OAAO,aAAa,WACxB,OAAO,WACP,OAAO,OAAO,cAAc,WAC3B,OAAO,YACP;AAEL,MAAI,OAAO,OAAO,YAAY,UAAU;AACvC,UAAM,IAAI,MAAM,uDAAuD;AAAA,EACxE;AACA,MAAI,OAAO,aAAa,UAAU;AACjC,UAAM,IAAI,MAAM,wDAAwD;AAAA,EACzE;AACA,MAAI,CAAC,OAAO,YAAY,OAAO,OAAO,aAAa,UAAU;AAC5D,UAAM,IAAI,MAAM,wDAAwD;AAAA,EACzE;AAEA,QAAM,SAAkB,CAAC;AACzB,MAAI,MAAM,QAAQ,OAAO,MAAM,GAAG;AACjC,eAAW,SAAS,OAAO,QAAQ;AAClC,UAAI,SAAS,OAAO,UAAU,UAAU;AACvC,cAAM,IAAI;AACV,cAAM,aACL,OAAO,EAAE,eAAe,WAAW,EAAE,aAAa,OAAO,EAAE,gBAAgB,WAAW,EAAE,cAAc;AACvG,YACC,MAAM,QAAQ,EAAE,KAAK,KACrB,EAAE,MAAM,MAAM,CAAC,QAAQ,MAAM,QAAQ,GAAG,KAAK,IAAI,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,CAAC,KAC1F,OAAO,EAAE,aAAa,YACtB,eAAe,MACd;AACD,iBAAO,KAAK;AAAA,YACX,OAAO,EAAE;AAAA,YACT,UAAU,EAAE;AAAA,YACZ;AAAA,UACD,CAAC;AAAA,QACF;AAAA,MACD;AAAA,IACD;AAAA,EACD;AAEA,QAAM,SAAyB,MAAM,QAAQ,OAAO,MAAM,IACvD,OAAO,OAAO,IAAI,CAAC,UAAU;AAC7B,QAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,YAAM,IAAI,MAAM,yBAAyB;AAAA,IAC1C;AACA,UAAM,IAAI;AACV,QAAI,OAAO,EAAE,YAAY,UAAU;AAClC,YAAM,IAAI,MAAM,gCAAgC;AAAA,IACjD;AACA,QAAI,CAAC,EAAE,YAAY,OAAO,EAAE,aAAa,UAAU;AAClD,YAAM,IAAI,MAAM,iCAAiC;AAAA,IAClD;AACA,UAAM,WAAW,EAAE;AAEnB,QAAI,YAA6B;AACjC,QAAI,MAAM,QAAQ,EAAE,SAAS,GAAG;AAC/B,UAAI,CAAC,EAAE,UAAU,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,GAAG;AAC3D,cAAM,IAAI,MAAM,oDAAoD;AAAA,MACrE;AACA,kBAAY,EAAE;AAAA,IACf;AAGA,UAAM,iBAAiB,CAAC,OAAgB,cAA8B;AACrE,UAAI,OAAO,UAAU,UAAU;AAC9B,eAAO;AAAA,MACR;AACA,UAAI,OAAO,UAAU,UAAU;AAC9B,eAAO,OAAO,KAAK;AAAA,MACpB;AACA,UAAI,OAAO,UAAU,UAAU;AAC9B,cAAM,SAAS,SAAS,OAAO,EAAE;AACjC,YAAI,OAAO,MAAM,MAAM,GAAG;AACzB,gBAAM,IAAI,MAAM,2BAA2B,SAAS,iCAAiC,KAAK,GAAG;AAAA,QAC9F;AACA,eAAO;AAAA,MACR;AACA,YAAM,IAAI,MAAM,2BAA2B,SAAS,0BAA0B,OAAO,KAAK,EAAE;AAAA,IAC7F;AAKA,UAAM,YAAY;AAAA,MACjB,SAAS,aAAa,SAAS,cAAc,SAAS,aAAa,SAAS;AAAA,MAC5E;AAAA,IACD;AACA,UAAM,UAAU;AAAA,MACf,SAAS,WAAW,SAAS,YAAY,SAAS,WAAW,SAAS;AAAA,MACtE;AAAA,IACD;AACA,UAAM,aAAa,eAAe,SAAS,cAAc,SAAS,aAAa,YAAY;AAC3F,UAAM,cAAc,eAAe,SAAS,eAAe,SAAS,cAAc,aAAa;AAE/F,QAAI,aAA4B;AAChC,UAAM,kBAAkB,SAAS,cAAc,SAAS;AACxD,QAAI,oBAAoB,QAAQ,oBAAoB,QAAW;AAC9D,mBAAa,eAAe,iBAAiB,YAAY;AAAA,IAC1D;AAEA,QAAI,YAA2B;AAC/B,UAAM,iBAAiB,SAAS,aAAa,SAAS;AACtD,QAAI,mBAAmB,QAAQ,mBAAmB,QAAW;AAC5D,kBAAY,eAAe,gBAAgB,WAAW;AAAA,IACvD;AAEA,QAAI,WAA0B;AAC9B,UAAM,gBAAgB,SAAS,YAAY,SAAS;AACpD,QAAI,kBAAkB,QAAQ,kBAAkB,QAAW;AAC1D,iBAAW,eAAe,eAAe,UAAU;AAAA,IACpD;AAGA,UAAM,QAAS,SAAS,iBAAiB,KAAK,SAAS,gBAAgB;AAIvE,QAAI,iBAA8D;AAClE,QAAI,SAAS,OAAO,UAAU,UAAU;AAEvC,YAAM,cAAc,MAAM,UAAU;AACpC,UAAI,MAAM,QAAQ,WAAW,GAAG;AAC/B,yBAAiB;AAAA,UAChB,UAAU,YAAY,IAAI,CAAC,MAAe;AACzC,kBAAM,UAAU;AAChB,mBAAO;AAAA;AAAA,cAEN,OAAQ,QAAQ,OAAO,KAAgB;AAAA;AAAA,cAEvC,MAAO,QAAQ,MAAM,KAAgB;AAAA,YACtC;AAAA,UACD,CAAC;AAAA,QACF;AAAA,MACD;AAAA,IACD;AAEA,WAAO;AAAA,MACN,SAAS,EAAE;AAAA,MACX;AAAA,MACA,UAAU;AAAA,QACT,WAAW;AAAA,QACX,SAAS;AAAA,QACT;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,QACA;AAAA,MACD;AAAA,IACD;AAAA,EACD,CAAC,IACA;AAEH,QAAM,SAAkC,MAAM,QAAQ,OAAO,MAAM,IAChE,OAAO,OAAO,IAAI,CAAC,UAAU;AAC7B,QAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,YAAM,IAAI,MAAM,yBAAyB;AAAA,IAC1C;AACA,UAAM,MAAM;AACZ,QAAI;AACJ,QAAI,IAAI,gBAAgB,YAAY;AACnC,kBAAY,IAAI;AAAA,IACjB,WAAW,MAAM,QAAQ,IAAI,IAAI,GAAG;AACnC,kBAAY,IAAI,WAAW,IAAI,IAAgB;AAAA,IAChD,OAAO;AACN,YAAM,IAAI,MAAM,iDAAiD;AAAA,IAClE;AACA,QAAI,OAAO,IAAI,WAAW,UAAU;AACnC,YAAM,IAAI,MAAM,+BAA+B;AAAA,IAChD;AAGA,UAAM,aAAa,IAAI,cAAc,IAAI;AACzC,UAAM,aAAa,IAAI,cAAc,IAAI;AACzC,UAAM,mBAAmB,IAAI,oBAAoB,IAAI;AACrD,UAAM,SAAS,IAAI,UAAU,IAAI;AACjC,UAAM,YAAY,IAAI,aAAa,IAAI;AAEvC,QAAI,OAAO,eAAe,UAAU;AACnC,YAAM,IAAI,MAAM,4CAA4C;AAAA,IAC7D;AACA,QAAI,CAAC,eAAe,UAAU,GAAG;AAChC,YAAM,IAAI,MAAM,oDAAoD;AAAA,IACrE;AACA,QAAI,CAAC,eAAe,IAAI,KAAK,GAAG;AAC/B,YAAM,IAAI,MAAM,+CAA+C;AAAA,IAChE;AACA,QAAI,CAAC,eAAe,IAAI,MAAM,GAAG;AAChC,YAAM,IAAI,MAAM,gDAAgD;AAAA,IACjE;AACA,QAAI,CAAC,eAAe,gBAAgB,GAAG;AACtC,YAAM,IAAI,MAAM,0DAA0D;AAAA,IAC3E;AAEA,QAAI,CAAC,UAAU,MAAM,GAAG;AACvB,YAAM,IAAI,MAAM,yCAAyC;AAAA,IAC1D;AAEA,QAAI,CAAC,eAAe,IAAI,UAAU,GAAG;AACpC,YAAM,IAAI,MAAM,oDAAoD;AAAA,IACrE;AACA,QAAI,CAAC,eAAe,IAAI,WAAW,GAAG;AACrC,YAAM,IAAI,MAAM,qDAAqD;AAAA,IACtE;AAEA,WAAO;AAAA,MACN,MAAM;AAAA,MACN,QAAQ,IAAI;AAAA,MACZ;AAAA,MACA,YAAY,cAAc;AAAA,MAC1B,OAAQ,IAAI,SAAoB;AAAA,MAChC,QAAS,IAAI,UAAqB;AAAA,MAClC,YAAa,IAAI,cAAyB;AAAA,MAC1C,kBAAkB,oBAAoB;AAAA,MACtC,QAAQ,UAAU;AAAA,MAClB,aAAc,IAAI,eAA0B;AAAA,MAC5C,WAAW,YAAY,qBAAqB,SAAS,IAAI;AAAA,IAC1D;AAAA,EACD,CAAC,IACA;AAEH,MAAI,oBAAqC;AACzC,QAAM,uBAAuB,MAAM,QAAQ,OAAO,iBAAiB,IAChE,OAAO,oBACP,OAAO;AACV,MAAI,MAAM,QAAQ,oBAAoB,GAAG;AACxC,QAAI,CAAC,qBAAqB,MAAM,CAAC,SAAS,OAAO,SAAS,QAAQ,GAAG;AACpE,YAAM,IAAI,MAAM,6DAA6D;AAAA,IAC9E;AACA,wBAAoB;AAAA,EACrB;AAEA,QAAM,oBAAqB,OAAO,qBAAqB,OAAO,sBAAsB;AAGpF,QAAM,eACL,QAAQ,OAAO,gBAAgB,OAAO,mBAAmB,WACpD,OAAO,gBAAgB,OAAO,gBAChC;AACJ,QAAM,qBAAsB,OAAO,sBAAsB,OAAO,uBAAuB;AAGvF,QAAM,WAAY,OAAO,YAAY;AACrC,QAAM,cAAe,OAAO,eAAe,OAAO,gBAAgB;AAClE,QAAM,WAAY,OAAO,YAAY;AACrC,QAAM,QAAS,OAAO,SAAS;AAC/B,QAAM,cAAe,OAAO,eAAe;AAE3C,SAAO;AAAA,IACN,SAAS,OAAO;AAAA,IAChB;AAAA,IACA,UAAW,OAAO,YAAY,CAAC;AAAA,IAC/B;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACD;AACD;AAuBO,SAAS,cAAc,OAAgB,SAAwB;AACrE,MAAI,iBAAiB,OAAO;AAC3B,WAAO,IAAI,MAAM,SAAS,OAAO,KAAK,MAAM,OAAO,IAAI;AAAA,MACtD,OAAO;AAAA,IACR,CAAC;AAAA,EACF;AAEA,QAAM,UAAU,OAAO,KAAK;AAC5B,SAAO,IAAI,MAAM,SAAS,OAAO,KAAK,OAAO,EAAE;AAChD;AAaO,SAAS,wBAAwB,OAA2C;AAClF,MAAI,CAAC,SAAS,OAAO,UAAU,UAAU;AACxC,WAAO;AAAA,EACR;AAEA,QAAM,MAAM;AACZ,SACC,OAAO,IAAI,YAAY,aACtB,OAAO,IAAI,aAAa,YAAY,OAAO,IAAI,cAAc,aAC9D,IAAI,aAAa,QACjB,OAAO,IAAI,aAAa,YACxB,MAAM,QAAQ,IAAI,MAAM;AAE1B;","names":["normalized"]}
package/dist/index.js CHANGED
@@ -363,15 +363,48 @@ function jsToExtractionResult(jsValue) {
363
363
  if (tokenCountValue !== null && tokenCountValue !== void 0) {
364
364
  tokenCount = coerceToNumber(tokenCountValue, "tokenCount");
365
365
  }
366
+ let firstPage = null;
367
+ const firstPageValue = metadata.firstPage ?? metadata.first_page;
368
+ if (firstPageValue !== null && firstPageValue !== void 0) {
369
+ firstPage = coerceToNumber(firstPageValue, "firstPage");
370
+ }
371
+ let lastPage = null;
372
+ const lastPageValue = metadata.lastPage ?? metadata.last_page;
373
+ if (lastPageValue !== null && lastPageValue !== void 0) {
374
+ lastPage = coerceToNumber(lastPageValue, "lastPage");
375
+ }
376
+ const rawHc = metadata["heading_context"] ?? metadata["headingContext"];
377
+ let headingContext = null;
378
+ if (rawHc && typeof rawHc === "object") {
379
+ const rawHeadings = rawHc["headings"];
380
+ if (Array.isArray(rawHeadings)) {
381
+ headingContext = {
382
+ headings: rawHeadings.map((h) => {
383
+ const heading = h;
384
+ return {
385
+ // biome-ignore lint/complexity/useLiteralKeys: dynamic property access from raw object
386
+ level: heading["level"] ?? 0,
387
+ // biome-ignore lint/complexity/useLiteralKeys: dynamic property access from raw object
388
+ text: heading["text"] ?? ""
389
+ };
390
+ })
391
+ };
392
+ }
393
+ }
366
394
  return {
367
395
  content: c.content,
368
396
  embedding,
369
397
  metadata: {
398
+ byteStart: charStart,
399
+ byteEnd: charEnd,
370
400
  charStart,
371
401
  charEnd,
372
402
  tokenCount,
373
403
  chunkIndex,
374
- totalChunks
404
+ totalChunks,
405
+ firstPage,
406
+ lastPage,
407
+ headingContext
375
408
  }
376
409
  };
377
410
  }) : null;