@kreuzberg/wasm 4.0.0-rc.14 → 4.0.0-rc.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +7 -0
- package/README.md +28 -0
- package/dist/index.d.cts +4 -4
- package/dist/index.d.ts +3 -3
- package/package.json +1 -1
package/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2025 Na'aman Hirschfeld
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
CHANGED
|
@@ -946,6 +946,34 @@ Tesseract training data (`.traineddata` files) are loaded from jsDelivr CDN on f
|
|
|
946
946
|
|
|
947
947
|
Cloudflare Workers has a 10MB bundle size limit (compressed). The WASM binary is ~2MB compressed, leaving room for your application code.
|
|
948
948
|
|
|
949
|
+
### HTML File Size Limits
|
|
950
|
+
|
|
951
|
+
**WASM builds have a 2MB limit for HTML files** due to limited stack space. HTML files larger than 2MB will be rejected with a validation error to prevent stack overflow.
|
|
952
|
+
|
|
953
|
+
```typescript
|
|
954
|
+
// HTML input larger than 2MB is rejected with an error in WASM builds
|
|
955
|
+
const largeHtml = new Uint8Array(3 * 1024 * 1024); // 3MB
|
|
956
|
+
await extractBytes(largeHtml, 'text/html');
|
|
957
|
+
// ❌ Throws: "HTML file size exceeds WASM limit of 2MB"
|
|
958
|
+
```
|
|
959
|
+
|
|
960
|
+
For large HTML files, use the native [@kreuzberg/node](https://www.npmjs.com/package/@kreuzberg/node) binding, which has no size limits.
|
|
961
|
+
|
|
962
|
+
### PDF Extraction in Non-Browser Environments
|
|
963
|
+
|
|
964
|
+
PDF extraction requires PDFium, which is only available in browser environments. In Deno, Node.js, and Cloudflare Workers, PDF extraction will fail with an error.
|
|
965
|
+
|
|
966
|
+
```typescript
|
|
967
|
+
// ❌ Won't work in Deno/Node.js/Workers
|
|
968
|
+
await extractBytes(pdfBytes, 'application/pdf');
|
|
969
|
+
// Throws: "PDF extraction requires proper WASM module initialization"
|
|
970
|
+
```
|
|
971
|
+
|
|
972
|
+
**Solutions:**
|
|
973
|
+
- **Browser**: PDF extraction works out of the box
|
|
974
|
+
- **Deno/Node.js**: Use [@kreuzberg/node](https://www.npmjs.com/package/@kreuzberg/node) with native PDFium bindings
|
|
975
|
+
- **Cloudflare Workers**: PDF extraction is not currently supported
|
|
976
|
+
|
|
949
977
|
## Troubleshooting
|
|
950
978
|
|
|
951
979
|
### "WASM module failed to initialize"
|
package/dist/index.d.cts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.cjs';
|
|
2
|
-
export { C as Chunk,
|
|
1
|
+
import { E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.cjs.js';
|
|
2
|
+
export { C as Chunk, b as ChunkingConfig, c as ChunkMetadata, d as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, O as OcrBackendProtocol, e as OcrConfig, P as PageContent, f as PageExtractionConfig, g as PdfConfig, h as PostProcessorConfig, T as Table, i as TesseractConfig, j as TokenReductionConfig, E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.cjs.js';
|
|
3
3
|
export { configToJS, fileToUint8Array, isValidExtractionResult, jsToExtractionResult, wrapWasmError } from './adapters/wasm-adapter.cjs';
|
|
4
4
|
export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend } from './ocr/registry.cjs';
|
|
5
5
|
export { TesseractWasmBackend } from './ocr/tesseract-wasm-backend.cjs';
|
|
6
|
-
export { RuntimeType, WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.
|
|
6
|
+
export { type RuntimeType, type WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.d.cts';
|
|
7
7
|
|
|
8
8
|
/**
|
|
9
9
|
* Kreuzberg - WebAssembly Bindings for Browser and Runtime Environments
|
|
@@ -420,4 +420,4 @@ declare function batchExtractFiles(files: File[], config?: ExtractionConfig | nu
|
|
|
420
420
|
*/
|
|
421
421
|
declare function enableOcr(): Promise<void>;
|
|
422
422
|
|
|
423
|
-
export {
|
|
423
|
+
export { batchExtractBytes, batchExtractBytesSync, batchExtractFiles, enableOcr, extractBytes, extractBytesSync, extractFile, extractFromFile, getInitializationError, getVersion, initWasm, isInitialized };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.
|
|
2
|
-
export { C as Chunk, b as
|
|
1
|
+
import { E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.d.ts';
|
|
2
|
+
export { C as Chunk, b as ChunkingConfig, c as ChunkMetadata, d as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, O as OcrBackendProtocol, e as OcrConfig, P as PageContent, f as PageExtractionConfig, g as PdfConfig, h as PostProcessorConfig, T as Table, i as TesseractConfig, j as TokenReductionConfig, E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.d.ts';
|
|
3
3
|
export { configToJS, fileToUint8Array, isValidExtractionResult, jsToExtractionResult, wrapWasmError } from './adapters/wasm-adapter.js';
|
|
4
4
|
export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend } from './ocr/registry.js';
|
|
5
5
|
export { TesseractWasmBackend } from './ocr/tesseract-wasm-backend.js';
|
|
6
|
-
export { RuntimeType, WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.
|
|
6
|
+
export { type RuntimeType, type WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.d.ts';
|
|
7
7
|
|
|
8
8
|
/**
|
|
9
9
|
* Kreuzberg - WebAssembly Bindings for Browser and Runtime Environments
|