@kreuzberg/wasm 4.0.8 → 4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/extraction/batch.d.ts +80 -0
- package/dist/extraction/batch.d.ts.map +1 -0
- package/dist/extraction/bytes.d.ts +69 -0
- package/dist/extraction/bytes.d.ts.map +1 -0
- package/dist/extraction/files.d.ts +77 -0
- package/dist/extraction/files.d.ts.map +1 -0
- package/dist/extraction/index.d.ts +11 -0
- package/dist/extraction/index.d.ts.map +1 -0
- package/dist/extraction/internal.d.ts +21 -0
- package/dist/extraction/internal.d.ts.map +1 -0
- package/dist/index.d.ts +9 -323
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +677 -591
- package/dist/index.js.map +1 -1
- package/dist/initialization/pdfium-loader.d.ts +30 -0
- package/dist/initialization/pdfium-loader.d.ts.map +1 -0
- package/dist/initialization/state.d.ts +100 -0
- package/dist/initialization/state.d.ts.map +1 -0
- package/dist/initialization/wasm-loader.d.ts +81 -0
- package/dist/initialization/wasm-loader.d.ts.map +1 -0
- package/dist/ocr/enabler.d.ts +86 -0
- package/dist/ocr/enabler.d.ts.map +1 -0
- package/dist/pkg/README.md +1 -1
- package/dist/pkg/kreuzberg_wasm.d.ts +76 -0
- package/dist/pkg/kreuzberg_wasm.js +142 -82
- package/dist/pkg/kreuzberg_wasm_bg.js +7 -7
- package/dist/pkg/kreuzberg_wasm_bg.wasm +0 -0
- package/dist/pkg/kreuzberg_wasm_bg.wasm.d.ts +3 -3
- package/dist/pkg/package.json +5 -1
- package/dist/runtime.d.ts +22 -2
- package/dist/runtime.d.ts.map +1 -1
- package/dist/runtime.js +21 -1
- package/dist/runtime.js.map +1 -1
- package/dist/types.d.ts +75 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +6 -6
package/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.0.8" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.1" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Batch extraction functions
|
|
3
|
+
*
|
|
4
|
+
* Provides batch processing capabilities for extracting from multiple documents
|
|
5
|
+
* in a single operation for improved efficiency.
|
|
6
|
+
*/
|
|
7
|
+
import type { ExtractionConfig as ExtractionConfigType, ExtractionResult } from "../types.d.ts";
|
|
8
|
+
/**
|
|
9
|
+
* Batch extract content from multiple byte arrays asynchronously
|
|
10
|
+
*
|
|
11
|
+
* Extracts content from multiple documents in a single batch operation,
|
|
12
|
+
* allowing for more efficient processing of multiple files.
|
|
13
|
+
*
|
|
14
|
+
* @param files - Array of objects containing data (Uint8Array) and mimeType (string)
|
|
15
|
+
* @param config - Optional extraction configuration applied to all files
|
|
16
|
+
* @returns Promise resolving to array of extraction results
|
|
17
|
+
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
18
|
+
*
|
|
19
|
+
* @example
|
|
20
|
+
* ```typescript
|
|
21
|
+
* const files = [
|
|
22
|
+
* { data: pdfBytes, mimeType: 'application/pdf' },
|
|
23
|
+
* { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
|
|
24
|
+
* ];
|
|
25
|
+
* const results = await batchExtractBytes(files);
|
|
26
|
+
* results.forEach((result) => console.log(result.content));
|
|
27
|
+
* ```
|
|
28
|
+
*/
|
|
29
|
+
export declare function batchExtractBytes(files: Array<{
|
|
30
|
+
data: Uint8Array;
|
|
31
|
+
mimeType: string;
|
|
32
|
+
}>, config?: ExtractionConfigType | null): Promise<ExtractionResult[]>;
|
|
33
|
+
/**
|
|
34
|
+
* Batch extract content from multiple byte arrays synchronously
|
|
35
|
+
*
|
|
36
|
+
* Synchronous version of batchExtractBytes. Extracts content from multiple documents
|
|
37
|
+
* in a single batch operation without async operations.
|
|
38
|
+
*
|
|
39
|
+
* @param files - Array of objects containing data (Uint8Array) and mimeType (string)
|
|
40
|
+
* @param config - Optional extraction configuration applied to all files
|
|
41
|
+
* @returns Array of extraction results
|
|
42
|
+
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
43
|
+
*
|
|
44
|
+
* @example
|
|
45
|
+
* ```typescript
|
|
46
|
+
* const files = [
|
|
47
|
+
* { data: pdfBytes, mimeType: 'application/pdf' },
|
|
48
|
+
* { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
|
|
49
|
+
* ];
|
|
50
|
+
* const results = batchExtractBytesSync(files);
|
|
51
|
+
* results.forEach((result) => console.log(result.content));
|
|
52
|
+
* ```
|
|
53
|
+
*/
|
|
54
|
+
export declare function batchExtractBytesSync(files: Array<{
|
|
55
|
+
data: Uint8Array;
|
|
56
|
+
mimeType: string;
|
|
57
|
+
}>, config?: ExtractionConfigType | null): ExtractionResult[];
|
|
58
|
+
/**
|
|
59
|
+
* Batch extract content from multiple File objects asynchronously
|
|
60
|
+
*
|
|
61
|
+
* Convenience function that converts File objects to Uint8Array and calls batchExtractBytes.
|
|
62
|
+
* Automatically uses the file.type as MIME type if available.
|
|
63
|
+
*
|
|
64
|
+
* @param files - Array of File objects to extract from
|
|
65
|
+
* @param config - Optional extraction configuration applied to all files
|
|
66
|
+
* @returns Promise resolving to array of extraction results
|
|
67
|
+
* @throws {Error} If WASM module is not initialized, files cannot be read, or extraction fails
|
|
68
|
+
*
|
|
69
|
+
* @example
|
|
70
|
+
* ```typescript
|
|
71
|
+
* const fileInput = document.getElementById('files');
|
|
72
|
+
* const files = Array.from(fileInput.files ?? []);
|
|
73
|
+
* const results = await batchExtractFiles(files);
|
|
74
|
+
* results.forEach((result, index) => {
|
|
75
|
+
* console.log(`File ${index}: ${result.content.substring(0, 50)}...`);
|
|
76
|
+
* });
|
|
77
|
+
* ```
|
|
78
|
+
*/
|
|
79
|
+
export declare function batchExtractFiles(files: File[], config?: ExtractionConfigType | null): Promise<ExtractionResult[]>;
|
|
80
|
+
//# sourceMappingURL=batch.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"batch.d.ts","sourceRoot":"","sources":["../../typescript/extraction/batch.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CA6D7B;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,qBAAqB,CACpC,KAAK,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,EAAE,CA6DpB;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAsB,iBAAiB,CACtC,KAAK,EAAE,IAAI,EAAE,EACb,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAiC7B"}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Byte-based extraction functions
|
|
3
|
+
*
|
|
4
|
+
* Provides synchronous and asynchronous extraction functions for document bytes.
|
|
5
|
+
*/
|
|
6
|
+
import type { ExtractionConfig as ExtractionConfigType, ExtractionResult } from "../types.d.ts";
|
|
7
|
+
/**
|
|
8
|
+
* Extract content from bytes (document data)
|
|
9
|
+
*
|
|
10
|
+
* Extracts text, metadata, tables, images, and other content from document bytes.
|
|
11
|
+
* Automatically detects document type from MIME type and applies appropriate extraction logic.
|
|
12
|
+
*
|
|
13
|
+
* @param data - The document bytes to extract from
|
|
14
|
+
* @param mimeType - MIME type of the document (e.g., 'application/pdf', 'image/jpeg')
|
|
15
|
+
* @param config - Optional extraction configuration
|
|
16
|
+
* @returns Promise resolving to the extraction result
|
|
17
|
+
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
18
|
+
*
|
|
19
|
+
* @example Extract PDF
|
|
20
|
+
* ```typescript
|
|
21
|
+
* const bytes = new Uint8Array(buffer);
|
|
22
|
+
* const result = await extractBytes(bytes, 'application/pdf');
|
|
23
|
+
* console.log(result.content);
|
|
24
|
+
* console.log(result.tables);
|
|
25
|
+
* ```
|
|
26
|
+
*
|
|
27
|
+
* @example Extract with Configuration
|
|
28
|
+
* ```typescript
|
|
29
|
+
* const result = await extractBytes(bytes, 'application/pdf', {
|
|
30
|
+
* ocr: {
|
|
31
|
+
* backend: 'tesseract',
|
|
32
|
+
* language: 'deu' // German
|
|
33
|
+
* },
|
|
34
|
+
* images: {
|
|
35
|
+
* extractImages: true,
|
|
36
|
+
* targetDpi: 200
|
|
37
|
+
* }
|
|
38
|
+
* });
|
|
39
|
+
* ```
|
|
40
|
+
*
|
|
41
|
+
* @example Extract from File
|
|
42
|
+
* ```typescript
|
|
43
|
+
* const file = inputEvent.target.files[0];
|
|
44
|
+
* const bytes = await fileToUint8Array(file);
|
|
45
|
+
* const result = await extractBytes(bytes, file.type);
|
|
46
|
+
* ```
|
|
47
|
+
*/
|
|
48
|
+
export declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfigType | null): Promise<ExtractionResult>;
|
|
49
|
+
/**
|
|
50
|
+
* Extract content from bytes synchronously
|
|
51
|
+
*
|
|
52
|
+
* Synchronous version of extractBytes. Performs extraction without async operations.
|
|
53
|
+
* Note: Some extraction features may still be async internally, but the wrapper is synchronous.
|
|
54
|
+
*
|
|
55
|
+
* @param data - The document bytes to extract from
|
|
56
|
+
* @param mimeType - MIME type of the document
|
|
57
|
+
* @param config - Optional extraction configuration
|
|
58
|
+
* @returns The extraction result
|
|
59
|
+
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
60
|
+
*
|
|
61
|
+
* @example
|
|
62
|
+
* ```typescript
|
|
63
|
+
* const bytes = new Uint8Array(buffer);
|
|
64
|
+
* const result = extractBytesSync(bytes, 'application/pdf');
|
|
65
|
+
* console.log(result.content);
|
|
66
|
+
* ```
|
|
67
|
+
*/
|
|
68
|
+
export declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfigType | null): ExtractionResult;
|
|
69
|
+
//# sourceMappingURL=bytes.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bytes.d.ts","sourceRoot":"","sources":["../../typescript/extraction/bytes.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAG9F;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAsB,YAAY,CACjC,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,CAAC,CA4B3B;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,gBAAgB,CAC/B,IAAI,EAAE,UAAU,EAChB,QAAQ,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,gBAAgB,CA4BlB"}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File-based extraction functions
|
|
3
|
+
*
|
|
4
|
+
* Provides extraction functions for files in filesystem-based environments (Node.js, Deno, Bun)
|
|
5
|
+
* and browser File/Blob objects.
|
|
6
|
+
*/
|
|
7
|
+
import type { ExtractionConfig as ExtractionConfigType, ExtractionResult } from "../types.d.ts";
|
|
8
|
+
/**
|
|
9
|
+
* Extract content from a file on the file system
|
|
10
|
+
*
|
|
11
|
+
* Node.js and Deno specific function that reads a file from the file system
|
|
12
|
+
* and extracts content from it. Automatically detects MIME type if not provided.
|
|
13
|
+
*
|
|
14
|
+
* @param path - Path to the file to extract from
|
|
15
|
+
* @param mimeType - Optional MIME type of the file. If not provided, will attempt to detect
|
|
16
|
+
* @param config - Optional extraction configuration
|
|
17
|
+
* @returns Promise resolving to the extraction result
|
|
18
|
+
* @throws {Error} If WASM module is not initialized, file doesn't exist, or extraction fails
|
|
19
|
+
*
|
|
20
|
+
* @example Extract with auto-detection
|
|
21
|
+
* ```typescript
|
|
22
|
+
* const result = await extractFile('./document.pdf');
|
|
23
|
+
* console.log(result.content);
|
|
24
|
+
* ```
|
|
25
|
+
*
|
|
26
|
+
* @example Extract with explicit MIME type
|
|
27
|
+
* ```typescript
|
|
28
|
+
* const result = await extractFile('./document.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document');
|
|
29
|
+
* ```
|
|
30
|
+
*
|
|
31
|
+
* @example Extract from Node.js with config
|
|
32
|
+
* ```typescript
|
|
33
|
+
* import { extractFile } from '@kreuzberg/wasm';
|
|
34
|
+
* import { readFile } from 'fs/promises';
|
|
35
|
+
*
|
|
36
|
+
* const result = await extractFile('./report.xlsx', null, {
|
|
37
|
+
* chunking: {
|
|
38
|
+
* maxChars: 1000
|
|
39
|
+
* }
|
|
40
|
+
* });
|
|
41
|
+
* ```
|
|
42
|
+
*/
|
|
43
|
+
export declare function extractFile(path: string, mimeType?: string | null, config?: ExtractionConfigType | null): Promise<ExtractionResult>;
|
|
44
|
+
/**
|
|
45
|
+
* Extract content from a File or Blob (browser-friendly wrapper)
|
|
46
|
+
*
|
|
47
|
+
* Convenience function that wraps fileToUint8Array and extractBytes,
|
|
48
|
+
* providing a streamlined API for browser applications handling file inputs.
|
|
49
|
+
*
|
|
50
|
+
* @param file - The File or Blob to extract from
|
|
51
|
+
* @param mimeType - Optional MIME type. If not provided, uses file.type if available
|
|
52
|
+
* @param config - Optional extraction configuration
|
|
53
|
+
* @returns Promise resolving to the extraction result
|
|
54
|
+
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
55
|
+
*
|
|
56
|
+
* @example Simple file extraction
|
|
57
|
+
* ```typescript
|
|
58
|
+
* const fileInput = document.getElementById('file');
|
|
59
|
+
* fileInput.addEventListener('change', async (e) => {
|
|
60
|
+
* const file = e.target.files?.[0];
|
|
61
|
+
* if (file) {
|
|
62
|
+
* const result = await extractFromFile(file);
|
|
63
|
+
* console.log(result.content);
|
|
64
|
+
* }
|
|
65
|
+
* });
|
|
66
|
+
* ```
|
|
67
|
+
*
|
|
68
|
+
* @example With configuration
|
|
69
|
+
* ```typescript
|
|
70
|
+
* const result = await extractFromFile(file, file.type, {
|
|
71
|
+
* chunking: { maxChars: 1000 },
|
|
72
|
+
* images: { extractImages: true }
|
|
73
|
+
* });
|
|
74
|
+
* ```
|
|
75
|
+
*/
|
|
76
|
+
export declare function extractFromFile(file: File | Blob, mimeType?: string | null, config?: ExtractionConfigType | null): Promise<ExtractionResult>;
|
|
77
|
+
//# sourceMappingURL=files.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"files.d.ts","sourceRoot":"","sources":["../../typescript/extraction/files.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAIH,OAAO,KAAK,EAAE,gBAAgB,IAAI,oBAAoB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AAI9F;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AACH,wBAAsB,WAAW,CAChC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,MAAM,GAAG,IAAI,EACxB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,CAAC,CAmD3B;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AACH,wBAAsB,eAAe,CACpC,IAAI,EAAE,IAAI,GAAG,IAAI,EACjB,QAAQ,CAAC,EAAE,MAAM,GAAG,IAAI,EACxB,MAAM,CAAC,EAAE,oBAAoB,GAAG,IAAI,GAClC,OAAO,CAAC,gBAAgB,CAAC,CAiB3B"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Extraction module
|
|
3
|
+
*
|
|
4
|
+
* Provides comprehensive extraction functionality for various document formats.
|
|
5
|
+
* Includes byte-based, file-based, and batch processing capabilities.
|
|
6
|
+
*/
|
|
7
|
+
export type { ExtractionConfig, ExtractionResult } from "../types.d.ts";
|
|
8
|
+
export { batchExtractBytes, batchExtractBytesSync, batchExtractFiles } from "./batch.d.ts";
|
|
9
|
+
export { extractBytes, extractBytesSync } from "./bytes.d.ts";
|
|
10
|
+
export { extractFile, extractFromFile } from "./files.d.ts";
|
|
11
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../typescript/extraction/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,YAAY,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,MAAM,aAAa,CAAC;AACtE,OAAO,EAAE,iBAAiB,EAAE,qBAAqB,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AACzF,OAAO,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AAC5D,OAAO,EAAE,WAAW,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Internal extraction module helpers
|
|
3
|
+
*
|
|
4
|
+
* Provides internal utilities and access to the WASM module state.
|
|
5
|
+
* Re-exports state management from the centralized state module.
|
|
6
|
+
*/
|
|
7
|
+
import { type WasmModule } from "../initialization/state.d.ts";
|
|
8
|
+
/**
|
|
9
|
+
* Get the WASM module
|
|
10
|
+
*
|
|
11
|
+
* @returns The WASM module
|
|
12
|
+
* @throws {Error} If WASM module is not loaded
|
|
13
|
+
*/
|
|
14
|
+
export declare function getWasmModule(): WasmModule;
|
|
15
|
+
/**
|
|
16
|
+
* Check if WASM module is initialized
|
|
17
|
+
*
|
|
18
|
+
* @returns True if WASM module is initialized
|
|
19
|
+
*/
|
|
20
|
+
export declare function isInitialized(): boolean;
|
|
21
|
+
//# sourceMappingURL=internal.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"internal.d.ts","sourceRoot":"","sources":["../../typescript/extraction/internal.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAGN,KAAK,UAAU,EACf,MAAM,4BAA4B,CAAC;AAEpC;;;;;GAKG;AACH,wBAAgB,aAAa,IAAI,UAAU,CAO1C;AAED;;;;GAIG;AACH,wBAAgB,aAAa,IAAI,OAAO,CAEvC"}
|
package/dist/index.d.ts
CHANGED
|
@@ -57,7 +57,7 @@
|
|
|
57
57
|
* ## Runtime Detection
|
|
58
58
|
*
|
|
59
59
|
* ```typescript
|
|
60
|
-
* import { detectRuntime, getWasmCapabilities } from '@kreuzberg/wasm
|
|
60
|
+
* import { detectRuntime, getWasmCapabilities } from '@kreuzberg/wasm';
|
|
61
61
|
*
|
|
62
62
|
* const runtime = detectRuntime();
|
|
63
63
|
* const caps = getWasmCapabilities();
|
|
@@ -93,330 +93,16 @@
|
|
|
93
93
|
* const result = await extractBytes(bytes, 'application/pdf', config);
|
|
94
94
|
* ```
|
|
95
95
|
*/
|
|
96
|
-
|
|
96
|
+
export type * from "./types.d.ts";
|
|
97
|
+
export { initializePdfiumAsync } from "./initialization/pdfium-loader.d.ts";
|
|
98
|
+
export { getInitializationError, getVersion, getWasmModule, initWasm, isInitialized, type ModuleInfo, type WasmModule, } from "./initialization/wasm-loader.d.ts";
|
|
99
|
+
export { extractBytes, extractBytesSync } from "./extraction/bytes.d.ts";
|
|
100
|
+
export { extractFile, extractFromFile } from "./extraction/files.d.ts";
|
|
101
|
+
export { batchExtractBytes, batchExtractBytesSync, batchExtractFiles, } from "./extraction/batch.d.ts";
|
|
102
|
+
export { enableOcr } from "./ocr/enabler.d.ts";
|
|
97
103
|
export { configToJS, fileToUint8Array, isValidExtractionResult, jsToExtractionResult, wrapWasmError, } from "./adapters/wasm-adapter.d.ts";
|
|
98
104
|
export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend, } from "./ocr/registry.d.ts";
|
|
99
105
|
export { TesseractWasmBackend } from "./ocr/tesseract-wasm-backend.d.ts";
|
|
100
106
|
export { clearPostProcessors, clearValidators, getPostProcessor, getValidator, listPostProcessors, listValidators, type PostProcessor, registerPostProcessor, registerValidator, unregisterPostProcessor, unregisterValidator, type Validator, } from "./plugin-registry.d.ts";
|
|
101
|
-
export { detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment, type RuntimeType, type WasmCapabilities, } from "./runtime.d.ts";
|
|
102
|
-
export type * from "./types.d.ts";
|
|
103
|
-
export type { Chunk, ChunkingConfig, ChunkMetadata, ExtractedImage, ExtractionConfig, ExtractionResult, ImageExtractionConfig, LanguageDetectionConfig, Metadata, OcrBackendProtocol, OcrConfig, PageContent, PageExtractionConfig, PdfConfig, PostProcessorConfig, Table, TesseractConfig, TokenReductionConfig, } from "./types.d.ts";
|
|
104
|
-
export declare function initWasm(): Promise<void>;
|
|
105
|
-
/**
|
|
106
|
-
* Check if WASM module is initialized
|
|
107
|
-
*
|
|
108
|
-
* @returns True if WASM module is initialized, false otherwise
|
|
109
|
-
*
|
|
110
|
-
* @example
|
|
111
|
-
* ```typescript
|
|
112
|
-
* if (!isInitialized()) {
|
|
113
|
-
* await initWasm();
|
|
114
|
-
* }
|
|
115
|
-
* ```
|
|
116
|
-
*/
|
|
117
|
-
export declare function isInitialized(): boolean;
|
|
118
|
-
/**
|
|
119
|
-
* Get WASM module version
|
|
120
|
-
*
|
|
121
|
-
* @throws {Error} If WASM module is not initialized
|
|
122
|
-
* @returns The version string of the WASM module
|
|
123
|
-
*
|
|
124
|
-
* @example
|
|
125
|
-
* ```typescript
|
|
126
|
-
* const version = getVersion();
|
|
127
|
-
* console.log(`Using Kreuzberg ${version}`);
|
|
128
|
-
* ```
|
|
129
|
-
*/
|
|
130
|
-
export declare function getVersion(): string;
|
|
131
|
-
/**
|
|
132
|
-
* Get initialization error if module failed to load
|
|
133
|
-
*
|
|
134
|
-
* @returns The error that occurred during initialization, or null if no error
|
|
135
|
-
*
|
|
136
|
-
* @internal
|
|
137
|
-
*/
|
|
138
|
-
export declare function getInitializationError(): Error | null;
|
|
139
|
-
/**
|
|
140
|
-
* Extract content from bytes (document data)
|
|
141
|
-
*
|
|
142
|
-
* Extracts text, metadata, tables, images, and other content from document bytes.
|
|
143
|
-
* Automatically detects document type from MIME type and applies appropriate extraction logic.
|
|
144
|
-
*
|
|
145
|
-
* @param data - The document bytes to extract from
|
|
146
|
-
* @param mimeType - MIME type of the document (e.g., 'application/pdf', 'image/jpeg')
|
|
147
|
-
* @param config - Optional extraction configuration
|
|
148
|
-
* @returns Promise resolving to the extraction result
|
|
149
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
150
|
-
*
|
|
151
|
-
* @example Extract PDF
|
|
152
|
-
* ```typescript
|
|
153
|
-
* const bytes = new Uint8Array(buffer);
|
|
154
|
-
* const result = await extractBytes(bytes, 'application/pdf');
|
|
155
|
-
* console.log(result.content);
|
|
156
|
-
* console.log(result.tables);
|
|
157
|
-
* ```
|
|
158
|
-
*
|
|
159
|
-
* @example Extract with Configuration
|
|
160
|
-
* ```typescript
|
|
161
|
-
* const result = await extractBytes(bytes, 'application/pdf', {
|
|
162
|
-
* ocr: {
|
|
163
|
-
* backend: 'tesseract',
|
|
164
|
-
* language: 'deu' // German
|
|
165
|
-
* },
|
|
166
|
-
* images: {
|
|
167
|
-
* extractImages: true,
|
|
168
|
-
* targetDpi: 200
|
|
169
|
-
* }
|
|
170
|
-
* });
|
|
171
|
-
* ```
|
|
172
|
-
*
|
|
173
|
-
* @example Extract from File
|
|
174
|
-
* ```typescript
|
|
175
|
-
* const file = inputEvent.target.files[0];
|
|
176
|
-
* const bytes = await fileToUint8Array(file);
|
|
177
|
-
* const result = await extractBytes(bytes, file.type);
|
|
178
|
-
* ```
|
|
179
|
-
*/
|
|
180
|
-
export declare function extractBytes(data: Uint8Array, mimeType: string, config?: ExtractionConfigType | null): Promise<ExtractionResult>;
|
|
181
|
-
/**
|
|
182
|
-
* Extract content from a file on the file system
|
|
183
|
-
*
|
|
184
|
-
* Node.js and Deno specific function that reads a file from the file system
|
|
185
|
-
* and extracts content from it. Automatically detects MIME type if not provided.
|
|
186
|
-
*
|
|
187
|
-
* @param path - Path to the file to extract from
|
|
188
|
-
* @param mimeType - Optional MIME type of the file. If not provided, will attempt to detect
|
|
189
|
-
* @param config - Optional extraction configuration
|
|
190
|
-
* @returns Promise resolving to the extraction result
|
|
191
|
-
* @throws {Error} If WASM module is not initialized, file doesn't exist, or extraction fails
|
|
192
|
-
*
|
|
193
|
-
* @example Extract with auto-detection
|
|
194
|
-
* ```typescript
|
|
195
|
-
* const result = await extractFile('./document.pdf');
|
|
196
|
-
* console.log(result.content);
|
|
197
|
-
* ```
|
|
198
|
-
*
|
|
199
|
-
* @example Extract with explicit MIME type
|
|
200
|
-
* ```typescript
|
|
201
|
-
* const result = await extractFile('./document.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document');
|
|
202
|
-
* ```
|
|
203
|
-
*
|
|
204
|
-
* @example Extract from Node.js with config
|
|
205
|
-
* ```typescript
|
|
206
|
-
* import { extractFile } from '@kreuzberg/wasm';
|
|
207
|
-
* import { readFile } from 'fs/promises';
|
|
208
|
-
*
|
|
209
|
-
* const result = await extractFile('./report.xlsx', null, {
|
|
210
|
-
* chunking: {
|
|
211
|
-
* maxChars: 1000
|
|
212
|
-
* }
|
|
213
|
-
* });
|
|
214
|
-
* ```
|
|
215
|
-
*/
|
|
216
|
-
export declare function extractFile(path: string, mimeType?: string | null, config?: ExtractionConfigType | null): Promise<ExtractionResult>;
|
|
217
|
-
/**
|
|
218
|
-
* Extract content from a File or Blob (browser-friendly wrapper)
|
|
219
|
-
*
|
|
220
|
-
* Convenience function that wraps fileToUint8Array and extractBytes,
|
|
221
|
-
* providing a streamlined API for browser applications handling file inputs.
|
|
222
|
-
*
|
|
223
|
-
* @param file - The File or Blob to extract from
|
|
224
|
-
* @param mimeType - Optional MIME type. If not provided, uses file.type if available
|
|
225
|
-
* @param config - Optional extraction configuration
|
|
226
|
-
* @returns Promise resolving to the extraction result
|
|
227
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
228
|
-
*
|
|
229
|
-
* @example Simple file extraction
|
|
230
|
-
* ```typescript
|
|
231
|
-
* const fileInput = document.getElementById('file');
|
|
232
|
-
* fileInput.addEventListener('change', async (e) => {
|
|
233
|
-
* const file = e.target.files?.[0];
|
|
234
|
-
* if (file) {
|
|
235
|
-
* const result = await extractFromFile(file);
|
|
236
|
-
* console.log(result.content);
|
|
237
|
-
* }
|
|
238
|
-
* });
|
|
239
|
-
* ```
|
|
240
|
-
*
|
|
241
|
-
* @example With configuration
|
|
242
|
-
* ```typescript
|
|
243
|
-
* const result = await extractFromFile(file, file.type, {
|
|
244
|
-
* chunking: { maxChars: 1000 },
|
|
245
|
-
* images: { extractImages: true }
|
|
246
|
-
* });
|
|
247
|
-
* ```
|
|
248
|
-
*/
|
|
249
|
-
export declare function extractFromFile(file: File | Blob, mimeType?: string | null, config?: ExtractionConfigType | null): Promise<ExtractionResult>;
|
|
250
|
-
/**
|
|
251
|
-
* Extract content from bytes synchronously
|
|
252
|
-
*
|
|
253
|
-
* Synchronous version of extractBytes. Performs extraction without async operations.
|
|
254
|
-
* Note: Some extraction features may still be async internally, but the wrapper is synchronous.
|
|
255
|
-
*
|
|
256
|
-
* @param data - The document bytes to extract from
|
|
257
|
-
* @param mimeType - MIME type of the document
|
|
258
|
-
* @param config - Optional extraction configuration
|
|
259
|
-
* @returns The extraction result
|
|
260
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
261
|
-
*
|
|
262
|
-
* @example
|
|
263
|
-
* ```typescript
|
|
264
|
-
* const bytes = new Uint8Array(buffer);
|
|
265
|
-
* const result = extractBytesSync(bytes, 'application/pdf');
|
|
266
|
-
* console.log(result.content);
|
|
267
|
-
* ```
|
|
268
|
-
*/
|
|
269
|
-
export declare function extractBytesSync(data: Uint8Array, mimeType: string, config?: ExtractionConfigType | null): ExtractionResult;
|
|
270
|
-
/**
|
|
271
|
-
* Batch extract content from multiple byte arrays asynchronously
|
|
272
|
-
*
|
|
273
|
-
* Extracts content from multiple documents in a single batch operation,
|
|
274
|
-
* allowing for more efficient processing of multiple files.
|
|
275
|
-
*
|
|
276
|
-
* @param files - Array of objects containing data (Uint8Array) and mimeType (string)
|
|
277
|
-
* @param config - Optional extraction configuration applied to all files
|
|
278
|
-
* @returns Promise resolving to array of extraction results
|
|
279
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
280
|
-
*
|
|
281
|
-
* @example
|
|
282
|
-
* ```typescript
|
|
283
|
-
* const files = [
|
|
284
|
-
* { data: pdfBytes, mimeType: 'application/pdf' },
|
|
285
|
-
* { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
|
|
286
|
-
* ];
|
|
287
|
-
* const results = await batchExtractBytes(files);
|
|
288
|
-
* results.forEach((result) => console.log(result.content));
|
|
289
|
-
* ```
|
|
290
|
-
*/
|
|
291
|
-
export declare function batchExtractBytes(files: Array<{
|
|
292
|
-
data: Uint8Array;
|
|
293
|
-
mimeType: string;
|
|
294
|
-
}>, config?: ExtractionConfigType | null): Promise<ExtractionResult[]>;
|
|
295
|
-
/**
|
|
296
|
-
* Batch extract content from multiple byte arrays synchronously
|
|
297
|
-
*
|
|
298
|
-
* Synchronous version of batchExtractBytes. Extracts content from multiple documents
|
|
299
|
-
* in a single batch operation without async operations.
|
|
300
|
-
*
|
|
301
|
-
* @param files - Array of objects containing data (Uint8Array) and mimeType (string)
|
|
302
|
-
* @param config - Optional extraction configuration applied to all files
|
|
303
|
-
* @returns Array of extraction results
|
|
304
|
-
* @throws {Error} If WASM module is not initialized or extraction fails
|
|
305
|
-
*
|
|
306
|
-
* @example
|
|
307
|
-
* ```typescript
|
|
308
|
-
* const files = [
|
|
309
|
-
* { data: pdfBytes, mimeType: 'application/pdf' },
|
|
310
|
-
* { data: docxBytes, mimeType: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' }
|
|
311
|
-
* ];
|
|
312
|
-
* const results = batchExtractBytesSync(files);
|
|
313
|
-
* results.forEach((result) => console.log(result.content));
|
|
314
|
-
* ```
|
|
315
|
-
*/
|
|
316
|
-
export declare function batchExtractBytesSync(files: Array<{
|
|
317
|
-
data: Uint8Array;
|
|
318
|
-
mimeType: string;
|
|
319
|
-
}>, config?: ExtractionConfigType | null): ExtractionResult[];
|
|
320
|
-
/**
|
|
321
|
-
* Batch extract content from multiple File objects asynchronously
|
|
322
|
-
*
|
|
323
|
-
* Convenience function that converts File objects to Uint8Array and calls batchExtractBytes.
|
|
324
|
-
* Automatically uses the file.type as MIME type if available.
|
|
325
|
-
*
|
|
326
|
-
* @param files - Array of File objects to extract from
|
|
327
|
-
* @param config - Optional extraction configuration applied to all files
|
|
328
|
-
* @returns Promise resolving to array of extraction results
|
|
329
|
-
* @throws {Error} If WASM module is not initialized, files cannot be read, or extraction fails
|
|
330
|
-
*
|
|
331
|
-
* @example
|
|
332
|
-
* ```typescript
|
|
333
|
-
* const fileInput = document.getElementById('files');
|
|
334
|
-
* const files = Array.from(fileInput.files ?? []);
|
|
335
|
-
* const results = await batchExtractFiles(files);
|
|
336
|
-
* results.forEach((result, index) => {
|
|
337
|
-
* console.log(`File ${index}: ${result.content.substring(0, 50)}...`);
|
|
338
|
-
* });
|
|
339
|
-
* ```
|
|
340
|
-
*/
|
|
341
|
-
export declare function batchExtractFiles(files: File[], config?: ExtractionConfigType | null): Promise<ExtractionResult[]>;
|
|
342
|
-
/**
|
|
343
|
-
* Enable OCR functionality with tesseract-wasm backend
|
|
344
|
-
*
|
|
345
|
-
* Convenience function that automatically initializes and registers the Tesseract WASM backend.
|
|
346
|
-
* This is the recommended approach for enabling OCR in WASM-based applications.
|
|
347
|
-
*
|
|
348
|
-
* ## Browser Requirement
|
|
349
|
-
*
|
|
350
|
-
* This function requires a browser environment with support for:
|
|
351
|
-
* - WebWorkers (for Tesseract processing)
|
|
352
|
-
* - createImageBitmap (for image conversion)
|
|
353
|
-
* - Blob API
|
|
354
|
-
*
|
|
355
|
-
* ## Network Requirement
|
|
356
|
-
*
|
|
357
|
-
* Training data will be loaded from jsDelivr CDN on first use of each language.
|
|
358
|
-
* Ensure network access to cdn.jsdelivr.net is available.
|
|
359
|
-
*
|
|
360
|
-
* @throws {Error} If not in browser environment or tesseract-wasm is not available
|
|
361
|
-
*
|
|
362
|
-
* @example Basic Usage
|
|
363
|
-
* ```typescript
|
|
364
|
-
* import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
365
|
-
*
|
|
366
|
-
* async function main() {
|
|
367
|
-
* // Initialize WASM module
|
|
368
|
-
* await initWasm();
|
|
369
|
-
*
|
|
370
|
-
* // Enable OCR with tesseract-wasm
|
|
371
|
-
* await enableOcr();
|
|
372
|
-
*
|
|
373
|
-
* // Now you can use OCR in extraction
|
|
374
|
-
* const imageBytes = new Uint8Array(buffer);
|
|
375
|
-
* const result = await extractBytes(imageBytes, 'image/png', {
|
|
376
|
-
* ocr: { backend: 'tesseract-wasm', language: 'eng' }
|
|
377
|
-
* });
|
|
378
|
-
*
|
|
379
|
-
* console.log(result.content); // Extracted text
|
|
380
|
-
* }
|
|
381
|
-
*
|
|
382
|
-
* main().catch(console.error);
|
|
383
|
-
* ```
|
|
384
|
-
*
|
|
385
|
-
* @example With Progress Tracking
|
|
386
|
-
* ```typescript
|
|
387
|
-
* import { enableOcr, TesseractWasmBackend } from '@kreuzberg/wasm';
|
|
388
|
-
*
|
|
389
|
-
* async function setupOcrWithProgress() {
|
|
390
|
-
* const backend = new TesseractWasmBackend();
|
|
391
|
-
* backend.setProgressCallback((progress) => {
|
|
392
|
-
* console.log(`OCR Progress: ${progress}%`);
|
|
393
|
-
* updateProgressBar(progress);
|
|
394
|
-
* });
|
|
395
|
-
*
|
|
396
|
-
* await backend.initialize();
|
|
397
|
-
* registerOcrBackend(backend);
|
|
398
|
-
* }
|
|
399
|
-
*
|
|
400
|
-
* setupOcrWithProgress().catch(console.error);
|
|
401
|
-
* ```
|
|
402
|
-
*
|
|
403
|
-
* @example Multiple Languages
|
|
404
|
-
* ```typescript
|
|
405
|
-
* import { enableOcr, extractBytes, initWasm } from '@kreuzberg/wasm';
|
|
406
|
-
*
|
|
407
|
-
* await initWasm();
|
|
408
|
-
* await enableOcr();
|
|
409
|
-
*
|
|
410
|
-
* // Extract English text
|
|
411
|
-
* const englishResult = await extractBytes(engImageBytes, 'image/png', {
|
|
412
|
-
* ocr: { backend: 'tesseract-wasm', language: 'eng' }
|
|
413
|
-
* });
|
|
414
|
-
*
|
|
415
|
-
* // Extract German text - model is cached after first use
|
|
416
|
-
* const germanResult = await extractBytes(deImageBytes, 'image/png', {
|
|
417
|
-
* ocr: { backend: 'tesseract-wasm', language: 'deu' }
|
|
418
|
-
* });
|
|
419
|
-
* ```
|
|
420
|
-
*/
|
|
421
|
-
export declare function enableOcr(): Promise<void>;
|
|
107
|
+
export { detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isCloudflareWorkers, isDeno, isEdgeEnvironment, isEdgeRuntime, isNode, isServerEnvironment, isWebEnvironment, type RuntimeType, type WasmCapabilities, } from "./runtime.d.ts";
|
|
422
108
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../typescript/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8FG;AAMH,OAAO,KAAK,EAAE,gBAAgB,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../typescript/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8FG;AAMH,mBAAmB,YAAY,CAAC;AAMhC,OAAO,EAAE,qBAAqB,EAAE,MAAM,mCAAmC,CAAC;AAC1E,OAAO,EACN,sBAAsB,EACtB,UAAU,EACV,aAAa,EACb,QAAQ,EACR,aAAa,EACb,KAAK,UAAU,EACf,KAAK,UAAU,GACf,MAAM,iCAAiC,CAAC;AAMzC,OAAO,EAAE,YAAY,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAMvE,OAAO,EAAE,WAAW,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AAMrE,OAAO,EACN,iBAAiB,EACjB,qBAAqB,EACrB,iBAAiB,GACjB,MAAM,uBAAuB,CAAC;AAM/B,OAAO,EAAE,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAM7C,OAAO,EACN,UAAU,EACV,gBAAgB,EAChB,uBAAuB,EACvB,oBAAoB,EACpB,aAAa,GACb,MAAM,4BAA4B,CAAC;AAMpC,OAAO,EACN,gBAAgB,EAChB,aAAa,EACb,eAAe,EACf,kBAAkB,EAClB,oBAAoB,GACpB,MAAM,mBAAmB,CAAC;AAE3B,OAAO,EAAE,oBAAoB,EAAE,MAAM,iCAAiC,CAAC;AAMvE,OAAO,EACN,mBAAmB,EACnB,eAAe,EACf,gBAAgB,EAChB,YAAY,EACZ,kBAAkB,EAClB,cAAc,EACd,KAAK,aAAa,EAClB,qBAAqB,EACrB,iBAAiB,EACjB,uBAAuB,EACvB,mBAAmB,EACnB,KAAK,SAAS,GACd,MAAM,sBAAsB,CAAC;AAM9B,OAAO,EACN,aAAa,EACb,cAAc,EACd,iBAAiB,EACjB,mBAAmB,EACnB,SAAS,EACT,OAAO,EACP,UAAU,EACV,gBAAgB,EAChB,oBAAoB,EACpB,OAAO,EACP,gBAAgB,EAChB,UAAU,EACV,SAAS,EACT,KAAK,EACL,mBAAmB,EACnB,MAAM,EACN,iBAAiB,EACjB,aAAa,EACb,MAAM,EACN,mBAAmB,EACnB,gBAAgB,EAChB,KAAK,WAAW,EAChB,KAAK,gBAAgB,GACrB,MAAM,cAAc,CAAC"}
|