@kreuzberg/wasm 4.0.0-rc.6
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +982 -0
- package/dist/adapters/wasm-adapter.d.mts +121 -0
- package/dist/adapters/wasm-adapter.d.ts +121 -0
- package/dist/adapters/wasm-adapter.js +241 -0
- package/dist/adapters/wasm-adapter.js.map +1 -0
- package/dist/adapters/wasm-adapter.mjs +221 -0
- package/dist/adapters/wasm-adapter.mjs.map +1 -0
- package/dist/index.d.mts +466 -0
- package/dist/index.d.ts +466 -0
- package/dist/index.js +383 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +384 -0
- package/dist/index.mjs.map +1 -0
- package/dist/kreuzberg_wasm.d.mts +758 -0
- package/dist/kreuzberg_wasm.d.ts +758 -0
- package/dist/kreuzberg_wasm.js +1913 -0
- package/dist/kreuzberg_wasm.mjs +48 -0
- package/dist/kreuzberg_wasm_bg.wasm +0 -0
- package/dist/kreuzberg_wasm_bg.wasm.d.ts +54 -0
- package/dist/ocr/registry.d.mts +102 -0
- package/dist/ocr/registry.d.ts +102 -0
- package/dist/ocr/registry.js +90 -0
- package/dist/ocr/registry.js.map +1 -0
- package/dist/ocr/registry.mjs +70 -0
- package/dist/ocr/registry.mjs.map +1 -0
- package/dist/ocr/tesseract-wasm-backend.d.mts +257 -0
- package/dist/ocr/tesseract-wasm-backend.d.ts +257 -0
- package/dist/ocr/tesseract-wasm-backend.js +454 -0
- package/dist/ocr/tesseract-wasm-backend.js.map +1 -0
- package/dist/ocr/tesseract-wasm-backend.mjs +424 -0
- package/dist/ocr/tesseract-wasm-backend.mjs.map +1 -0
- package/dist/runtime.d.mts +256 -0
- package/dist/runtime.d.ts +256 -0
- package/dist/runtime.js +172 -0
- package/dist/runtime.js.map +1 -0
- package/dist/runtime.mjs +152 -0
- package/dist/runtime.mjs.map +1 -0
- package/dist/snippets/wasm-bindgen-rayon-38edf6e439f6d70d/src/workerHelpers.js +107 -0
- package/dist/types-GJVIvbPy.d.mts +221 -0
- package/dist/types-GJVIvbPy.d.ts +221 -0
- package/package.json +138 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Type definitions for Kreuzberg WASM bindings
|
|
3
|
+
*
|
|
4
|
+
* These types are generated from the Rust core library and define
|
|
5
|
+
* the interface for extraction, configuration, and results.
|
|
6
|
+
*/
|
|
7
|
+
/**
|
|
8
|
+
* Configuration for document extraction
|
|
9
|
+
*/
|
|
10
|
+
interface ExtractionConfig {
|
|
11
|
+
/** OCR configuration */
|
|
12
|
+
ocr?: OcrConfig;
|
|
13
|
+
/** Chunking configuration */
|
|
14
|
+
chunking?: ChunkingConfig;
|
|
15
|
+
/** Image extraction configuration */
|
|
16
|
+
images?: ImageExtractionConfig;
|
|
17
|
+
/** Page extraction configuration */
|
|
18
|
+
pages?: PageExtractionConfig;
|
|
19
|
+
/** Language detection configuration */
|
|
20
|
+
languageDetection?: LanguageDetectionConfig;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* OCR configuration
|
|
24
|
+
*/
|
|
25
|
+
interface OcrConfig {
|
|
26
|
+
/** OCR backend to use */
|
|
27
|
+
backend?: string;
|
|
28
|
+
/** Language codes (ISO 639) */
|
|
29
|
+
languages?: string[];
|
|
30
|
+
/** Whether to perform OCR */
|
|
31
|
+
enabled?: boolean;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Chunking configuration
|
|
35
|
+
*/
|
|
36
|
+
interface ChunkingConfig {
|
|
37
|
+
/** Maximum characters per chunk */
|
|
38
|
+
maxChars?: number;
|
|
39
|
+
/** Overlap between chunks */
|
|
40
|
+
maxOverlap?: number;
|
|
41
|
+
}
|
|
42
|
+
/**
|
|
43
|
+
* Image extraction configuration
|
|
44
|
+
*/
|
|
45
|
+
interface ImageExtractionConfig {
|
|
46
|
+
/** Whether to extract images */
|
|
47
|
+
enabled?: boolean;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Page extraction configuration
|
|
51
|
+
*/
|
|
52
|
+
interface PageExtractionConfig {
|
|
53
|
+
/** Whether to extract per-page content */
|
|
54
|
+
enabled?: boolean;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Language detection configuration
|
|
58
|
+
*/
|
|
59
|
+
interface LanguageDetectionConfig {
|
|
60
|
+
/** Whether to detect languages */
|
|
61
|
+
enabled?: boolean;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Result of document extraction
|
|
65
|
+
*/
|
|
66
|
+
interface ExtractionResult {
|
|
67
|
+
/** Extracted text content */
|
|
68
|
+
content: string;
|
|
69
|
+
/** MIME type of the document */
|
|
70
|
+
mimeType: string;
|
|
71
|
+
/** Document metadata */
|
|
72
|
+
metadata: Metadata;
|
|
73
|
+
/** Extracted tables */
|
|
74
|
+
tables: Table[];
|
|
75
|
+
/** Detected languages (ISO 639 codes) */
|
|
76
|
+
detectedLanguages?: string[] | null;
|
|
77
|
+
/** Text chunks when chunking is enabled */
|
|
78
|
+
chunks?: Chunk[] | null;
|
|
79
|
+
/** Extracted images */
|
|
80
|
+
images?: ExtractedImage[] | null;
|
|
81
|
+
/** Per-page content */
|
|
82
|
+
pages?: PageContent[] | null;
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Document metadata
|
|
86
|
+
*/
|
|
87
|
+
interface Metadata {
|
|
88
|
+
/** Document title */
|
|
89
|
+
title?: string;
|
|
90
|
+
/** Document subject or description */
|
|
91
|
+
subject?: string;
|
|
92
|
+
/** Document author(s) */
|
|
93
|
+
authors?: string[];
|
|
94
|
+
/** Keywords/tags */
|
|
95
|
+
keywords?: string[];
|
|
96
|
+
/** Primary language (ISO 639 code) */
|
|
97
|
+
language?: string;
|
|
98
|
+
/** Creation timestamp (ISO 8601 format) */
|
|
99
|
+
createdAt?: string;
|
|
100
|
+
/** Last modification timestamp (ISO 8601 format) */
|
|
101
|
+
modifiedAt?: string;
|
|
102
|
+
/** User who created the document */
|
|
103
|
+
creator?: string;
|
|
104
|
+
/** User who last modified the document */
|
|
105
|
+
lastModifiedBy?: string;
|
|
106
|
+
/** Number of pages/slides */
|
|
107
|
+
pageCount?: number;
|
|
108
|
+
/** Format-specific metadata */
|
|
109
|
+
formatMetadata?: unknown;
|
|
110
|
+
/** Custom additional fields */
|
|
111
|
+
additional?: Record<string, unknown>;
|
|
112
|
+
}
|
|
113
|
+
/**
|
|
114
|
+
* Extracted table
|
|
115
|
+
*/
|
|
116
|
+
interface Table {
|
|
117
|
+
/** Table cells/rows */
|
|
118
|
+
cells?: string[][];
|
|
119
|
+
/** Table markdown representation */
|
|
120
|
+
markdown?: string;
|
|
121
|
+
/** Page number if available */
|
|
122
|
+
pageNumber?: number;
|
|
123
|
+
/** Table headers */
|
|
124
|
+
headers?: string[];
|
|
125
|
+
/** Table rows */
|
|
126
|
+
rows?: string[][];
|
|
127
|
+
}
|
|
128
|
+
/**
|
|
129
|
+
* Chunk metadata
|
|
130
|
+
*/
|
|
131
|
+
interface ChunkMetadata {
|
|
132
|
+
/** Character start position in original content */
|
|
133
|
+
charStart: number;
|
|
134
|
+
/** Character end position in original content */
|
|
135
|
+
charEnd: number;
|
|
136
|
+
/** Token count if available */
|
|
137
|
+
tokenCount: number | null;
|
|
138
|
+
/** Index of this chunk */
|
|
139
|
+
chunkIndex: number;
|
|
140
|
+
/** Total number of chunks */
|
|
141
|
+
totalChunks: number;
|
|
142
|
+
}
|
|
143
|
+
/**
|
|
144
|
+
* Text chunk from chunked content
|
|
145
|
+
*/
|
|
146
|
+
interface Chunk {
|
|
147
|
+
/** Chunk text content */
|
|
148
|
+
content: string;
|
|
149
|
+
/** Chunk metadata */
|
|
150
|
+
metadata?: ChunkMetadata;
|
|
151
|
+
/** Character position in original content (legacy) */
|
|
152
|
+
charIndex?: number;
|
|
153
|
+
/** Token count if available (legacy) */
|
|
154
|
+
tokenCount?: number;
|
|
155
|
+
/** Embedding vector if computed */
|
|
156
|
+
embedding?: number[] | null;
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* Extracted image from document
|
|
160
|
+
*/
|
|
161
|
+
interface ExtractedImage {
|
|
162
|
+
/** Image data as Uint8Array or base64 string */
|
|
163
|
+
data: Uint8Array | string;
|
|
164
|
+
/** Image format or MIME type (see also `mimeType`) */
|
|
165
|
+
format?: string;
|
|
166
|
+
/** MIME type of the image */
|
|
167
|
+
mimeType?: string;
|
|
168
|
+
/** Image index in document */
|
|
169
|
+
imageIndex?: number;
|
|
170
|
+
/** Page number if available */
|
|
171
|
+
pageNumber?: number | null;
|
|
172
|
+
/** Image width in pixels */
|
|
173
|
+
width?: number | null;
|
|
174
|
+
/** Image height in pixels */
|
|
175
|
+
height?: number | null;
|
|
176
|
+
/** Color space of the image */
|
|
177
|
+
colorspace?: string | null;
|
|
178
|
+
/** Bits per color component */
|
|
179
|
+
bitsPerComponent?: number | null;
|
|
180
|
+
/** Whether this is a mask image */
|
|
181
|
+
isMask?: boolean;
|
|
182
|
+
/** Image description */
|
|
183
|
+
description?: string | null;
|
|
184
|
+
/** Optional OCR result from the image */
|
|
185
|
+
ocrResult?: ExtractionResult | string | null;
|
|
186
|
+
}
|
|
187
|
+
/**
|
|
188
|
+
* Per-page content
|
|
189
|
+
*/
|
|
190
|
+
interface PageContent {
|
|
191
|
+
/** Page number (1-indexed) */
|
|
192
|
+
pageNumber: number;
|
|
193
|
+
/** Text content of the page */
|
|
194
|
+
content: string;
|
|
195
|
+
/** Tables on this page */
|
|
196
|
+
tables?: Table[];
|
|
197
|
+
/** Images on this page */
|
|
198
|
+
images?: ExtractedImage[];
|
|
199
|
+
}
|
|
200
|
+
/**
|
|
201
|
+
* OCR backend protocol/interface
|
|
202
|
+
*/
|
|
203
|
+
interface OcrBackendProtocol {
|
|
204
|
+
/** Get the backend name */
|
|
205
|
+
name(): string;
|
|
206
|
+
/** Get supported language codes */
|
|
207
|
+
supportedLanguages?(): string[];
|
|
208
|
+
/** Initialize the backend */
|
|
209
|
+
initialize(options?: Record<string, unknown>): void | Promise<void>;
|
|
210
|
+
/** Shutdown the backend */
|
|
211
|
+
shutdown?(): void | Promise<void>;
|
|
212
|
+
/** Process an image with OCR */
|
|
213
|
+
processImage(imageData: Uint8Array | string, language?: string): Promise<{
|
|
214
|
+
content: string;
|
|
215
|
+
mime_type: string;
|
|
216
|
+
metadata?: Record<string, unknown>;
|
|
217
|
+
tables?: unknown[];
|
|
218
|
+
} | string>;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
export type { Chunk as C, ExtractionConfig as E, ImageExtractionConfig as I, LanguageDetectionConfig as L, Metadata as M, OcrConfig as O, PageExtractionConfig as P, Table as T, ExtractionResult as a, ChunkingConfig as b, ExtractedImage as c, ChunkMetadata as d, PageContent as e, OcrBackendProtocol as f };
|
package/package.json
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@kreuzberg/wasm",
|
|
3
|
+
"version": "4.0.0-rc.6",
|
|
4
|
+
"description": "Kreuzberg document intelligence - WebAssembly bindings",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "Na'aman Hirschfeld",
|
|
7
|
+
"email": "nhirschfeld@gmail.com",
|
|
8
|
+
"url": "https://kreuzberg.dev"
|
|
9
|
+
},
|
|
10
|
+
"homepage": "https://kreuzberg.dev",
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/kreuzberg-dev/kreuzberg/issues"
|
|
13
|
+
},
|
|
14
|
+
"main": "dist/index.js",
|
|
15
|
+
"module": "dist/index.mjs",
|
|
16
|
+
"types": "dist/index.d.ts",
|
|
17
|
+
"exports": {
|
|
18
|
+
".": {
|
|
19
|
+
"import": {
|
|
20
|
+
"types": "./dist/index.d.mts",
|
|
21
|
+
"default": "./dist/index.mjs"
|
|
22
|
+
},
|
|
23
|
+
"require": {
|
|
24
|
+
"types": "./dist/index.d.ts",
|
|
25
|
+
"default": "./dist/index.js"
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"./runtime": {
|
|
29
|
+
"import": {
|
|
30
|
+
"types": "./dist/runtime.d.mts",
|
|
31
|
+
"default": "./dist/runtime.mjs"
|
|
32
|
+
},
|
|
33
|
+
"require": {
|
|
34
|
+
"types": "./dist/runtime.d.ts",
|
|
35
|
+
"default": "./dist/runtime.js"
|
|
36
|
+
}
|
|
37
|
+
},
|
|
38
|
+
"./adapters/wasm-adapter": {
|
|
39
|
+
"import": {
|
|
40
|
+
"types": "./dist/adapters/wasm-adapter.d.mts",
|
|
41
|
+
"default": "./dist/adapters/wasm-adapter.mjs"
|
|
42
|
+
},
|
|
43
|
+
"require": {
|
|
44
|
+
"types": "./dist/adapters/wasm-adapter.d.ts",
|
|
45
|
+
"default": "./dist/adapters/wasm-adapter.js"
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"./ocr/registry": {
|
|
49
|
+
"import": {
|
|
50
|
+
"types": "./dist/ocr/registry.d.mts",
|
|
51
|
+
"default": "./dist/ocr/registry.mjs"
|
|
52
|
+
},
|
|
53
|
+
"require": {
|
|
54
|
+
"types": "./dist/ocr/registry.d.ts",
|
|
55
|
+
"default": "./dist/ocr/registry.js"
|
|
56
|
+
}
|
|
57
|
+
},
|
|
58
|
+
"./ocr/tesseract-wasm-backend": {
|
|
59
|
+
"import": {
|
|
60
|
+
"types": "./dist/ocr/tesseract-wasm-backend.d.mts",
|
|
61
|
+
"default": "./dist/ocr/tesseract-wasm-backend.mjs"
|
|
62
|
+
},
|
|
63
|
+
"require": {
|
|
64
|
+
"types": "./dist/ocr/tesseract-wasm-backend.d.ts",
|
|
65
|
+
"default": "./dist/ocr/tesseract-wasm-backend.js"
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
},
|
|
69
|
+
"repository": {
|
|
70
|
+
"type": "git",
|
|
71
|
+
"url": "https://github.com/kreuzberg-dev/kreuzberg.git"
|
|
72
|
+
},
|
|
73
|
+
"license": "MIT",
|
|
74
|
+
"keywords": [
|
|
75
|
+
"wasm",
|
|
76
|
+
"webassembly",
|
|
77
|
+
"document-intelligence",
|
|
78
|
+
"document-extraction",
|
|
79
|
+
"text-extraction",
|
|
80
|
+
"pdf-extraction",
|
|
81
|
+
"ocr",
|
|
82
|
+
"pdf",
|
|
83
|
+
"docx",
|
|
84
|
+
"xlsx",
|
|
85
|
+
"pptx",
|
|
86
|
+
"office-documents",
|
|
87
|
+
"table-extraction",
|
|
88
|
+
"metadata-extraction",
|
|
89
|
+
"rust",
|
|
90
|
+
"binding",
|
|
91
|
+
"typescript"
|
|
92
|
+
],
|
|
93
|
+
"files": [
|
|
94
|
+
"dist",
|
|
95
|
+
"pkg",
|
|
96
|
+
"*.wasm",
|
|
97
|
+
"*.d.ts",
|
|
98
|
+
"README.md"
|
|
99
|
+
],
|
|
100
|
+
"engines": {
|
|
101
|
+
"node": ">= 16"
|
|
102
|
+
},
|
|
103
|
+
"publishConfig": {
|
|
104
|
+
"registry": "https://registry.npmjs.org/",
|
|
105
|
+
"access": "public"
|
|
106
|
+
},
|
|
107
|
+
"scripts": {
|
|
108
|
+
"build:wasm:web": "wasm-pack build --target web --out-dir pkg --release",
|
|
109
|
+
"build:wasm:bundler": "wasm-pack build --target bundler --out-dir pkg --release",
|
|
110
|
+
"build:wasm:nodejs": "wasm-pack build --target nodejs --out-dir pkg --release",
|
|
111
|
+
"build:wasm:deno": "wasm-pack build --target deno --out-dir pkg --release",
|
|
112
|
+
"build:ts": "tsup",
|
|
113
|
+
"build": "npm run build:wasm:nodejs && npm run build:ts",
|
|
114
|
+
"build:all": "npm run build:wasm:web && npm run build:wasm:bundler && npm run build:wasm:nodejs && npm run build:wasm:deno && npm run build:ts",
|
|
115
|
+
"typecheck": "tsc --noEmit",
|
|
116
|
+
"lint": "biome check typescript && oxlint typescript",
|
|
117
|
+
"lint:fix": "biome check --write typescript",
|
|
118
|
+
"format": "biome format --write typescript",
|
|
119
|
+
"test": "vitest run",
|
|
120
|
+
"test:watch": "vitest",
|
|
121
|
+
"test:coverage": "vitest run --coverage",
|
|
122
|
+
"test:ui": "vitest --ui",
|
|
123
|
+
"prepublishOnly": "npm run build"
|
|
124
|
+
},
|
|
125
|
+
"devDependencies": {
|
|
126
|
+
"@types/node": "^24.10.1",
|
|
127
|
+
"@vitest/coverage-v8": "^1.4.0",
|
|
128
|
+
"@vitest/ui": "^1.4.0",
|
|
129
|
+
"jsdom": "^24.0.0",
|
|
130
|
+
"oxlint": "^1.31.0",
|
|
131
|
+
"tsup": "^8.5.1",
|
|
132
|
+
"typescript": "^5.9.3",
|
|
133
|
+
"vitest": "^1.4.0"
|
|
134
|
+
},
|
|
135
|
+
"optionalDependencies": {
|
|
136
|
+
"tesseract-wasm": "^0.11.0"
|
|
137
|
+
}
|
|
138
|
+
}
|