@kreuzberg/wasm 4.0.0-rc.21 → 4.0.0-rc.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1,10 +1,226 @@
1
- import { E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.cjs.js';
2
- export { C as Chunk, b as ChunkingConfig, c as ChunkMetadata, d as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, O as OcrBackendProtocol, e as OcrConfig, P as PageContent, f as PageExtractionConfig, g as PdfConfig, h as PostProcessorConfig, T as Table, i as TesseractConfig, j as TokenReductionConfig, E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.cjs.js';
1
+ import { E as ExtractionConfig, a as ExtractionResult } from './types-wVLLDHkl.cjs.js';
2
+ export { C as Chunk, b as ChunkingConfig, c as ChunkMetadata, d as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, O as OcrBackendProtocol, e as OcrConfig, P as PageContent, f as PageExtractionConfig, g as PdfConfig, h as PostProcessorConfig, T as Table, i as TesseractConfig, j as TokenReductionConfig, E as ExtractionConfig, a as ExtractionResult } from './types-wVLLDHkl.cjs.js';
3
3
  export { configToJS, fileToUint8Array, isValidExtractionResult, jsToExtractionResult, wrapWasmError } from './adapters/wasm-adapter.cjs';
4
4
  export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend } from './ocr/registry.cjs';
5
5
  export { TesseractWasmBackend } from './ocr/tesseract-wasm-backend.cjs';
6
6
  export { type RuntimeType, type WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.d.cts';
7
7
 
8
+ /**
9
+ * Plugin Registry Module
10
+ *
11
+ * This module manages registrations and execution of post-processors and validators
12
+ * for document extraction pipelines.
13
+ *
14
+ * # Thread Safety
15
+ * All registrations are stored in Maps and are safe for single-threaded WASM environments.
16
+ *
17
+ * # Global Callback Functions
18
+ * The WASM module can invoke processing via global callback functions:
19
+ * - `__kreuzberg_execute_post_processor`: Execute a registered post-processor
20
+ * - `__kreuzberg_execute_validator`: Execute a registered validator
21
+ */
22
+
23
+ /**
24
+ * Post-processor plugin interface
25
+ *
26
+ * A post-processor modifies extraction results after extraction completes.
27
+ */
28
+ interface PostProcessor {
29
+ /**
30
+ * Get the processor name (must be a non-empty string)
31
+ */
32
+ name(): string;
33
+ /**
34
+ * Get the processing stage (optional, defaults to "middle")
35
+ * - "early": Process early in the pipeline
36
+ * - "middle": Process in the middle of the pipeline
37
+ * - "late": Process late in the pipeline
38
+ */
39
+ stage?(): "early" | "middle" | "late";
40
+ /**
41
+ * Process an extraction result
42
+ * Can be sync or async
43
+ */
44
+ process(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;
45
+ /**
46
+ * Shutdown the processor (optional)
47
+ */
48
+ shutdown?(): void | Promise<void>;
49
+ }
50
+ /**
51
+ * Validator plugin interface
52
+ *
53
+ * A validator checks extraction results for correctness.
54
+ */
55
+ interface Validator {
56
+ /**
57
+ * Get the validator name (must be a non-empty string)
58
+ */
59
+ name(): string;
60
+ /**
61
+ * Get the validation priority (optional, defaults to 50)
62
+ * Higher numbers = higher priority (execute first)
63
+ */
64
+ priority?(): number;
65
+ /**
66
+ * Validate an extraction result
67
+ * Can be sync or async
68
+ */
69
+ validate(result: ExtractionResult): {
70
+ valid: boolean;
71
+ errors: string[];
72
+ } | Promise<{
73
+ valid: boolean;
74
+ errors: string[];
75
+ }>;
76
+ /**
77
+ * Shutdown the validator (optional)
78
+ */
79
+ shutdown?(): void | Promise<void>;
80
+ }
81
+ /**
82
+ * Register a post-processor plugin
83
+ *
84
+ * @param processor - The post-processor to register
85
+ * @throws {Error} If the processor is invalid or missing required methods
86
+ *
87
+ * @example
88
+ * ```typescript
89
+ * const processor: PostProcessor = {
90
+ * name: () => "my-processor",
91
+ * stage: () => "middle",
92
+ * process: async (result) => {
93
+ * result.content = result.content.toUpperCase();
94
+ * return result;
95
+ * }
96
+ * };
97
+ * registerPostProcessor(processor);
98
+ * ```
99
+ */
100
+ declare function registerPostProcessor(processor: PostProcessor): void;
101
+ /**
102
+ * Get a registered post-processor by name
103
+ *
104
+ * @param name - The processor name
105
+ * @returns The processor, or undefined if not found
106
+ *
107
+ * @example
108
+ * ```typescript
109
+ * const processor = getPostProcessor("my-processor");
110
+ * if (processor) {
111
+ * console.log("Found processor:", processor.name());
112
+ * }
113
+ * ```
114
+ */
115
+ declare function getPostProcessor(name: string): PostProcessor | undefined;
116
+ /**
117
+ * List all registered post-processor names
118
+ *
119
+ * @returns Array of processor names
120
+ *
121
+ * @example
122
+ * ```typescript
123
+ * const names = listPostProcessors();
124
+ * console.log("Registered processors:", names);
125
+ * ```
126
+ */
127
+ declare function listPostProcessors(): string[];
128
+ /**
129
+ * Unregister a post-processor and call its shutdown method
130
+ *
131
+ * @param name - The processor name
132
+ * @throws {Error} If the processor is not registered
133
+ *
134
+ * @example
135
+ * ```typescript
136
+ * await unregisterPostProcessor("my-processor");
137
+ * ```
138
+ */
139
+ declare function unregisterPostProcessor(name: string): Promise<void>;
140
+ /**
141
+ * Clear all registered post-processors
142
+ *
143
+ * Calls shutdown on all processors before clearing.
144
+ *
145
+ * @example
146
+ * ```typescript
147
+ * await clearPostProcessors();
148
+ * ```
149
+ */
150
+ declare function clearPostProcessors(): Promise<void>;
151
+ /**
152
+ * Register a validator plugin
153
+ *
154
+ * @param validator - The validator to register
155
+ * @throws {Error} If the validator is invalid or missing required methods
156
+ *
157
+ * @example
158
+ * ```typescript
159
+ * const validator: Validator = {
160
+ * name: () => "my-validator",
161
+ * priority: () => 50,
162
+ * validate: async (result) => {
163
+ * if (!result.content) {
164
+ * return { valid: false, errors: ["Content is empty"] };
165
+ * }
166
+ * return { valid: true, errors: [] };
167
+ * }
168
+ * };
169
+ * registerValidator(validator);
170
+ * ```
171
+ */
172
+ declare function registerValidator(validator: Validator): void;
173
+ /**
174
+ * Get a registered validator by name
175
+ *
176
+ * @param name - The validator name
177
+ * @returns The validator, or undefined if not found
178
+ *
179
+ * @example
180
+ * ```typescript
181
+ * const validator = getValidator("my-validator");
182
+ * if (validator) {
183
+ * console.log("Found validator:", validator.name());
184
+ * }
185
+ * ```
186
+ */
187
+ declare function getValidator(name: string): Validator | undefined;
188
+ /**
189
+ * List all registered validator names
190
+ *
191
+ * @returns Array of validator names
192
+ *
193
+ * @example
194
+ * ```typescript
195
+ * const names = listValidators();
196
+ * console.log("Registered validators:", names);
197
+ * ```
198
+ */
199
+ declare function listValidators(): string[];
200
+ /**
201
+ * Unregister a validator and call its shutdown method
202
+ *
203
+ * @param name - The validator name
204
+ * @throws {Error} If the validator is not registered
205
+ *
206
+ * @example
207
+ * ```typescript
208
+ * await unregisterValidator("my-validator");
209
+ * ```
210
+ */
211
+ declare function unregisterValidator(name: string): Promise<void>;
212
+ /**
213
+ * Clear all registered validators
214
+ *
215
+ * Calls shutdown on all validators before clearing.
216
+ *
217
+ * @example
218
+ * ```typescript
219
+ * await clearValidators();
220
+ * ```
221
+ */
222
+ declare function clearValidators(): Promise<void>;
223
+
8
224
  /**
9
225
  * Kreuzberg - WebAssembly Bindings for Browser and Runtime Environments
10
226
  *
@@ -420,4 +636,4 @@ declare function batchExtractFiles(files: File[], config?: ExtractionConfig | nu
420
636
  */
421
637
  declare function enableOcr(): Promise<void>;
422
638
 
423
- export { batchExtractBytes, batchExtractBytesSync, batchExtractFiles, enableOcr, extractBytes, extractBytesSync, extractFile, extractFromFile, getInitializationError, getVersion, initWasm, isInitialized };
639
+ export { type PostProcessor, type Validator, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, clearPostProcessors, clearValidators, enableOcr, extractBytes, extractBytesSync, extractFile, extractFromFile, getInitializationError, getPostProcessor, getValidator, getVersion, initWasm, isInitialized, listPostProcessors, listValidators, registerPostProcessor, registerValidator, unregisterPostProcessor, unregisterValidator };
package/dist/index.d.ts CHANGED
@@ -1,10 +1,226 @@
1
- import { E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.d.ts';
2
- export { C as Chunk, b as ChunkingConfig, c as ChunkMetadata, d as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, O as OcrBackendProtocol, e as OcrConfig, P as PageContent, f as PageExtractionConfig, g as PdfConfig, h as PostProcessorConfig, T as Table, i as TesseractConfig, j as TokenReductionConfig, E as ExtractionConfig, a as ExtractionResult } from './types-CKjcIYcX.d.ts';
1
+ import { E as ExtractionConfig, a as ExtractionResult } from './types-wVLLDHkl.d.ts';
2
+ export { C as Chunk, b as ChunkingConfig, c as ChunkMetadata, d as ExtractedImage, I as ImageExtractionConfig, L as LanguageDetectionConfig, M as Metadata, O as OcrBackendProtocol, e as OcrConfig, P as PageContent, f as PageExtractionConfig, g as PdfConfig, h as PostProcessorConfig, T as Table, i as TesseractConfig, j as TokenReductionConfig, E as ExtractionConfig, a as ExtractionResult } from './types-wVLLDHkl.d.ts';
3
3
  export { configToJS, fileToUint8Array, isValidExtractionResult, jsToExtractionResult, wrapWasmError } from './adapters/wasm-adapter.js';
4
4
  export { clearOcrBackends, getOcrBackend, listOcrBackends, registerOcrBackend, unregisterOcrBackend } from './ocr/registry.js';
5
5
  export { TesseractWasmBackend } from './ocr/tesseract-wasm-backend.js';
6
6
  export { type RuntimeType, type WasmCapabilities, detectRuntime, getRuntimeInfo, getRuntimeVersion, getWasmCapabilities, hasBigInt, hasBlob, hasFileApi, hasModuleWorkers, hasSharedArrayBuffer, hasWasm, hasWasmStreaming, hasWorkers, isBrowser, isBun, isDeno, isNode, isServerEnvironment, isWebEnvironment } from './runtime.d.ts';
7
7
 
8
+ /**
9
+ * Plugin Registry Module
10
+ *
11
+ * This module manages registrations and execution of post-processors and validators
12
+ * for document extraction pipelines.
13
+ *
14
+ * # Thread Safety
15
+ * All registrations are stored in Maps and are safe for single-threaded WASM environments.
16
+ *
17
+ * # Global Callback Functions
18
+ * The WASM module can invoke processing via global callback functions:
19
+ * - `__kreuzberg_execute_post_processor`: Execute a registered post-processor
20
+ * - `__kreuzberg_execute_validator`: Execute a registered validator
21
+ */
22
+
23
+ /**
24
+ * Post-processor plugin interface
25
+ *
26
+ * A post-processor modifies extraction results after extraction completes.
27
+ */
28
+ interface PostProcessor {
29
+ /**
30
+ * Get the processor name (must be a non-empty string)
31
+ */
32
+ name(): string;
33
+ /**
34
+ * Get the processing stage (optional, defaults to "middle")
35
+ * - "early": Process early in the pipeline
36
+ * - "middle": Process in the middle of the pipeline
37
+ * - "late": Process late in the pipeline
38
+ */
39
+ stage?(): "early" | "middle" | "late";
40
+ /**
41
+ * Process an extraction result
42
+ * Can be sync or async
43
+ */
44
+ process(result: ExtractionResult): ExtractionResult | Promise<ExtractionResult>;
45
+ /**
46
+ * Shutdown the processor (optional)
47
+ */
48
+ shutdown?(): void | Promise<void>;
49
+ }
50
+ /**
51
+ * Validator plugin interface
52
+ *
53
+ * A validator checks extraction results for correctness.
54
+ */
55
+ interface Validator {
56
+ /**
57
+ * Get the validator name (must be a non-empty string)
58
+ */
59
+ name(): string;
60
+ /**
61
+ * Get the validation priority (optional, defaults to 50)
62
+ * Higher numbers = higher priority (execute first)
63
+ */
64
+ priority?(): number;
65
+ /**
66
+ * Validate an extraction result
67
+ * Can be sync or async
68
+ */
69
+ validate(result: ExtractionResult): {
70
+ valid: boolean;
71
+ errors: string[];
72
+ } | Promise<{
73
+ valid: boolean;
74
+ errors: string[];
75
+ }>;
76
+ /**
77
+ * Shutdown the validator (optional)
78
+ */
79
+ shutdown?(): void | Promise<void>;
80
+ }
81
+ /**
82
+ * Register a post-processor plugin
83
+ *
84
+ * @param processor - The post-processor to register
85
+ * @throws {Error} If the processor is invalid or missing required methods
86
+ *
87
+ * @example
88
+ * ```typescript
89
+ * const processor: PostProcessor = {
90
+ * name: () => "my-processor",
91
+ * stage: () => "middle",
92
+ * process: async (result) => {
93
+ * result.content = result.content.toUpperCase();
94
+ * return result;
95
+ * }
96
+ * };
97
+ * registerPostProcessor(processor);
98
+ * ```
99
+ */
100
+ declare function registerPostProcessor(processor: PostProcessor): void;
101
+ /**
102
+ * Get a registered post-processor by name
103
+ *
104
+ * @param name - The processor name
105
+ * @returns The processor, or undefined if not found
106
+ *
107
+ * @example
108
+ * ```typescript
109
+ * const processor = getPostProcessor("my-processor");
110
+ * if (processor) {
111
+ * console.log("Found processor:", processor.name());
112
+ * }
113
+ * ```
114
+ */
115
+ declare function getPostProcessor(name: string): PostProcessor | undefined;
116
+ /**
117
+ * List all registered post-processor names
118
+ *
119
+ * @returns Array of processor names
120
+ *
121
+ * @example
122
+ * ```typescript
123
+ * const names = listPostProcessors();
124
+ * console.log("Registered processors:", names);
125
+ * ```
126
+ */
127
+ declare function listPostProcessors(): string[];
128
+ /**
129
+ * Unregister a post-processor and call its shutdown method
130
+ *
131
+ * @param name - The processor name
132
+ * @throws {Error} If the processor is not registered
133
+ *
134
+ * @example
135
+ * ```typescript
136
+ * await unregisterPostProcessor("my-processor");
137
+ * ```
138
+ */
139
+ declare function unregisterPostProcessor(name: string): Promise<void>;
140
+ /**
141
+ * Clear all registered post-processors
142
+ *
143
+ * Calls shutdown on all processors before clearing.
144
+ *
145
+ * @example
146
+ * ```typescript
147
+ * await clearPostProcessors();
148
+ * ```
149
+ */
150
+ declare function clearPostProcessors(): Promise<void>;
151
+ /**
152
+ * Register a validator plugin
153
+ *
154
+ * @param validator - The validator to register
155
+ * @throws {Error} If the validator is invalid or missing required methods
156
+ *
157
+ * @example
158
+ * ```typescript
159
+ * const validator: Validator = {
160
+ * name: () => "my-validator",
161
+ * priority: () => 50,
162
+ * validate: async (result) => {
163
+ * if (!result.content) {
164
+ * return { valid: false, errors: ["Content is empty"] };
165
+ * }
166
+ * return { valid: true, errors: [] };
167
+ * }
168
+ * };
169
+ * registerValidator(validator);
170
+ * ```
171
+ */
172
+ declare function registerValidator(validator: Validator): void;
173
+ /**
174
+ * Get a registered validator by name
175
+ *
176
+ * @param name - The validator name
177
+ * @returns The validator, or undefined if not found
178
+ *
179
+ * @example
180
+ * ```typescript
181
+ * const validator = getValidator("my-validator");
182
+ * if (validator) {
183
+ * console.log("Found validator:", validator.name());
184
+ * }
185
+ * ```
186
+ */
187
+ declare function getValidator(name: string): Validator | undefined;
188
+ /**
189
+ * List all registered validator names
190
+ *
191
+ * @returns Array of validator names
192
+ *
193
+ * @example
194
+ * ```typescript
195
+ * const names = listValidators();
196
+ * console.log("Registered validators:", names);
197
+ * ```
198
+ */
199
+ declare function listValidators(): string[];
200
+ /**
201
+ * Unregister a validator and call its shutdown method
202
+ *
203
+ * @param name - The validator name
204
+ * @throws {Error} If the validator is not registered
205
+ *
206
+ * @example
207
+ * ```typescript
208
+ * await unregisterValidator("my-validator");
209
+ * ```
210
+ */
211
+ declare function unregisterValidator(name: string): Promise<void>;
212
+ /**
213
+ * Clear all registered validators
214
+ *
215
+ * Calls shutdown on all validators before clearing.
216
+ *
217
+ * @example
218
+ * ```typescript
219
+ * await clearValidators();
220
+ * ```
221
+ */
222
+ declare function clearValidators(): Promise<void>;
223
+
8
224
  /**
9
225
  * Kreuzberg - WebAssembly Bindings for Browser and Runtime Environments
10
226
  *
@@ -420,4 +636,4 @@ declare function batchExtractFiles(files: File[], config?: ExtractionConfig | nu
420
636
  */
421
637
  declare function enableOcr(): Promise<void>;
422
638
 
423
- export { batchExtractBytes, batchExtractBytesSync, batchExtractFiles, enableOcr, extractBytes, extractBytesSync, extractFile, extractFromFile, getInitializationError, getVersion, initWasm, isInitialized };
639
+ export { type PostProcessor, type Validator, batchExtractBytes, batchExtractBytesSync, batchExtractFiles, clearPostProcessors, clearValidators, enableOcr, extractBytes, extractBytesSync, extractFile, extractFromFile, getInitializationError, getPostProcessor, getValidator, getVersion, initWasm, isInitialized, listPostProcessors, listValidators, registerPostProcessor, registerValidator, unregisterPostProcessor, unregisterValidator };