kreuzberg 4.0.0.pre.rc.7 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +13 -12
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +1 -1
  5. data/ext/kreuzberg_rb/native/Cargo.lock +397 -183
  6. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  8. data/kreuzberg.gemspec +34 -2
  9. data/lib/kreuzberg/cache_api.rb +35 -0
  10. data/lib/kreuzberg/error_context.rb +49 -1
  11. data/lib/kreuzberg/extraction_api.rb +255 -0
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/lib/kreuzberg.rb +6 -0
  14. data/lib/libpdfium.dylib +0 -0
  15. data/sig/kreuzberg.rbs +9 -0
  16. data/vendor/Cargo.toml +44 -0
  17. data/vendor/kreuzberg/Cargo.toml +65 -35
  18. data/vendor/kreuzberg/README.md +50 -0
  19. data/vendor/kreuzberg/build.rs +548 -190
  20. data/vendor/kreuzberg/src/api/mod.rs +0 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  22. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  23. data/vendor/kreuzberg/src/error.rs +1 -1
  24. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  25. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  26. data/vendor/kreuzberg/src/extractors/pdf.rs +99 -47
  27. data/vendor/kreuzberg/src/mcp/mod.rs +3 -2
  28. data/vendor/kreuzberg/src/mcp/server.rs +106 -0
  29. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  30. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -0
  31. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  32. data/vendor/kreuzberg/src/pdf/mod.rs +6 -0
  33. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  34. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  35. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  36. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  37. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  38. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  39. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  40. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  41. data/vendor/kreuzberg-ffi/README.md +851 -0
  42. data/vendor/kreuzberg-ffi/build.rs +176 -0
  43. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  44. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  45. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  46. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  47. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  48. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  49. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  50. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  51. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  52. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  53. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  54. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  55. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  56. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  57. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  58. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  59. data/vendor/kreuzberg-tesseract/README.md +399 -0
  60. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  61. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  62. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  63. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  64. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  65. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  66. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  67. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  68. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  69. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  70. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  71. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  72. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  73. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  74. data/vendor/rb-sys/src/lib.rs +1 -0
  75. metadata +41 -3
  76. data/vendor/rb-sys/bin/release.sh +0 -22
@@ -0,0 +1,1087 @@
1
+ /* Auto-generated C bindings for Kreuzberg */
2
+
3
+ #ifndef KREUZBERG_FFI_H
4
+ #define KREUZBERG_FFI_H
5
+
6
+ #pragma once
7
+
8
+ /* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */
9
+
10
+ #include <stdarg.h>
11
+ #include <stdbool.h>
12
+ #include <stdint.h>
13
+ #include <stdlib.h>
14
+ /**
15
+ * Opaque type for extraction configuration.
16
+ * This is an opaque pointer type - callers should not access its internals.
17
+ */
18
+ typedef struct ExtractionConfig ExtractionConfig;
19
+
20
+
21
+ /**
22
+ * C-compatible extraction result structure
23
+ *
24
+ * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
25
+ * Field order: 11 pointers (8 bytes each) + 1 bool + 7 bytes padding = 96 bytes total
26
+ */
27
+ typedef struct CExtractionResult {
28
+ /**
29
+ * Extracted text content (null-terminated UTF-8 string, must be freed with kreuzberg_free_string)
30
+ */
31
+ char *content;
32
+ /**
33
+ * Detected MIME type (null-terminated string, must be freed with kreuzberg_free_string)
34
+ */
35
+ char *mime_type;
36
+ /**
37
+ * Document language (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
38
+ */
39
+ char *language;
40
+ /**
41
+ * Document date (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
42
+ */
43
+ char *date;
44
+ /**
45
+ * Document subject (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
46
+ */
47
+ char *subject;
48
+ /**
49
+ * Tables as JSON array (null-terminated string, or NULL if no tables, must be freed with kreuzberg_free_string)
50
+ */
51
+ char *tables_json;
52
+ /**
53
+ * Detected languages as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
54
+ */
55
+ char *detected_languages_json;
56
+ /**
57
+ * Metadata as JSON object (null-terminated string, or NULL if no metadata, must be freed with kreuzberg_free_string)
58
+ */
59
+ char *metadata_json;
60
+ /**
61
+ * Text chunks as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
62
+ */
63
+ char *chunks_json;
64
+ /**
65
+ * Extracted images as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
66
+ */
67
+ char *images_json;
68
+ /**
69
+ * Page structure as JSON object (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
70
+ */
71
+ char *page_structure_json;
72
+ /**
73
+ * Whether extraction was successful
74
+ */
75
+ bool success;
76
+ /**
77
+ * Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
78
+ */
79
+ uint8_t _padding1[7];
80
+ } CExtractionResult;
81
+
82
+ /**
83
+ * C-compatible structure for batch extraction results
84
+ *
85
+ * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
86
+ * Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 bool + 7 bytes padding = 24 bytes total
87
+ */
88
+ typedef struct CBatchResult {
89
+ /**
90
+ * Array of extraction results
91
+ */
92
+ struct CExtractionResult **results;
93
+ /**
94
+ * Number of results
95
+ */
96
+ uintptr_t count;
97
+ /**
98
+ * Whether batch operation was successful
99
+ */
100
+ bool success;
101
+ /**
102
+ * Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
103
+ */
104
+ uint8_t _padding2[7];
105
+ } CBatchResult;
106
+
107
+ /**
108
+ * C-compatible structure for passing byte array with MIME type in batch operations
109
+ *
110
+ * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
111
+ * Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 pointer (8 bytes) = 24 bytes total
112
+ */
113
+ typedef struct CBytesWithMime {
114
+ /**
115
+ * Pointer to byte data
116
+ */
117
+ const uint8_t *data;
118
+ /**
119
+ * Length of byte data
120
+ */
121
+ uintptr_t data_len;
122
+ /**
123
+ * MIME type as null-terminated C string
124
+ */
125
+ const char *mime_type;
126
+ } CBytesWithMime;
127
+
128
+ /**
129
+ * Type alias for the OCR backend callback function.
130
+ *
131
+ * # Parameters
132
+ *
133
+ * - `image_bytes`: Pointer to image data
134
+ * - `image_length`: Length of image data in bytes
135
+ * - `config_json`: JSON-encoded OcrConfig (null-terminated string)
136
+ *
137
+ * # Returns
138
+ *
139
+ * Null-terminated string containing extracted text (must be freed by Rust via kreuzberg_free_string),
140
+ * or NULL on error.
141
+ *
142
+ * # Safety
143
+ *
144
+ * The callback must:
145
+ * - Not store the image_bytes pointer (it's only valid for the duration of the call)
146
+ * - Return a valid null-terminated UTF-8 string that kreuzberg_free_string can free (e.g. one created with kreuzberg_clone_string)
147
+ * - Return NULL on error (error message should be retrievable separately)
148
+ */
149
+ typedef char *(*OcrBackendCallback)(const uint8_t *image_bytes,
150
+ uintptr_t image_length,
151
+ const char *config_json);
152
+
153
+ /**
154
+ * Type alias for the PostProcessor callback function.
155
+ *
156
+ * # Parameters
157
+ *
158
+ * - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
159
+ *
160
+ * # Returns
161
+ *
162
+ * Null-terminated JSON string containing the processed ExtractionResult
163
+ * (must be freed by Rust via kreuzberg_free_string), or NULL on error.
164
+ *
165
+ * # Safety
166
+ *
167
+ * The callback must:
168
+ * - Not store the result_json pointer (it's only valid for the duration of the call)
169
+ * - Return a valid null-terminated UTF-8 JSON string that kreuzberg_free_string can free (e.g. one created with kreuzberg_clone_string)
170
+ * - Return NULL on error (error message should be retrievable separately)
171
+ */
172
+ typedef char *(*PostProcessorCallback)(const char *result_json);
173
+
174
+ /**
175
+ * Type alias for the DocumentExtractor callback function.
176
+ *
177
+ * # Parameters
178
+ *
179
+ * - `content`: Raw document bytes
180
+ * - `content_len`: Length of the content array
181
+ * - `mime_type`: MIME type of the document (null-terminated string)
182
+ * - `config_json`: JSON-encoded ExtractionConfig (null-terminated string)
183
+ *
184
+ * # Returns
185
+ *
186
+ * Null-terminated JSON string containing the ExtractionResult, or NULL on error.
187
+ * The returned string must be freeable by kreuzberg_free_string.
188
+ *
189
+ * # Safety
190
+ *
191
+ * The callback must:
192
+ * - Not store the content, mime_type, or config_json pointers (only valid during the call)
193
+ * - Return a valid null-terminated UTF-8 JSON string or NULL on error
194
+ * - The returned string must be freeable by kreuzberg_free_string
195
+ */
196
+ typedef char *(*DocumentExtractorCallback)(const uint8_t *content,
197
+ uintptr_t content_len,
198
+ const char *mime_type,
199
+ const char *config_json);
200
+
201
+ /**
202
+ * Type alias for the Validator callback function.
203
+ *
204
+ * # Parameters
205
+ *
206
+ * - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
207
+ *
208
+ * # Returns
209
+ *
210
+ * Null-terminated error message string if validation fails (must be freed by Rust
211
+ * via kreuzberg_free_string), or NULL if validation passes.
212
+ *
213
+ * # Safety
214
+ *
215
+ * The callback must:
216
+ * - Not store the result_json pointer (it's only valid for the duration of the call)
217
+ * - Return a valid null-terminated UTF-8 string (error message) if validation fails
218
+ * - Return NULL if validation passes
219
+ * - The returned string must be freeable by kreuzberg_free_string
220
+ */
221
+ typedef char *(*ValidatorCallback)(const char *result_json);
222
+
223
+ /**
224
+ * Extract text and metadata from a file (synchronous).
225
+ *
226
+ * # Safety
227
+ *
228
+ * - `file_path` must be a valid null-terminated C string
229
+ * - The returned pointer must be freed with `kreuzberg_free_result`
230
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
231
+ *
232
+ * # Example (C)
233
+ *
234
+ * ```c
235
+ * const char* path = "/path/to/document.pdf";
236
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
237
+ * if (result != NULL && result->success) {
238
+ * printf("Content: %s\n", result->content);
239
+ * printf("MIME: %s\n", result->mime_type);
240
+ * kreuzberg_free_result(result);
241
+ * } else {
242
+ * const char* error = kreuzberg_last_error();
243
+ * printf("Error: %s\n", error);
244
+ * }
245
+ * ```
246
+ */
247
+ struct CExtractionResult *kreuzberg_extract_file_sync(const char *file_path);
248
+
249
+ /**
250
+ * Detect MIME type from a file path.
251
+ *
252
+ * # Safety
253
+ *
254
+ * - `file_path` must be a valid null-terminated C string
255
+ * - The returned string must be freed with `kreuzberg_free_string`
256
+ * - Returns NULL on error (check `kreuzberg_last_error`)
257
+ */
258
+ char *kreuzberg_detect_mime_type(const char *file_path, bool check_exists);
259
+
260
+ /**
261
+ * Validate that a MIME type is supported by Kreuzberg.
262
+ *
263
+ * # Safety
264
+ *
265
+ * - `mime_type` must be a valid null-terminated C string
266
+ * - The returned string must be freed with `kreuzberg_free_string`
267
+ * - Returns NULL on error (check `kreuzberg_last_error`)
268
+ */
269
+ char *kreuzberg_validate_mime_type(const char *mime_type);
270
+
271
+ /**
272
+ * List available embedding preset names.
273
+ *
274
+ * # Safety
275
+ *
276
+ * - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
277
+ * - Returns NULL on error (check `kreuzberg_last_error`)
278
+ */
279
+ char *kreuzberg_list_embedding_presets(void);
280
+
281
+ /**
282
+ * Get a specific embedding preset by name.
283
+ *
284
+ * # Safety
285
+ *
286
+ * - `name` must be a valid null-terminated C string
287
+ * - Returned string is JSON object and must be freed with `kreuzberg_free_string`
288
+ * - Returns NULL on error (check `kreuzberg_last_error`)
289
+ */
290
+ char *kreuzberg_get_embedding_preset(const char *name);
291
+
292
+ /**
293
+ * Extract text and metadata from a file with custom configuration (synchronous).
294
+ *
295
+ * # Safety
296
+ *
297
+ * - `file_path` must be a valid null-terminated C string
298
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
299
+ * - The returned pointer must be freed with `kreuzberg_free_result`
300
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
301
+ *
302
+ * # Example (C)
303
+ *
304
+ * ```c
305
+ * const char* path = "/path/to/document.pdf";
306
+ * const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
307
+ * CExtractionResult* result = kreuzberg_extract_file_sync_with_config(path, config);
308
+ * if (result != NULL && result->success) {
309
+ * printf("Content: %s\n", result->content);
310
+ * kreuzberg_free_result(result);
311
+ * }
312
+ * ```
313
+ */
314
+ struct CExtractionResult *kreuzberg_extract_file_sync_with_config(const char *file_path,
315
+ const char *config_json);
316
+
317
+ /**
318
+ * Extract text and metadata from byte array (synchronous).
319
+ *
320
+ * # Safety
321
+ *
322
+ * - `data` must be a valid pointer to a byte array of length `data_len`
323
+ * - `mime_type` must be a valid null-terminated C string
324
+ * - The returned pointer must be freed with `kreuzberg_free_result`
325
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
326
+ *
327
+ * # Example (C)
328
+ *
329
+ * ```c
330
+ * const uint8_t* data = ...; // Document bytes
331
+ * size_t len = ...; // Length of data
332
+ * const char* mime = "application/pdf";
333
+ * CExtractionResult* result = kreuzberg_extract_bytes_sync(data, len, mime);
334
+ * if (result != NULL && result->success) {
335
+ * printf("Content: %s\n", result->content);
336
+ * kreuzberg_free_result(result);
337
+ * } else {
338
+ * const char* error = kreuzberg_last_error();
339
+ * printf("Error: %s\n", error);
340
+ * }
341
+ * ```
342
+ */
343
+ struct CExtractionResult *kreuzberg_extract_bytes_sync(const uint8_t *data,
344
+ uintptr_t data_len,
345
+ const char *mime_type);
346
+
347
+ /**
348
+ * Extract text and metadata from byte array with custom configuration (synchronous).
349
+ *
350
+ * # Safety
351
+ *
352
+ * - `data` must be a valid pointer to a byte array of length `data_len`
353
+ * - `mime_type` must be a valid null-terminated C string
354
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
355
+ * - The returned pointer must be freed with `kreuzberg_free_result`
356
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
357
+ *
358
+ * # Example (C)
359
+ *
360
+ * ```c
361
+ * const uint8_t* data = ...; // Document bytes
362
+ * size_t len = ...; // Length of data
363
+ * const char* mime = "application/pdf";
364
+ * const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
365
+ * CExtractionResult* result = kreuzberg_extract_bytes_sync_with_config(data, len, mime, config);
366
+ * if (result != NULL && result->success) {
367
+ * printf("Content: %s\n", result->content);
368
+ * kreuzberg_free_result(result);
369
+ * }
370
+ * ```
371
+ */
372
+ struct CExtractionResult *kreuzberg_extract_bytes_sync_with_config(const uint8_t *data,
373
+ uintptr_t data_len,
374
+ const char *mime_type,
375
+ const char *config_json);
376
+
377
+ /**
378
+ * Batch extract text and metadata from multiple files (synchronous).
379
+ *
380
+ * # Safety
381
+ *
382
+ * - `file_paths` must be a valid pointer to an array of null-terminated C strings
383
+ * - `count` must be the number of file paths in the array
384
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
385
+ * - The returned pointer must be freed with `kreuzberg_free_batch_result`
386
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
387
+ */
388
+ struct CBatchResult *kreuzberg_batch_extract_files_sync(const char *const *file_paths,
389
+ uintptr_t count,
390
+ const char *config_json);
391
+
392
+ /**
393
+ * Batch extract text and metadata from multiple byte arrays (synchronous).
394
+ *
395
+ * # Safety
396
+ *
397
+ * - `items` must be a valid pointer to an array of CBytesWithMime structures
398
+ * - `count` must be the number of items in the array
399
+ * - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
400
+ * - The returned pointer must be freed with `kreuzberg_free_batch_result`
401
+ * - Returns NULL on error (check `kreuzberg_last_error` for details)
402
+ */
403
+ struct CBatchResult *kreuzberg_batch_extract_bytes_sync(const struct CBytesWithMime *items,
404
+ uintptr_t count,
405
+ const char *config_json);
406
+
407
+ /**
408
+ * Load an extraction configuration from a TOML/YAML/JSON file.
409
+ *
410
+ * # Safety
411
+ *
412
+ * - `file_path` must be a valid null-terminated C string
413
+ * - The returned string must be freed with `kreuzberg_free_string`
414
+ * - Returns NULL on error (check `kreuzberg_last_error`)
415
+ */
416
+ char *kreuzberg_load_extraction_config_from_file(const char *file_path);
417
+
418
+ /**
419
+ * Free a batch result returned by batch extraction functions.
420
+ *
421
+ * # Safety
422
+ *
423
+ * - `batch_result` must be a pointer previously returned by a batch extraction function
424
+ * - `batch_result` can be NULL (no-op)
425
+ * - `batch_result` must not be used after this call
426
+ * - All results and strings within the batch result will be freed automatically
427
+ */
428
+ void kreuzberg_free_batch_result(struct CBatchResult *batch_result);
429
+
430
+ /**
431
+ * Free a string returned by Kreuzberg functions.
432
+ *
433
+ * # Safety
434
+ *
435
+ * - `s` must be a string previously returned by a Kreuzberg function
436
+ * - `s` can be NULL (no-op)
437
+ * - `s` must not be used after this call
438
+ *
439
+ * # Example (C)
440
+ *
441
+ * ```c
442
+ * char* str = result->content;
443
+ * kreuzberg_free_string(str);
444
+ * // str is now invalid
445
+ * ```
446
+ */
447
+ void kreuzberg_free_string(char *s);
448
+
449
+ /**
450
+ * Clone a null-terminated string using Rust's allocator.
451
+ *
452
+ * # Safety
453
+ *
454
+ * - `s` must be a valid null-terminated UTF-8 string
455
+ * - Returned pointer must be freed with `kreuzberg_free_string`
456
+ * - Returns NULL on error (check `kreuzberg_last_error`)
457
+ */
458
+ char *kreuzberg_clone_string(const char *s);
459
+
460
+ /**
461
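+ * Free an extraction result returned by a single-document extraction function (`kreuzberg_extract_file_sync`, `kreuzberg_extract_bytes_sync`, or their `_with_config` variants).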
462
+ *
463
+ * # Safety
464
+ *
465
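+ * - `result` must be a pointer previously returned by a single-document extraction function (`kreuzberg_extract_file_sync`, `kreuzberg_extract_bytes_sync`, or their `_with_config` variants)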
466
+ * - `result` can be NULL (no-op)
467
+ * - `result` must not be used after this call
468
+ * - All string fields within the result will be freed automatically
469
+ *
470
+ * # Example (C)
471
+ *
472
+ * ```c
473
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
474
+ * // Use result...
475
+ * kreuzberg_free_result(result);
476
+ * // result is now invalid
477
+ * ```
478
+ */
479
+ void kreuzberg_free_result(struct CExtractionResult *result);
480
+
481
+ /**
482
+ * Get the last error message from a failed operation.
483
+ *
484
+ * # Safety
485
+ *
486
+ * - Returns a library-owned string that must not be freed by the caller
487
+ * - Returns NULL if no error has occurred
488
+ * - The returned string is valid until the next Kreuzberg function call on the same thread
489
+ *
490
+ * # Example (C)
491
+ *
492
+ * ```c
493
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
494
+ * if (result == NULL) {
495
+ * const char* error = kreuzberg_last_error();
496
+ * if (error != NULL) {
497
+ * printf("Error: %s\n", error);
498
+ * }
499
+ * }
500
+ * ```
501
+ */
502
+ const char *kreuzberg_last_error(void);
503
+
504
+ /**
505
+ * Get the error code for the last error.
506
+ *
507
+ * Returns the error code as an int32_t. Error codes are defined in ErrorCode enum:
508
+ * - 0: Success (no error)
509
+ * - 1: GenericError
510
+ * - 2: Panic
511
+ * - 3: InvalidArgument
512
+ * - 4: IoError
513
+ * - 5: ParsingError
514
+ * - 6: OcrError
515
+ * - 7: MissingDependency
516
+ *
517
+ * # Safety
518
+ *
519
+ * This function is thread-safe and always safe to call.
520
+ *
521
+ * # Example (C)
522
+ *
523
+ * ```c
524
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
525
+ * if (result == NULL) {
526
+ * int32_t code = kreuzberg_last_error_code();
527
+ * if (code == 2) {
528
+ * // A panic occurred
529
+ * }
530
+ * }
531
+ * ```
532
+ */
533
+ int32_t kreuzberg_last_error_code(void);
534
+
535
+ /**
536
+ * Get the panic context for the last error (if it was a panic).
537
+ *
538
+ * Returns a JSON string containing panic context information, or NULL if
539
+ * the last error was not a panic.
540
+ *
541
+ * The JSON structure contains:
542
+ * - file: Source file where panic occurred
543
+ * - line: Line number
544
+ * - function: Function name
545
+ * - message: Panic message
546
+ * - timestamp_secs: Unix timestamp (seconds since epoch)
547
+ *
548
+ * # Safety
549
+ *
550
+ * The returned string must be freed with kreuzberg_free_string().
551
+ *
552
+ * # Example (C)
553
+ *
554
+ * ```c
555
+ * CExtractionResult* result = kreuzberg_extract_file_sync(path);
556
+ * if (result == NULL && kreuzberg_last_error_code() == 2) {
557
+ * char* context = kreuzberg_last_panic_context();
558
+ * if (context != NULL) {
559
+ * printf("Panic context: %s\n", context);
560
+ * kreuzberg_free_string(context);
561
+ * }
562
+ * }
563
+ * ```
564
+ */
565
+ char *kreuzberg_last_panic_context(void);
566
+
567
+ /**
568
+ * Get the library version string.
569
+ *
570
+ * # Safety
571
+ *
572
+ * - Returns a static string that does not need to be freed
573
+ * - The returned string is always valid
574
+ *
575
+ * # Example (C)
576
+ *
577
+ * ```c
578
+ * const char* version = kreuzberg_version();
579
+ * printf("Kreuzberg version: %s\n", version);
580
+ * ```
581
+ */
582
+ const char *kreuzberg_version(void);
583
+
584
+ /**
585
+ * Register a custom OCR backend via FFI callback.
586
+ *
587
+ * # Safety
588
+ *
589
+ * - `name` must be a valid null-terminated C string
590
+ * - `callback` must be a valid function pointer that:
591
+ * - Does not store the image_bytes pointer
592
+ * - Returns a null-terminated UTF-8 string or NULL on error
593
+ * - The returned string must be freeable by kreuzberg_free_string
594
+ * - Returns true on success, false on error (check kreuzberg_last_error)
595
+ *
596
+ * # Example (C)
597
+ *
598
+ * ```c
599
+ * char* my_ocr_backend(const uint8_t* image_bytes, size_t image_length, const char* config_json) {
600
+ * // Implement OCR logic here
601
+ * // Return allocated string with result, or NULL on error
602
+ * return kreuzberg_clone_string("Extracted text");
603
+ * }
604
+ *
605
+ * bool success = kreuzberg_register_ocr_backend("my-ocr", my_ocr_backend);
606
+ * if (!success) {
607
+ * const char* error = kreuzberg_last_error();
608
+ * printf("Failed to register: %s\n", error);
609
+ * }
610
+ * ```
611
+ */
612
+ bool kreuzberg_register_ocr_backend(const char *name, OcrBackendCallback callback);
613
+
614
+ /**
615
+ * Register a custom OCR backend with explicit language support via FFI callback.
616
+ *
617
+ * # Safety
618
+ *
619
+ * - `languages_json` must be a null-terminated JSON array of language codes or NULL
620
+ * - See `kreuzberg_register_ocr_backend` for additional safety notes.
621
+ */
622
+ bool kreuzberg_register_ocr_backend_with_languages(const char *name,
623
+ OcrBackendCallback callback,
624
+ const char *languages_json);
625
+
626
+ /**
627
+ * Register a custom PostProcessor via FFI callback.
628
+ *
629
+ * # Safety
630
+ *
631
+ * - `name` must be a valid null-terminated C string
632
+ * - `callback` must be a valid function pointer that:
633
+ * - Does not store the result_json pointer
634
+ * - Returns a null-terminated UTF-8 JSON string or NULL on error
635
+ * - The returned string must be freeable by kreuzberg_free_string
636
+ * - `priority` determines the order of execution (higher priority runs first)
637
+ * - Returns true on success, false on error (check kreuzberg_last_error)
638
+ *
639
+ * # Example (C)
640
+ *
641
+ * ```c
642
+ * char* my_post_processor(const char* result_json) {
643
+ * // Parse result_json, modify it, return JSON string
644
+ * return kreuzberg_clone_string("{\"content\":\"PROCESSED\"}");
645
+ * }
646
+ *
647
+ * bool success = kreuzberg_register_post_processor("my-processor", my_post_processor, 100);
648
+ * if (!success) {
649
+ * const char* error = kreuzberg_last_error();
650
+ * printf("Failed to register: %s\n", error);
651
+ * }
652
+ * ```
653
+ */
654
+ bool kreuzberg_register_post_processor(const char *name,
655
+ PostProcessorCallback callback,
656
+ int32_t priority);
657
+
658
+ /**
659
+ * Register a custom PostProcessor with an explicit processing stage.
660
+ *
661
+ * # Safety
662
+ *
663
+ * - `name` must be a valid null-terminated C string
664
+ * - `stage` must be a valid null-terminated C string containing "early", "middle", or "late"
665
+ * - `callback` must be a valid function pointer that:
666
+ * - Does not store the result_json pointer
667
+ * - Returns a null-terminated UTF-8 JSON string or NULL on error
668
+ * - The returned string must be freeable by kreuzberg_free_string
669
+ * - `priority` determines the order of execution within the stage (higher priority runs first)
670
+ * - Returns true on success, false on error (check kreuzberg_last_error)
671
+ */
672
+ bool kreuzberg_register_post_processor_with_stage(const char *name,
673
+ PostProcessorCallback callback,
674
+ int32_t priority,
675
+ const char *stage);
676
+
677
+ /**
678
+ * Unregister a PostProcessor by name.
679
+ *
680
+ * # Safety
681
+ *
682
+ * - `name` must be a valid null-terminated C string
683
+ * - Returns true on success, false on error (check kreuzberg_last_error)
684
+ *
685
+ * # Example (C)
686
+ *
687
+ * ```c
688
+ * bool success = kreuzberg_unregister_post_processor("my-processor");
689
+ * if (!success) {
690
+ * const char* error = kreuzberg_last_error();
691
+ * printf("Failed to unregister: %s\n", error);
692
+ * }
693
+ * ```
694
+ */
695
+ bool kreuzberg_unregister_post_processor(const char *name);
696
+
697
+ /**
698
+ * Clear all registered PostProcessors.
699
+ *
700
+ * # Safety
701
+ *
702
+ * - Removes all registered processors. Subsequent extractions will run without them.
703
+ * - Returns true on success, false on error.
704
+ */
705
+ bool kreuzberg_clear_post_processors(void);
706
+
707
+ /**
708
+ * List all registered PostProcessors as a JSON array of names.
709
+ *
710
+ * # Safety
711
+ *
712
+ * - Returned string must be freed with `kreuzberg_free_string`.
713
+ * - Returns NULL on error (check `kreuzberg_last_error`).
714
+ */
715
+ char *kreuzberg_list_post_processors(void);
716
+
717
+ /**
718
+ * Register a custom DocumentExtractor via FFI callback.
719
+ *
720
+ * # Safety
721
+ *
722
+ * - `name` must be a valid null-terminated C string
723
+ * - `callback` must be a valid function pointer that:
724
+ * - Does not store the content, mime_type, or config_json pointers
725
+ * - Returns a null-terminated UTF-8 JSON string or NULL on error
726
+ * - The returned string must be freeable by kreuzberg_free_string
727
+ * - `mime_types` must be a valid null-terminated C string containing comma-separated MIME types
728
+ * - `priority` determines the order of selection (higher priority preferred)
729
+ * - Returns true on success, false on error (check kreuzberg_last_error)
730
+ *
731
+ * # Example (C)
732
+ *
733
+ * ```c
734
+ * char* my_extractor(const uint8_t* content, size_t len, const char* mime_type, const char* config) {
735
+ * // Extract content from bytes, return JSON ExtractionResult
736
+ * return kreuzberg_clone_string("{\"content\":\"extracted text\",\"mime_type\":\"text/plain\",\"metadata\":{}}");
737
+ * }
738
+ *
739
+ * bool success = kreuzberg_register_document_extractor(
740
+ * "my-extractor",
741
+ * my_extractor,
742
+ * "application/x-custom,text/x-custom",
743
+ * 100
744
+ * );
745
+ * if (!success) {
746
+ * const char* error = kreuzberg_last_error();
747
+ * printf("Failed to register: %s\n", error);
748
+ * }
749
+ * ```
750
+ */
751
+ bool kreuzberg_register_document_extractor(const char *name,
752
+ DocumentExtractorCallback callback,
753
+ const char *mime_types,
754
+ int32_t priority);
755
+
756
+ /**
757
+ * Unregister a DocumentExtractor by name.
758
+ *
759
+ * # Safety
760
+ *
761
+ * - `name` must be a valid null-terminated C string
762
+ * - Returns true on success, false on error (check kreuzberg_last_error)
763
+ *
764
+ * # Example (C)
765
+ *
766
+ * ```c
767
+ * bool success = kreuzberg_unregister_document_extractor("my-extractor");
768
+ * if (!success) {
769
+ * const char* error = kreuzberg_last_error();
770
+ * printf("Failed to unregister: %s\n", error);
771
+ * }
772
+ * ```
773
+ */
774
+ bool kreuzberg_unregister_document_extractor(const char *name);
775
+
776
+ /**
777
+ * List all registered DocumentExtractors as a JSON array of names.
778
+ *
779
+ * # Safety
780
+ *
781
+ * - Returned string must be freed with `kreuzberg_free_string`.
782
+ * - Returns NULL on error (check `kreuzberg_last_error`).
783
+ */
784
+ char *kreuzberg_list_document_extractors(void);
785
+
786
+ /**
787
+ * Register a custom Validator via FFI callback.
788
+ *
789
+ * # Safety
790
+ *
791
+ * - `name` must be a valid null-terminated C string
792
+ * - `callback` must be a valid function pointer that:
793
+ * - Does not store the result_json pointer
794
+ * - Returns a null-terminated UTF-8 string (error message) if validation fails
795
+ * - Returns NULL if validation passes
796
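+ * - The returned string must be freeable by kreuzberg_free_string (NOTE(review): the example below returns a strdup()'d buffer; confirm that allocator is compatible)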
797
+ * - `priority` determines the order of validation (higher priority runs first)
798
+ * - Returns true on success, false on error (check kreuzberg_last_error)
799
+ *
800
+ * # Example (C)
801
+ *
802
+ * ```c
803
+ * char* my_validator(const char* result_json) {
804
+ * // Parse result_json, validate it
805
+ * // Return error message if validation fails, NULL if passes
806
+ * if (invalid) {
807
+ * return strdup("Validation failed: content too short");
808
+ * }
809
+ * return NULL;
810
+ * }
811
+ *
812
+ * bool success = kreuzberg_register_validator("my-validator", my_validator, 100);
813
+ * if (!success) {
814
+ * const char* error = kreuzberg_last_error();
815
+ * printf("Failed to register: %s\n", error);
816
+ * }
817
+ * ```
818
+ */
819
+ bool kreuzberg_register_validator(const char *name, ValidatorCallback callback, int32_t priority);
820
+
821
+ /**
822
+ * Unregister a Validator by name.
823
+ *
824
+ * # Safety
825
+ *
826
+ * - `name` must be a valid null-terminated C string
827
+ * - Returns true on success, false on error (check kreuzberg_last_error)
828
+ *
829
+ * # Example (C)
830
+ *
831
+ * ```c
832
+ * bool success = kreuzberg_unregister_validator("my-validator");
833
+ * if (!success) {
834
+ * const char* error = kreuzberg_last_error();
835
+ * printf("Failed to unregister: %s\n", error);
836
+ * }
837
+ * ```
838
+ */
839
+ bool kreuzberg_unregister_validator(const char *name);
840
+
841
+ /**
842
+ * Clear all registered Validators.
843
+ *
844
+ * # Safety
845
+ *
846
+ * - Removes all validators. Subsequent extractions will skip custom validation.
847
+ * - Returns true on success, false on error.
848
+ */
849
+ bool kreuzberg_clear_validators(void);
850
+
851
+ /**
852
+ * List all registered Validators as a JSON array of names.
853
+ *
854
+ * # Safety
855
+ *
856
+ * - Returned string must be freed with `kreuzberg_free_string`.
857
+ * - Returns NULL on error (check `kreuzberg_last_error`).
858
+ */
859
+ char *kreuzberg_list_validators(void);
860
+
861
+ /**
862
+ * Unregister an OCR backend by name.
863
+ *
864
+ * # Safety
865
+ *
866
+ * - `name` must be a valid null-terminated C string
867
+ * - Returns true on success, false on error (check kreuzberg_last_error)
868
+ *
869
+ * # Example (C)
870
+ *
871
+ * ```c
872
+ * bool success = kreuzberg_unregister_ocr_backend("custom-ocr");
873
+ * if (!success) {
874
+ * const char* error = kreuzberg_last_error();
875
+ * printf("Failed to unregister: %s\n", error);
876
+ * }
877
+ * ```
878
+ */
879
+ bool kreuzberg_unregister_ocr_backend(const char *name);
880
+
881
+ /**
882
+ * List all registered OCR backends as a JSON array of names.
883
+ *
884
+ * # Safety
885
+ *
886
+ * - Returned string must be freed with `kreuzberg_free_string`.
887
+ * - Returns NULL on error (check `kreuzberg_last_error`).
888
+ *
889
+ * # Example (C)
890
+ *
891
+ * ```c
892
+ * char* backends = kreuzberg_list_ocr_backends();
893
+ * if (backends == NULL) {
894
+ * const char* error = kreuzberg_last_error();
895
+ * printf("Failed to list backends: %s\n", error);
896
+ * } else {
897
+ * printf("OCR backends: %s\n", backends);
898
+ * kreuzberg_free_string(backends);
899
+ * }
900
+ * ```
901
+ */
902
+ char *kreuzberg_list_ocr_backends(void);
903
+
904
+ /**
905
+ * Clear all registered OCR backends.
906
+ *
907
+ * # Safety
908
+ *
909
+ * - Removes all registered OCR backends. Subsequent extractions will use only built-in backends.
910
+ * - Returns true on success, false on error (check kreuzberg_last_error)
911
+ *
912
+ * # Example (C)
913
+ *
914
+ * ```c
915
+ * bool success = kreuzberg_clear_ocr_backends();
916
+ * if (!success) {
917
+ * const char* error = kreuzberg_last_error();
918
+ * printf("Failed to clear OCR backends: %s\n", error);
919
+ * }
920
+ * ```
921
+ */
922
+ bool kreuzberg_clear_ocr_backends(void);
923
+
924
+ /**
925
+ * Clear all registered DocumentExtractors.
926
+ *
927
+ * # Safety
928
+ *
929
+ * - Removes all registered extractors. Subsequent extractions will use only built-in extractors.
930
+ * - Returns true on success, false on error (check kreuzberg_last_error)
931
+ *
932
+ * # Example (C)
933
+ *
934
+ * ```c
935
+ * bool success = kreuzberg_clear_document_extractors();
936
+ * if (!success) {
937
+ * const char* error = kreuzberg_last_error();
938
+ * printf("Failed to clear document extractors: %s\n", error);
939
+ * }
940
+ * ```
941
+ */
942
+ bool kreuzberg_clear_document_extractors(void);
943
+
944
+ /**
945
+ * Detect MIME type from raw bytes.
946
+ *
947
+ * # Safety
948
+ *
949
+ * - `bytes` must be a valid pointer to byte data
950
+ * - `len` must be the number of readable bytes at `bytes`
951
+ * - The returned string must be freed with `kreuzberg_free_string`
952
+ * - Returns NULL on error (check `kreuzberg_last_error`)
953
+ *
954
+ * # Example (C)
955
+ *
956
+ * ```c
957
+ * const char* pdf_bytes = "%PDF-1.4\n";
958
+ * char* mime = kreuzberg_detect_mime_type_from_bytes((const uint8_t*)pdf_bytes, strlen(pdf_bytes));
959
+ * if (mime == NULL) {
960
+ * const char* error = kreuzberg_last_error();
961
+ * printf("Failed to detect MIME type: %s\n", error);
962
+ * } else {
963
+ * printf("MIME type: %s\n", mime);
964
+ * kreuzberg_free_string(mime);
965
+ * }
966
+ * ```
967
+ */
968
+ char *kreuzberg_detect_mime_type_from_bytes(const uint8_t *bytes, uintptr_t len);
969
+
970
+ /**
971
+ * Detect MIME type from file path (checks extension and reads file content).
972
+ *
973
+ * # Safety
974
+ *
975
+ * - `file_path` must be a valid null-terminated C string
976
+ * - The returned string must be freed with `kreuzberg_free_string`
977
+ * - Returns NULL on error (check `kreuzberg_last_error`)
978
+ *
979
+ * # Example (C)
980
+ *
981
+ * ```c
982
+ * char* mime = kreuzberg_detect_mime_type_from_path("document.pdf");
983
+ * if (mime == NULL) {
984
+ * const char* error = kreuzberg_last_error();
985
+ * printf("Failed to detect MIME type: %s\n", error);
986
+ * } else {
987
+ * printf("MIME type: %s\n", mime);
988
+ * kreuzberg_free_string(mime);
989
+ * }
990
+ * ```
991
+ */
992
+ char *kreuzberg_detect_mime_type_from_path(const char *file_path);
993
+
994
+ /**
995
+ * Get file extensions for a MIME type.
996
+ *
997
+ * # Safety
998
+ *
999
+ * - `mime_type` must be a valid null-terminated C string
1000
+ * - The returned string is a JSON array of extensions (must be freed with `kreuzberg_free_string`)
1001
+ * - Returns NULL on error (check `kreuzberg_last_error`)
1002
+ *
1003
+ * # Example (C)
1004
+ *
1005
+ * ```c
1006
+ * char* extensions = kreuzberg_get_extensions_for_mime("application/pdf");
1007
+ * if (extensions == NULL) {
1008
+ * const char* error = kreuzberg_last_error();
1009
+ * printf("Failed to get extensions: %s\n", error);
1010
+ * } else {
1011
+ * printf("Extensions: %s\n", extensions);
1012
+ * kreuzberg_free_string(extensions);
1013
+ * }
1014
+ * ```
1015
+ */
1016
+ char *kreuzberg_get_extensions_for_mime(const char *mime_type);
1017
+
1018
+ /**
1019
+ * Load an ExtractionConfig from a file.
1020
+ *
1021
+ * Automatically detects the file format based on extension:
1022
+ * - `.toml` - TOML format
1023
+ * - `.yaml`, `.yml` - YAML format
1024
+ * - `.json` - JSON format
1025
+ *
1026
+ * # Safety
1027
+ *
1028
+ * - `path` must be a valid null-terminated C string representing a file path
1029
+ * - Returns a pointer to ExtractionConfig on success, NULL on error
1030
+ * - The returned config must be freed with `kreuzberg_free_config`
1031
+ * - Check `kreuzberg_last_error` on NULL return
1032
+ *
1033
+ * # Example (C)
1034
+ *
1035
+ * ```c
1036
+ * ExtractionConfig* config = kreuzberg_config_from_file("kreuzberg.toml");
1037
+ * if (config == NULL) {
1038
+ * const char* error = kreuzberg_last_error();
1039
+ * printf("Failed to load config: %s\n", error);
1040
+ * return 1;
1041
+ * }
1042
+ *
1043
+ * // Use config...
1044
+ * char* result = kreuzberg_extract_file_with_config_sync("document.pdf", config);
1045
+ *
1046
+ * kreuzberg_free_config(config);
1047
+ * ```
1048
+ */
1049
+ ExtractionConfig *kreuzberg_config_from_file(const char *path);
1050
+
1051
+ /**
1052
+ * Discover an ExtractionConfig file by searching the current and parent directories, and return it as a JSON string.
1053
+ *
1054
+ * Searches the current directory and all parent directories for:
1055
+ * - `kreuzberg.toml`
1056
+ * - `kreuzberg.yaml`
1057
+ * - `kreuzberg.yml`
1058
+ * - `kreuzberg.json`
1059
+ *
1060
+ * Returns the first config file found as JSON, or NULL if none found.
1061
+ *
1062
+ * # Safety
1063
+ *
1064
+ * - The returned string must be freed with `kreuzberg_free_string`
1065
+ * - Returns NULL both when no config is found and on error; a non-empty `kreuzberg_last_error` message indicates an error (see example)
1066
+ *
1067
+ * # Example (C)
1068
+ *
1069
+ * ```c
1070
+ * char* config_json = kreuzberg_config_discover();
1071
+ * if (config_json == NULL) {
1072
+ * const char* error = kreuzberg_last_error();
1073
+ * if (error != NULL && strlen(error) > 0) {
1074
+ * printf("Error discovering config: %s\n", error);
1075
+ * return 1;
1076
+ * }
1077
+ * // No config found, use defaults
1078
+ * printf("No config file found\n");
1079
+ * } else {
1080
+ * printf("Config: %s\n", config_json);
1081
+ * kreuzberg_free_string(config_json);
1082
+ * }
1083
+ * ```
1084
+ */
1085
+ char *kreuzberg_config_discover(void);
1086
+
1087
+ #endif /* KREUZBERG_FFI_H */