kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,3012 @@
|
|
|
1
|
+
/* Auto-generated C bindings for Kreuzberg */
|
|
2
|
+
|
|
3
|
+
#ifndef KREUZBERG_FFI_H
|
|
4
|
+
#define KREUZBERG_FFI_H
|
|
5
|
+
|
|
6
|
+
#pragma once
|
|
7
|
+
|
|
8
|
+
/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */
|
|
9
|
+
|
|
10
|
+
#include <stdarg.h>
|
|
11
|
+
#include <stdbool.h>
|
|
12
|
+
#include <stdint.h>
|
|
13
|
+
#include <stdlib.h>
|
|
14
|
+
/**
|
|
15
|
+
* Opaque type for extraction configuration.
|
|
16
|
+
* This is an opaque pointer type - callers should not access its internals.
|
|
17
|
+
*/
|
|
18
|
+
typedef struct ExtractionConfig ExtractionConfig;
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Opaque type for extraction result.
|
|
22
|
+
* This is an opaque pointer type - callers should not access its internals.
|
|
23
|
+
* Use the kreuzberg_result_* accessor functions to extract data.
|
|
24
|
+
*/
|
|
25
|
+
typedef struct ExtractionResult ExtractionResult;
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
typedef struct Option_ErrorCallback Option_ErrorCallback;
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Memory pool for ExtractionResult objects.
|
|
32
|
+
*
|
|
33
|
+
* Pre-allocates storage and reuses memory across multiple extractions.
|
|
34
|
+
* Thread-safe with internal synchronization.
|
|
35
|
+
*
|
|
36
|
+
* # Memory Model
|
|
37
|
+
*
|
|
38
|
+
* - Results are owned by the pool until reset or freed
|
|
39
|
+
* - Pool grows automatically if capacity is exceeded
|
|
40
|
+
* - Reset clears all results but retains capacity
|
|
41
|
+
* - Free releases all memory and destroys pool
|
|
42
|
+
*
|
|
43
|
+
* # Thread Safety
|
|
44
|
+
*
|
|
45
|
+
* Pool uses internal Mutex for synchronization. Safe for concurrent access
|
|
46
|
+
* but may serialize extractions. For parallel processing, consider using
|
|
47
|
+
* separate pools per thread.
|
|
48
|
+
*/
|
|
49
|
+
typedef struct ResultPool ResultPool;
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Zero-copy view into an ExtractionResult.
|
|
53
|
+
*
|
|
54
|
+
* Provides direct pointers to string data without allocation or copying.
|
|
55
|
+
* All pointers are valid UTF-8 byte slices (not null-terminated).
|
|
56
|
+
*
|
|
57
|
+
* # Lifetime Safety
|
|
58
|
+
*
|
|
59
|
+
* This structure contains borrowed pointers. The caller MUST ensure:
|
|
60
|
+
* - The source `ExtractionResult` outlives this view
|
|
61
|
+
* - No use after the source result is freed with `kreuzberg_result_free()`
|
|
62
|
+
*
|
|
63
|
+
* # Memory Layout
|
|
64
|
+
*
|
|
65
|
+
* Field order: 6 ptr+len pairs (96 bytes) + 5 counts (40 bytes) = 136 bytes on 64-bit systems
|
|
66
|
+
* All pointers are either valid UTF-8 data or NULL (with corresponding len=0).
|
|
67
|
+
*
|
|
68
|
+
* # Thread Safety
|
|
69
|
+
*
|
|
70
|
+
* Views are NOT thread-safe. External synchronization required for concurrent access.
|
|
71
|
+
*/
|
|
72
|
+
typedef struct CExtractionResultView {
|
|
73
|
+
/**
|
|
74
|
+
* Direct pointer to content bytes (UTF-8, not null-terminated)
|
|
75
|
+
*/
|
|
76
|
+
const uint8_t *content_ptr;
|
|
77
|
+
/**
|
|
78
|
+
* Length of content in bytes
|
|
79
|
+
*/
|
|
80
|
+
uintptr_t content_len;
|
|
81
|
+
/**
|
|
82
|
+
* Direct pointer to MIME type bytes (UTF-8, not null-terminated)
|
|
83
|
+
*/
|
|
84
|
+
const uint8_t *mime_type_ptr;
|
|
85
|
+
/**
|
|
86
|
+
* Length of MIME type in bytes
|
|
87
|
+
*/
|
|
88
|
+
uintptr_t mime_type_len;
|
|
89
|
+
/**
|
|
90
|
+
* Direct pointer to language bytes (UTF-8, not null-terminated), or NULL
|
|
91
|
+
*/
|
|
92
|
+
const uint8_t *language_ptr;
|
|
93
|
+
/**
|
|
94
|
+
* Length of language in bytes (0 if NULL)
|
|
95
|
+
*/
|
|
96
|
+
uintptr_t language_len;
|
|
97
|
+
/**
|
|
98
|
+
* Direct pointer to date bytes (UTF-8, not null-terminated), or NULL
|
|
99
|
+
*/
|
|
100
|
+
const uint8_t *date_ptr;
|
|
101
|
+
/**
|
|
102
|
+
* Length of date in bytes (0 if NULL)
|
|
103
|
+
*/
|
|
104
|
+
uintptr_t date_len;
|
|
105
|
+
/**
|
|
106
|
+
* Direct pointer to subject bytes (UTF-8, not null-terminated), or NULL
|
|
107
|
+
*/
|
|
108
|
+
const uint8_t *subject_ptr;
|
|
109
|
+
/**
|
|
110
|
+
* Length of subject in bytes (0 if NULL)
|
|
111
|
+
*/
|
|
112
|
+
uintptr_t subject_len;
|
|
113
|
+
/**
|
|
114
|
+
* Direct pointer to title bytes (UTF-8, not null-terminated), or NULL
|
|
115
|
+
*/
|
|
116
|
+
const uint8_t *title_ptr;
|
|
117
|
+
/**
|
|
118
|
+
* Length of title in bytes (0 if NULL)
|
|
119
|
+
*/
|
|
120
|
+
uintptr_t title_len;
|
|
121
|
+
/**
|
|
122
|
+
* Number of tables extracted
|
|
123
|
+
*/
|
|
124
|
+
uintptr_t table_count;
|
|
125
|
+
/**
|
|
126
|
+
* Number of chunks (0 if chunking not enabled)
|
|
127
|
+
*/
|
|
128
|
+
uintptr_t chunk_count;
|
|
129
|
+
/**
|
|
130
|
+
* Number of detected languages (0 if language detection not enabled)
|
|
131
|
+
*/
|
|
132
|
+
uintptr_t detected_language_count;
|
|
133
|
+
/**
|
|
134
|
+
* Number of extracted images (0 if no images)
|
|
135
|
+
*/
|
|
136
|
+
uintptr_t image_count;
|
|
137
|
+
/**
|
|
138
|
+
* Total page count (0 if not applicable)
|
|
139
|
+
*/
|
|
140
|
+
uintptr_t page_count;
|
|
141
|
+
} CExtractionResultView;
|
|
142
|
+
|
|
143
|
+
/**
|
|
144
|
+
* Callback function invoked for each successfully extracted result.
|
|
145
|
+
*
|
|
146
|
+
* # Arguments
|
|
147
|
+
*
|
|
148
|
+
* * `result` - Borrowed pointer to extraction result (valid only during callback)
|
|
149
|
+
* * `file_index` - Zero-based index of the file in the batch
|
|
150
|
+
* * `user_data` - User-provided context pointer
|
|
151
|
+
*
|
|
152
|
+
* # Returns
|
|
153
|
+
*
|
|
154
|
+
* - `0` to continue processing remaining files
|
|
155
|
+
* - Non-zero to cancel batch processing (no further callbacks)
|
|
156
|
+
*
|
|
157
|
+
* # Safety
|
|
158
|
+
*
|
|
159
|
+
* - `result` pointer is valid only during the callback execution
|
|
160
|
+
* - `result` is automatically freed after callback returns
|
|
161
|
+
* - Caller must copy/serialize data if needed beyond callback scope
|
|
162
|
+
* - `user_data` is passed through opaquely (caller manages lifetime)
|
|
163
|
+
*/
|
|
164
|
+
typedef int (*ResultCallback)(const struct CExtractionResultView *result,
|
|
165
|
+
uintptr_t file_index,
|
|
166
|
+
void *user_data);
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* C-compatible structured error details returned by `kreuzberg_get_error_details()`.
|
|
170
|
+
*
|
|
171
|
+
* All string fields (message, error_type, source_file, source_function, context_info)
|
|
172
|
+
* are dynamically allocated C strings that MUST be freed using `kreuzberg_free_string()`.
|
|
173
|
+
* Set fields are non-NULL; unset fields are NULL.
|
|
174
|
+
*/
|
|
175
|
+
typedef struct CErrorDetails {
|
|
176
|
+
/**
|
|
177
|
+
* The error message (must be freed with kreuzberg_free_string)
|
|
178
|
+
*/
|
|
179
|
+
char *message;
|
|
180
|
+
/**
|
|
181
|
+
* Numeric error code (0-7 for Kreuzberg errors, 1-7 for panic_shield codes)
|
|
182
|
+
*/
|
|
183
|
+
uint32_t error_code;
|
|
184
|
+
/**
|
|
185
|
+
* Human-readable error type name (must be freed with kreuzberg_free_string)
|
|
186
|
+
*/
|
|
187
|
+
char *error_type;
|
|
188
|
+
/**
|
|
189
|
+
* Source file where error occurred (may be NULL)
|
|
190
|
+
*/
|
|
191
|
+
char *source_file;
|
|
192
|
+
/**
|
|
193
|
+
* Source function where error occurred (may be NULL)
|
|
194
|
+
*/
|
|
195
|
+
char *source_function;
|
|
196
|
+
/**
|
|
197
|
+
* Line number in source file (0 if unknown)
|
|
198
|
+
*/
|
|
199
|
+
uint32_t source_line;
|
|
200
|
+
/**
|
|
201
|
+
* Additional context information (may be NULL)
|
|
202
|
+
*/
|
|
203
|
+
char *context_info;
|
|
204
|
+
/**
|
|
205
|
+
* 1 if this error originated from a panic, 0 otherwise
|
|
206
|
+
*/
|
|
207
|
+
int32_t is_panic;
|
|
208
|
+
} CErrorDetails;
|
|
209
|
+
|
|
210
|
+
/**
|
|
211
|
+
* C-compatible extraction result structure
|
|
212
|
+
*
|
|
213
|
+
* This struct must maintain a stable ABI and memory layout for FFI compatibility.
|
|
214
|
+
*
|
|
215
|
+
* # Memory Layout
|
|
216
|
+
*
|
|
217
|
+
* Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
218
|
+
* Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
|
|
219
|
+
*
|
|
220
|
+
* The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
|
|
221
|
+
* - Fields are laid out in order
|
|
222
|
+
* - Padding is added to maintain alignment
|
|
223
|
+
* - The struct has the same size and alignment on all 64-bit platforms
|
|
224
|
+
*
|
|
225
|
+
* # Memory Management
|
|
226
|
+
*
|
|
227
|
+
* All pointer fields are owned by the caller and must be freed using `kreuzberg_free_string`.
|
|
228
|
+
* The struct itself must be freed using `kreuzberg_free_extraction_result`.
|
|
229
|
+
*/
|
|
230
|
+
typedef struct CExtractionResult {
|
|
231
|
+
/**
|
|
232
|
+
* Extracted text content (null-terminated UTF-8 string, must be freed with kreuzberg_free_string)
|
|
233
|
+
*/
|
|
234
|
+
char *content;
|
|
235
|
+
/**
|
|
236
|
+
* Detected MIME type (null-terminated string, must be freed with kreuzberg_free_string)
|
|
237
|
+
*/
|
|
238
|
+
char *mime_type;
|
|
239
|
+
/**
|
|
240
|
+
* Document language (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
241
|
+
*/
|
|
242
|
+
char *language;
|
|
243
|
+
/**
|
|
244
|
+
* Document date (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
245
|
+
*/
|
|
246
|
+
char *date;
|
|
247
|
+
/**
|
|
248
|
+
* Document subject (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
249
|
+
*/
|
|
250
|
+
char *subject;
|
|
251
|
+
/**
|
|
252
|
+
* Tables as JSON array (null-terminated string, or NULL if no tables, must be freed with kreuzberg_free_string)
|
|
253
|
+
*/
|
|
254
|
+
char *tables_json;
|
|
255
|
+
/**
|
|
256
|
+
* Detected languages as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
257
|
+
*/
|
|
258
|
+
char *detected_languages_json;
|
|
259
|
+
/**
|
|
260
|
+
* Metadata as JSON object (null-terminated string, or NULL if no metadata, must be freed with kreuzberg_free_string)
|
|
261
|
+
*/
|
|
262
|
+
char *metadata_json;
|
|
263
|
+
/**
|
|
264
|
+
* Text chunks as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
265
|
+
*/
|
|
266
|
+
char *chunks_json;
|
|
267
|
+
/**
|
|
268
|
+
* Extracted images as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
269
|
+
*/
|
|
270
|
+
char *images_json;
|
|
271
|
+
/**
|
|
272
|
+
* Page structure as JSON object (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
273
|
+
*/
|
|
274
|
+
char *page_structure_json;
|
|
275
|
+
/**
|
|
276
|
+
* Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
277
|
+
*/
|
|
278
|
+
char *pages_json;
|
|
279
|
+
/**
|
|
280
|
+
* Whether extraction was successful
|
|
281
|
+
*/
|
|
282
|
+
bool success;
|
|
283
|
+
/**
|
|
284
|
+
* Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
|
|
285
|
+
*/
|
|
286
|
+
uint8_t _padding1[7];
|
|
287
|
+
} CExtractionResult;
|
|
288
|
+
|
|
289
|
+
/**
|
|
290
|
+
* C-compatible structure for batch extraction results
|
|
291
|
+
*
|
|
292
|
+
* # Memory Layout
|
|
293
|
+
*
|
|
294
|
+
* Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
295
|
+
* Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 bool + 7 bytes padding = 24 bytes total
|
|
296
|
+
*
|
|
297
|
+
* The padding ensures the struct is properly aligned for 64-bit architectures.
|
|
298
|
+
*
|
|
299
|
+
* # Memory Management
|
|
300
|
+
*
|
|
301
|
+
* - The `results` array must be freed using `kreuzberg_free_batch_result`
|
|
302
|
+
* - Each individual result in the array must also be freed
|
|
303
|
+
*/
|
|
304
|
+
typedef struct CBatchResult {
|
|
305
|
+
/**
|
|
306
|
+
* Array of extraction results
|
|
307
|
+
*/
|
|
308
|
+
struct CExtractionResult **results;
|
|
309
|
+
/**
|
|
310
|
+
* Number of results
|
|
311
|
+
*/
|
|
312
|
+
uintptr_t count;
|
|
313
|
+
/**
|
|
314
|
+
* Whether batch operation was successful
|
|
315
|
+
*/
|
|
316
|
+
bool success;
|
|
317
|
+
/**
|
|
318
|
+
* Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
|
|
319
|
+
*/
|
|
320
|
+
uint8_t _padding2[7];
|
|
321
|
+
} CBatchResult;
|
|
322
|
+
|
|
323
|
+
/**
|
|
324
|
+
* C-compatible structure for passing byte array with MIME type in batch operations
|
|
325
|
+
*
|
|
326
|
+
* # Memory Layout
|
|
327
|
+
*
|
|
328
|
+
* Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
329
|
+
* Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 pointer (8 bytes) = 24 bytes total
|
|
330
|
+
*
|
|
331
|
+
* The `#[repr(C)]` attribute ensures consistent memory layout across languages.
|
|
332
|
+
*
|
|
333
|
+
* # Usage
|
|
334
|
+
*
|
|
335
|
+
* This struct is used to pass document data to batch extraction functions. The caller
|
|
336
|
+
* retains ownership of the data and mime_type pointers.
|
|
337
|
+
*/
|
|
338
|
+
typedef struct CBytesWithMime {
|
|
339
|
+
/**
|
|
340
|
+
* Pointer to byte data
|
|
341
|
+
*/
|
|
342
|
+
const uint8_t *data;
|
|
343
|
+
/**
|
|
344
|
+
* Length of byte data
|
|
345
|
+
*/
|
|
346
|
+
uintptr_t data_len;
|
|
347
|
+
/**
|
|
348
|
+
* MIME type as null-terminated C string
|
|
349
|
+
*/
|
|
350
|
+
const char *mime_type;
|
|
351
|
+
} CBytesWithMime;
|
|
352
|
+
|
|
353
|
+
/**
|
|
354
|
+
* Type alias for the DocumentExtractor callback function.
|
|
355
|
+
*
|
|
356
|
+
* # Parameters
|
|
357
|
+
*
|
|
358
|
+
* - `content`: Pointer to document bytes (valid only during the call)
|
|
359
|
+
* - `content_len`: Length of the content in bytes
|
|
360
|
+
* - `mime_type`: Null-terminated MIME type string
|
|
361
|
+
* - `config_json`: Null-terminated JSON configuration string
|
|
362
|
+
*
|
|
363
|
+
* # Returns
|
|
364
|
+
*
|
|
365
|
+
* Null-terminated JSON string containing the ExtractionResult
|
|
366
|
+
* (must be freed by Rust via kreuzberg_free_string), or NULL on error.
|
|
367
|
+
*
|
|
368
|
+
* # Safety
|
|
369
|
+
*
|
|
370
|
+
* The callback must:
|
|
371
|
+
* - Not store the content, mime_type, or config_json pointers (only valid during the call)
|
|
372
|
+
* - Return a valid null-terminated UTF-8 JSON string or NULL on error
|
|
373
|
+
* - The returned string must be freeable by kreuzberg_free_string
|
|
374
|
+
*/
|
|
375
|
+
typedef char *(*DocumentExtractorCallback)(const uint8_t *content,
|
|
376
|
+
uintptr_t content_len,
|
|
377
|
+
const char *mime_type,
|
|
378
|
+
const char *config_json);
|
|
379
|
+
|
|
380
|
+
/**
|
|
381
|
+
* Type alias for the OCR backend callback function.
|
|
382
|
+
*
|
|
383
|
+
* # Parameters
|
|
384
|
+
*
|
|
385
|
+
* - `image_bytes`: Raw image bytes
|
|
386
|
+
* - `image_length`: Length of image data in bytes
|
|
387
|
+
* - `config_json`: JSON-encoded OcrConfig (null-terminated string)
|
|
388
|
+
*
|
|
389
|
+
* # Returns
|
|
390
|
+
*
|
|
391
|
+
* Null-terminated string containing extracted text (must be freed by Rust via kreuzberg_free_string),
|
|
392
|
+
* or NULL on error.
|
|
393
|
+
*
|
|
394
|
+
* # Safety
|
|
395
|
+
*
|
|
396
|
+
* The callback must:
|
|
397
|
+
* - Not store the image_bytes pointer (it's only valid for the duration of the call)
|
|
398
|
+
* - Return a valid null-terminated UTF-8 string allocated by the callback (ownership passes to Rust, which frees it via kreuzberg_free_string)
|
|
399
|
+
* - Return NULL on error (error message should be retrievable separately)
|
|
400
|
+
*/
|
|
401
|
+
typedef char *(*OcrBackendCallback)(const uint8_t *image_bytes,
|
|
402
|
+
uintptr_t image_length,
|
|
403
|
+
const char *config_json);
|
|
404
|
+
|
|
405
|
+
/**
|
|
406
|
+
* Type alias for the PostProcessor callback function.
|
|
407
|
+
*
|
|
408
|
+
* # Parameters
|
|
409
|
+
*
|
|
410
|
+
* - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
|
|
411
|
+
*
|
|
412
|
+
* # Returns
|
|
413
|
+
*
|
|
414
|
+
* Null-terminated JSON string containing the processed ExtractionResult
|
|
415
|
+
* (must be freed by Rust via kreuzberg_free_string), or NULL on error.
|
|
416
|
+
*
|
|
417
|
+
* # Safety
|
|
418
|
+
*
|
|
419
|
+
* The callback must:
|
|
420
|
+
* - Not store the result_json pointer (it's only valid for the duration of the call)
|
|
421
|
+
* - Return a valid null-terminated UTF-8 JSON string allocated by the callback (ownership passes to Rust, which frees it via kreuzberg_free_string)
|
|
422
|
+
* - Return NULL on error (error message should be retrievable separately)
|
|
423
|
+
*/
|
|
424
|
+
typedef char *(*PostProcessorCallback)(const char *result_json);
|
|
425
|
+
|
|
426
|
+
/**
|
|
427
|
+
* Validator callback function type for FFI.
|
|
428
|
+
*
|
|
429
|
+
* This is a C function pointer that validates extraction results.
|
|
430
|
+
*
|
|
431
|
+
* # Safety
|
|
432
|
+
*
|
|
433
|
+
* The callback must:
|
|
434
|
+
* - Not store the result_json pointer (it's only valid for the duration of the call)
|
|
435
|
+
* - Return a valid null-terminated UTF-8 string (error message) if validation fails
|
|
436
|
+
* - Return NULL if validation passes
|
|
437
|
+
* - The returned string must be freeable by kreuzberg_free_string
|
|
438
|
+
*/
|
|
439
|
+
typedef char *(*ValidatorCallback)(const char *result_json);
|
|
440
|
+
|
|
441
|
+
/**
|
|
442
|
+
* Metadata field accessor structure
|
|
443
|
+
*
|
|
444
|
+
* Returned by `kreuzberg_result_get_metadata_field()`. Contains the field value
|
|
445
|
+
* as JSON and information about whether the field exists.
|
|
446
|
+
*
|
|
447
|
+
* # Fields
|
|
448
|
+
*
|
|
449
|
+
* * `name` - The field name requested (does not need to be freed)
|
|
450
|
+
* * `json_value` - JSON representation of the field value, or NULL if field doesn't exist
|
|
451
|
+
* * `is_null` - 1 if the field doesn't exist, 0 if it does
|
|
452
|
+
*
|
|
453
|
+
* The `json_value` pointer (if non-NULL) must be freed with `kreuzberg_free_string()`.
|
|
454
|
+
*/
|
|
455
|
+
typedef struct CMetadataField {
|
|
456
|
+
const char *name;
|
|
457
|
+
char *json_value;
|
|
458
|
+
int32_t is_null;
|
|
459
|
+
} CMetadataField;
|
|
460
|
+
|
|
461
|
+
/**
|
|
462
|
+
* Statistics for result pool allocation tracking.
|
|
463
|
+
*
|
|
464
|
+
* Provides insight into pool efficiency and memory usage patterns.
|
|
465
|
+
*/
|
|
466
|
+
typedef struct CResultPoolStats {
|
|
467
|
+
/**
|
|
468
|
+
* Current number of results stored in pool
|
|
469
|
+
*/
|
|
470
|
+
uintptr_t current_count;
|
|
471
|
+
/**
|
|
472
|
+
* Maximum capacity of pool (before automatic growth)
|
|
473
|
+
*/
|
|
474
|
+
uintptr_t capacity;
|
|
475
|
+
/**
|
|
476
|
+
* Total number of allocations (successful extractions)
|
|
477
|
+
*/
|
|
478
|
+
uintptr_t total_allocations;
|
|
479
|
+
/**
|
|
480
|
+
* Number of times pool capacity was exceeded (triggered growth)
|
|
481
|
+
*/
|
|
482
|
+
uintptr_t growth_events;
|
|
483
|
+
/**
|
|
484
|
+
* Estimated memory used by results in bytes
|
|
485
|
+
*/
|
|
486
|
+
uintptr_t estimated_memory_bytes;
|
|
487
|
+
} CResultPoolStats;
|
|
488
|
+
|
|
489
|
+
/**
|
|
490
|
+
* Statistics for string interning efficiency tracking.
|
|
491
|
+
*/
|
|
492
|
+
typedef struct CStringInternStats {
|
|
493
|
+
/**
|
|
494
|
+
* Number of unique strings currently interned
|
|
495
|
+
*/
|
|
496
|
+
uintptr_t unique_count;
|
|
497
|
+
/**
|
|
498
|
+
* Total number of intern requests
|
|
499
|
+
*/
|
|
500
|
+
uintptr_t total_requests;
|
|
501
|
+
/**
|
|
502
|
+
* Number of cache hits (string already interned)
|
|
503
|
+
*/
|
|
504
|
+
uintptr_t cache_hits;
|
|
505
|
+
/**
|
|
506
|
+
* Number of cache misses (new string added)
|
|
507
|
+
*/
|
|
508
|
+
uintptr_t cache_misses;
|
|
509
|
+
/**
|
|
510
|
+
* Estimated memory saved by deduplication (bytes)
|
|
511
|
+
*/
|
|
512
|
+
uintptr_t estimated_memory_saved;
|
|
513
|
+
/**
|
|
514
|
+
* Total memory used by interned strings (bytes)
|
|
515
|
+
*/
|
|
516
|
+
uintptr_t total_memory_bytes;
|
|
517
|
+
} CStringInternStats;
|
|
518
|
+
|
|
519
|
+
/**
|
|
520
|
+
* Extract multiple files in streaming mode with callback-based result delivery.
|
|
521
|
+
*
|
|
522
|
+
* Processes files one at a time without accumulating results in memory.
|
|
523
|
+
* Each result is passed to the callback and then freed automatically.
|
|
524
|
+
*
|
|
525
|
+
* # Arguments
|
|
526
|
+
*
|
|
527
|
+
* * `files` - Array of null-terminated file path strings
|
|
528
|
+
* * `count` - Number of files in the array
|
|
529
|
+
* * `config_json` - Optional JSON configuration string (NULL for defaults)
|
|
530
|
+
* * `result_callback` - Callback invoked for each successful extraction
|
|
531
|
+
* * `user_data` - Optional user context passed to callbacks
|
|
532
|
+
* * `error_callback` - Optional callback invoked for extraction failures
|
|
533
|
+
*
|
|
534
|
+
* # Returns
|
|
535
|
+
*
|
|
536
|
+
* - `0` on success (all files processed or cancelled by callback)
|
|
537
|
+
* - `-1` on error (invalid arguments, configuration parsing failure)
|
|
538
|
+
*
|
|
539
|
+
* # Error Handling
|
|
540
|
+
*
|
|
541
|
+
* - Individual file failures invoke `error_callback` but don't stop processing
|
|
542
|
+
* - Callback can return non-zero to cancel remaining files
|
|
543
|
+
* - Invalid arguments or config parsing errors return `-1` immediately
|
|
544
|
+
*
|
|
545
|
+
* # Safety
|
|
546
|
+
*
|
|
547
|
+
* - `files` must point to a valid array of `count` C string pointers
|
|
548
|
+
* - All file path strings must be valid null-terminated UTF-8
|
|
549
|
+
* - `config_json` must be valid null-terminated UTF-8 if not NULL
|
|
550
|
+
* - `result_callback` must be a valid function pointer
|
|
551
|
+
* - `error_callback` must be a valid function pointer if not NULL
|
|
552
|
+
* - Result pointers passed to callbacks are valid only during callback
|
|
553
|
+
* - Callbacks must not store result pointers for later use
|
|
554
|
+
*
|
|
555
|
+
* # Example (C)
|
|
556
|
+
*
|
|
557
|
+
* ```c
|
|
558
|
+
* int process_result(const CExtractionResultView* result, size_t index, void* data) {
|
|
559
|
+
* // Copy data needed beyond callback scope
|
|
560
|
+
* char content[1024];
|
|
561
|
+
* size_t copy_len = result->content_len < 1024 ? result->content_len : 1023;
|
|
562
|
+
* memcpy(content, result->content_ptr, copy_len);
|
|
563
|
+
* content[copy_len] = '\0';
|
|
564
|
+
* return 0; // Continue
|
|
565
|
+
* }
|
|
566
|
+
*
|
|
567
|
+
* void handle_error(size_t index, const char* msg, void* data) {
|
|
568
|
+
* fprintf(stderr, "File %zu failed: %s\n", index, msg);
|
|
569
|
+
* }
|
|
570
|
+
*
|
|
571
|
+
* const char* files[] = {"a.pdf", "b.txt", "c.docx"};
|
|
572
|
+
* kreuzberg_extract_batch_streaming(files, 3, NULL, process_result, NULL, handle_error);
|
|
573
|
+
* ```
|
|
574
|
+
*/
|
|
575
|
+
int kreuzberg_extract_batch_streaming(const char *const *files,
|
|
576
|
+
uintptr_t count,
|
|
577
|
+
const char *config_json,
|
|
578
|
+
ResultCallback result_callback,
|
|
579
|
+
void *user_data,
|
|
580
|
+
struct Option_ErrorCallback error_callback);
|
|
581
|
+
|
|
582
|
+
/**
|
|
583
|
+
* Extract multiple files in parallel streaming mode.
|
|
584
|
+
*
|
|
585
|
+
* Similar to `kreuzberg_extract_batch_streaming` but processes files in parallel
|
|
586
|
+
* using a thread pool. Results are delivered via callback as they complete.
|
|
587
|
+
*
|
|
588
|
+
* # Arguments
|
|
589
|
+
*
|
|
590
|
+
* * `files` - Array of null-terminated file path strings
|
|
591
|
+
* * `count` - Number of files in the array
|
|
592
|
+
* * `config_json` - Optional JSON configuration string (NULL for defaults)
|
|
593
|
+
* * `result_callback` - Thread-safe callback invoked for each successful extraction
|
|
594
|
+
* * `user_data` - Optional user context passed to callbacks (must be thread-safe)
|
|
595
|
+
* * `error_callback` - Optional thread-safe callback invoked for failures
|
|
596
|
+
* * `max_parallel` - Maximum number of parallel extractions (0 = number of CPUs)
|
|
597
|
+
*
|
|
598
|
+
* # Returns
|
|
599
|
+
*
|
|
600
|
+
* - `0` on success (all files processed or cancelled)
|
|
601
|
+
* - `-1` on error (invalid arguments, configuration parsing failure)
|
|
602
|
+
*
|
|
603
|
+
* # Thread Safety
|
|
604
|
+
*
|
|
605
|
+
* - Both callbacks may be invoked concurrently from multiple threads
|
|
606
|
+
* - `user_data` must be thread-safe (e.g., synchronized with mutex)
|
|
607
|
+
* - A callback can set an atomic flag to signal cancellation
|
|
608
|
+
*
|
|
609
|
+
* # Safety
|
|
610
|
+
*
|
|
611
|
+
* Same requirements as `kreuzberg_extract_batch_streaming`, plus:
|
|
612
|
+
* - Callbacks must be thread-safe
|
|
613
|
+
* - `user_data` must support concurrent access
|
|
614
|
+
*
|
|
615
|
+
* # Example (C)
|
|
616
|
+
*
|
|
617
|
+
* ```c
|
|
618
|
+
* typedef struct {
|
|
619
|
+
* pthread_mutex_t lock;
|
|
620
|
+
* atomic_int cancel_flag;
|
|
621
|
+
* } BatchContext;
|
|
622
|
+
*
|
|
623
|
+
* int process_result(const CExtractionResultView* result, size_t index, void* data) {
|
|
624
|
+
* BatchContext* ctx = (BatchContext*)data;
|
|
625
|
+
* pthread_mutex_lock(&ctx->lock);
|
|
626
|
+
* // Process result with thread safety
|
|
627
|
+
* pthread_mutex_unlock(&ctx->lock);
|
|
628
|
+
* return atomic_load(&ctx->cancel_flag);
|
|
629
|
+
* }
|
|
630
|
+
* ```
|
|
631
|
+
*/
|
|
632
|
+
int kreuzberg_extract_batch_parallel(const char *const *files,
|
|
633
|
+
uintptr_t count,
|
|
634
|
+
const char *config_json,
|
|
635
|
+
ResultCallback result_callback,
|
|
636
|
+
void *user_data,
|
|
637
|
+
struct Option_ErrorCallback error_callback,
|
|
638
|
+
uintptr_t max_parallel);
|
|
639
|
+
|
|
640
|
+
/**
|
|
641
|
+
* Parse an ExtractionConfig from a JSON string.
|
|
642
|
+
*
|
|
643
|
+
* This is the primary FFI entry point for all language bindings to parse
|
|
644
|
+
* configuration from JSON. Replaces the need for each binding to implement
|
|
645
|
+
* its own JSON parsing logic.
|
|
646
|
+
*
|
|
647
|
+
* # Arguments
|
|
648
|
+
*
|
|
649
|
+
* * `json_config` - Null-terminated C string containing JSON configuration
|
|
650
|
+
*
|
|
651
|
+
* # Returns
|
|
652
|
+
*
|
|
653
|
+
* A pointer to an ExtractionConfig struct that MUST be freed with
|
|
654
|
+
* `kreuzberg_config_free`, or NULL on error (check kreuzberg_last_error).
|
|
655
|
+
*
|
|
656
|
+
* # Safety
|
|
657
|
+
*
|
|
658
|
+
* - `json_config` must be a valid null-terminated C string
|
|
659
|
+
* - The returned pointer must be freed with `kreuzberg_config_free`
|
|
660
|
+
* - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
|
|
661
|
+
*
|
|
662
|
+
* # Example (C)
|
|
663
|
+
*
|
|
664
|
+
* ```c
|
|
665
|
+
* const char* config_json = "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}";
|
|
666
|
+
* ExtractionConfig* config = kreuzberg_config_from_json(config_json);
|
|
667
|
+
* if (config == NULL) {
|
|
668
|
+
* printf("Error: %s\n", kreuzberg_last_error());
|
|
669
|
+
* return 1;
|
|
670
|
+
* }
|
|
671
|
+
*
|
|
672
|
+
* // Use config...
|
|
673
|
+
* // char* result = kreuzberg_extract_file_with_config("doc.pdf", config);
|
|
674
|
+
*
|
|
675
|
+
* kreuzberg_config_free(config);
|
|
676
|
+
* ```
|
|
677
|
+
*/
|
|
678
|
+
ExtractionConfig *kreuzberg_config_from_json(const char *json_config);
|
|
679
|
+
|
|
680
|
+
/**
|
|
681
|
+
* Free an ExtractionConfig allocated by kreuzberg_config_from_json or similar.
|
|
682
|
+
*
|
|
683
|
+
* # Safety
|
|
684
|
+
*
|
|
685
|
+
* - `config` must be a pointer previously returned by a config creation function
|
|
686
|
+
* - `config` can be NULL (no-op)
|
|
687
|
+
* - `config` must not be used after this call
|
|
688
|
+
*
|
|
689
|
+
* # Example (C)
|
|
690
|
+
*
|
|
691
|
+
* ```c
|
|
692
|
+
* ExtractionConfig* config = kreuzberg_config_from_json("{...}");
|
|
693
|
+
* if (config != NULL) {
|
|
694
|
+
* // Use config...
|
|
695
|
+
* kreuzberg_config_free(config);
|
|
696
|
+
* }
|
|
697
|
+
* ```
|
|
698
|
+
*/
|
|
699
|
+
void kreuzberg_config_free(ExtractionConfig *config);
|
|
700
|
+
|
|
701
|
+
/**
|
|
702
|
+
* Check whether a JSON config string is valid without producing an ExtractionConfig.
|
|
703
|
+
*
|
|
704
|
+
* This function checks if a JSON config string is valid and would parse correctly,
|
|
705
|
+
* without allocating the full ExtractionConfig structure. Useful for validation
|
|
706
|
+
* before committing to parsing.
|
|
707
|
+
*
|
|
708
|
+
* # Arguments
|
|
709
|
+
*
|
|
710
|
+
* * `json_config` - Null-terminated C string containing JSON configuration
|
|
711
|
+
*
|
|
712
|
+
* # Returns
|
|
713
|
+
*
|
|
714
|
+
* - 1 if valid (would parse successfully)
|
|
715
|
+
* - 0 if invalid (check `kreuzberg_last_error` for details)
|
|
716
|
+
*
|
|
717
|
+
* # Safety
|
|
718
|
+
*
|
|
719
|
+
* - `json_config` must be a valid null-terminated C string
|
|
720
|
+
*
|
|
721
|
+
* # Example (C)
|
|
722
|
+
*
|
|
723
|
+
* ```c
|
|
724
|
+
* const char* config_json = "{\"use_cache\": true}";
|
|
725
|
+
* if (kreuzberg_config_is_valid(config_json)) {
|
|
726
|
+
* ExtractionConfig* config = kreuzberg_config_from_json(config_json);
|
|
727
|
+
* // Use config...
|
|
728
|
+
* kreuzberg_config_free(config);
|
|
729
|
+
* } else {
|
|
730
|
+
* printf("Invalid config: %s\n", kreuzberg_last_error());
|
|
731
|
+
* }
|
|
732
|
+
* ```
|
|
733
|
+
*/
|
|
734
|
+
int32_t kreuzberg_config_is_valid(const char *json_config);
|
|
735
|
+
|
|
736
|
+
/**
|
|
737
|
+
* Serialize an ExtractionConfig to JSON string.
|
|
738
|
+
*
|
|
739
|
+
* Converts an ExtractionConfig structure to its JSON representation, allowing
|
|
740
|
+
* bindings to serialize configs without reimplementing serialization logic.
|
|
741
|
+
*
|
|
742
|
+
* # Arguments
|
|
743
|
+
*
|
|
744
|
+
* * `config` - Pointer to an ExtractionConfig structure
|
|
745
|
+
*
|
|
746
|
+
* # Returns
|
|
747
|
+
*
|
|
748
|
+
* A pointer to a C string containing JSON that MUST be freed with `kreuzberg_free_string`.
|
|
749
|
+
* Returns NULL on error (check `kreuzberg_last_error`).
|
|
750
|
+
*
|
|
751
|
+
* # Safety
|
|
752
|
+
*
|
|
753
|
+
* - `config` must be a valid pointer to an ExtractionConfig
|
|
754
|
+
* - `config` cannot be NULL
|
|
755
|
+
* - The returned pointer must be freed with `kreuzberg_free_string`
|
|
756
|
+
*
|
|
757
|
+
* # Example (C)
|
|
758
|
+
*
|
|
759
|
+
* ```c
|
|
760
|
+
* ExtractionConfig* config = kreuzberg_config_from_json("{\"use_cache\": true}");
|
|
761
|
+
* if (config != NULL) {
|
|
762
|
+
* char* json = kreuzberg_config_to_json(config);
|
|
763
|
+
* if (json != NULL) {
|
|
764
|
+
* printf("Serialized: %s\n", json);
|
|
765
|
+
* kreuzberg_free_string(json);
|
|
766
|
+
* }
|
|
767
|
+
* kreuzberg_config_free(config);
|
|
768
|
+
* }
|
|
769
|
+
* ```
|
|
770
|
+
*/
|
|
771
|
+
char *kreuzberg_config_to_json(const ExtractionConfig *config);
|
|
772
|
+
|
|
773
|
+
/**
|
|
774
|
+
* Get a specific field from config as JSON string.
|
|
775
|
+
*
|
|
776
|
+
* Retrieves a nested field from the configuration by path and returns its JSON
|
|
777
|
+
* representation. Supports dot notation for nested fields (e.g., "ocr.backend").
|
|
778
|
+
*
|
|
779
|
+
* # Arguments
|
|
780
|
+
*
|
|
781
|
+
* * `config` - Pointer to an ExtractionConfig structure
|
|
782
|
+
* * `field_name` - Null-terminated C string with field path (e.g., "use_cache", "ocr.backend")
|
|
783
|
+
*
|
|
784
|
+
* # Returns
|
|
785
|
+
*
|
|
786
|
+
* A pointer to a C string containing the field value as JSON, or NULL if:
|
|
787
|
+
* - The field doesn't exist
|
|
788
|
+
* - An error occurs during serialization
|
|
789
|
+
*
|
|
790
|
+
* The returned pointer (if non-NULL) must be freed with `kreuzberg_free_string`.
|
|
791
|
+
*
|
|
792
|
+
* # Safety
|
|
793
|
+
*
|
|
794
|
+
* - `config` must be a valid pointer to an ExtractionConfig
|
|
795
|
+
* - `field_name` must be a valid null-terminated C string
|
|
796
|
+
* - Neither parameter can be NULL
|
|
797
|
+
*
|
|
798
|
+
* # Example (C)
|
|
799
|
+
*
|
|
800
|
+
* ```c
|
|
801
|
+
* ExtractionConfig* config = kreuzberg_config_from_json(
|
|
802
|
+
* "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}"
|
|
803
|
+
* );
|
|
804
|
+
* if (config != NULL) {
|
|
805
|
+
* char* use_cache = kreuzberg_config_get_field(config, "use_cache");
|
|
806
|
+
* char* backend = kreuzberg_config_get_field(config, "ocr.backend");
|
|
807
|
+
*
|
|
808
|
+
* if (use_cache != NULL) {
|
|
809
|
+
* printf("use_cache: %s\n", use_cache);
|
|
810
|
+
* kreuzberg_free_string(use_cache);
|
|
811
|
+
* }
|
|
812
|
+
*
|
|
813
|
+
* if (backend != NULL) {
|
|
814
|
+
* printf("backend: %s\n", backend);
|
|
815
|
+
* kreuzberg_free_string(backend);
|
|
816
|
+
* }
|
|
817
|
+
*
|
|
818
|
+
* kreuzberg_config_free(config);
|
|
819
|
+
* }
|
|
820
|
+
* ```
|
|
821
|
+
*/
|
|
822
|
+
char *kreuzberg_config_get_field(const ExtractionConfig *config, const char *field_name);
|
|
823
|
+
|
|
824
|
+
/**
|
|
825
|
+
* Merge two configs (override takes precedence over base).
|
|
826
|
+
*
|
|
827
|
+
* Performs a shallow merge of two ExtractionConfig structures, where fields
|
|
828
|
+
* from `override_config` take precedence over fields in `base`. The `base`
|
|
829
|
+
* config is modified in-place.
|
|
830
|
+
*
|
|
831
|
+
* # Arguments
|
|
832
|
+
*
|
|
833
|
+
* * `base` - Pointer to the base ExtractionConfig (will be modified)
|
|
834
|
+
* * `override_config` - Pointer to the override ExtractionConfig (read-only)
|
|
835
|
+
*
|
|
836
|
+
* # Returns
|
|
837
|
+
*
|
|
838
|
+
* - 1 on success
|
|
839
|
+
* - 0 on error (check `kreuzberg_last_error`)
|
|
840
|
+
*
|
|
841
|
+
* # Safety
|
|
842
|
+
*
|
|
843
|
+
* - `base` must be a valid mutable pointer to an ExtractionConfig
|
|
844
|
+
* - `override_config` must be a valid pointer to an ExtractionConfig
|
|
845
|
+
* - Neither parameter can be NULL
|
|
846
|
+
* - `base` is modified in-place
|
|
847
|
+
*
|
|
848
|
+
* # Example (C)
|
|
849
|
+
*
|
|
850
|
+
* ```c
|
|
851
|
+
* ExtractionConfig* base = kreuzberg_config_from_json(
|
|
852
|
+
* "{\"use_cache\": true, \"force_ocr\": false}"
|
|
853
|
+
* );
|
|
854
|
+
* ExtractionConfig* override = kreuzberg_config_from_json(
|
|
855
|
+
* "{\"force_ocr\": true}"
|
|
856
|
+
* );
|
|
857
|
+
*
|
|
858
|
+
* if (kreuzberg_config_merge(base, override) == 1) {
|
|
859
|
+
* // base now has: use_cache=true, force_ocr=true
|
|
860
|
+
* char* json = kreuzberg_config_to_json(base);
|
|
861
|
+
* printf("Merged config: %s\n", json);
|
|
862
|
+
* kreuzberg_free_string(json);
|
|
863
|
+
* }
|
|
864
|
+
*
|
|
865
|
+
* kreuzberg_config_free(base);
|
|
866
|
+
* kreuzberg_config_free(override);
|
|
867
|
+
* ```
|
|
868
|
+
*/
|
|
869
|
+
int32_t kreuzberg_config_merge(ExtractionConfig *base, const ExtractionConfig *override_config);
|
|
870
|
+
|
|
871
|
+
/**
|
|
872
|
+
* Load an ExtractionConfig from a file.
|
|
873
|
+
*
|
|
874
|
+
* Returns a JSON string representing the loaded configuration.
|
|
875
|
+
*
|
|
876
|
+
* # Safety
|
|
877
|
+
*
|
|
878
|
+
* - `file_path` must be a valid null-terminated C string
|
|
879
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
880
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
881
|
+
*/
|
|
882
|
+
char *kreuzberg_load_extraction_config_from_file(const char *file_path);
|
|
883
|
+
|
|
884
|
+
/**
|
|
885
|
+
* Load an ExtractionConfig from a file (returns pointer to config struct).
|
|
886
|
+
*
|
|
887
|
+
* # Safety
|
|
888
|
+
*
|
|
889
|
+
* - `path` must be a valid null-terminated C string
|
|
890
|
+
* - The returned pointer must be freed with `kreuzberg_config_free`
|
|
891
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
892
|
+
*
|
|
893
|
+
* # Example (C)
|
|
894
|
+
*
|
|
895
|
+
* ```c
|
|
896
|
+
* ExtractionConfig* config = kreuzberg_config_from_file("config.toml");
|
|
897
|
+
* if (config == NULL) {
|
|
898
|
+
* printf("Error: %s\n", kreuzberg_last_error());
|
|
899
|
+
* return 1;
|
|
900
|
+
* }
|
|
901
|
+
* kreuzberg_config_free(config);
|
|
902
|
+
* ```
|
|
903
|
+
*/
|
|
904
|
+
ExtractionConfig *kreuzberg_config_from_file(const char *path);
|
|
905
|
+
|
|
906
|
+
/**
|
|
907
|
+
* Discover and load an ExtractionConfig by searching parent directories.
|
|
908
|
+
*
|
|
909
|
+
* Searches the current directory and all parent directories for:
|
|
910
|
+
* - `kreuzberg.toml`
|
|
911
|
+
* - `kreuzberg.json`
|
|
912
|
+
*
|
|
913
|
+
* Returns the first config file found as a JSON string.
|
|
914
|
+
*
|
|
915
|
+
* # Safety
|
|
916
|
+
*
|
|
917
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
918
|
+
* - Returns NULL if no config is found or on error
|
|
919
|
+
*
|
|
920
|
+
* # Example (C)
|
|
921
|
+
*
|
|
922
|
+
* ```c
|
|
923
|
+
* char* config_json = kreuzberg_config_discover();
|
|
924
|
+
* if (config_json != NULL) {
|
|
925
|
+
* printf("Discovered config: %s\n", config_json);
|
|
926
|
+
* kreuzberg_free_string(config_json);
|
|
927
|
+
* }
|
|
928
|
+
* ```
|
|
929
|
+
*/
|
|
930
|
+
char *kreuzberg_config_discover(void);
|
|
931
|
+
|
|
932
|
+
/**
|
|
933
|
+
* List available embedding preset names.
|
|
934
|
+
*
|
|
935
|
+
* # Safety
|
|
936
|
+
*
|
|
937
|
+
* - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
|
|
938
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
939
|
+
*/
|
|
940
|
+
char *kreuzberg_list_embedding_presets(void);
|
|
941
|
+
|
|
942
|
+
/**
|
|
943
|
+
* Get a specific embedding preset by name.
|
|
944
|
+
*
|
|
945
|
+
* # Safety
|
|
946
|
+
*
|
|
947
|
+
* - `name` must be a valid null-terminated C string
|
|
948
|
+
* - Returned string is a JSON object and must be freed with `kreuzberg_free_string`
|
|
949
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
950
|
+
*/
|
|
951
|
+
char *kreuzberg_get_embedding_preset(const char *name);
|
|
952
|
+
|
|
953
|
+
/**
|
|
954
|
+
* Returns the validation error code (0).
|
|
955
|
+
*
|
|
956
|
+
* # C Signature
|
|
957
|
+
*
|
|
958
|
+
* ```c
|
|
959
|
+
* uint32_t kreuzberg_error_code_validation(void);
|
|
960
|
+
* ```
|
|
961
|
+
*/
|
|
962
|
+
uint32_t kreuzberg_error_code_validation(void);
|
|
963
|
+
|
|
964
|
+
/**
|
|
965
|
+
* Returns the parsing error code (1).
|
|
966
|
+
*
|
|
967
|
+
* # C Signature
|
|
968
|
+
*
|
|
969
|
+
* ```c
|
|
970
|
+
* uint32_t kreuzberg_error_code_parsing(void);
|
|
971
|
+
* ```
|
|
972
|
+
*/
|
|
973
|
+
uint32_t kreuzberg_error_code_parsing(void);
|
|
974
|
+
|
|
975
|
+
/**
|
|
976
|
+
* Returns the OCR error code (2).
|
|
977
|
+
*
|
|
978
|
+
* # C Signature
|
|
979
|
+
*
|
|
980
|
+
* ```c
|
|
981
|
+
* uint32_t kreuzberg_error_code_ocr(void);
|
|
982
|
+
* ```
|
|
983
|
+
*/
|
|
984
|
+
uint32_t kreuzberg_error_code_ocr(void);
|
|
985
|
+
|
|
986
|
+
/**
|
|
987
|
+
* Returns the missing dependency error code (3).
|
|
988
|
+
*
|
|
989
|
+
* # C Signature
|
|
990
|
+
*
|
|
991
|
+
* ```c
|
|
992
|
+
* uint32_t kreuzberg_error_code_missing_dependency(void);
|
|
993
|
+
* ```
|
|
994
|
+
*/
|
|
995
|
+
uint32_t kreuzberg_error_code_missing_dependency(void);
|
|
996
|
+
|
|
997
|
+
/**
|
|
998
|
+
* Returns the I/O error code (4).
|
|
999
|
+
*
|
|
1000
|
+
* # C Signature
|
|
1001
|
+
*
|
|
1002
|
+
* ```c
|
|
1003
|
+
* uint32_t kreuzberg_error_code_io(void);
|
|
1004
|
+
* ```
|
|
1005
|
+
*/
|
|
1006
|
+
uint32_t kreuzberg_error_code_io(void);
|
|
1007
|
+
|
|
1008
|
+
/**
|
|
1009
|
+
* Returns the plugin error code (5).
|
|
1010
|
+
*
|
|
1011
|
+
* # C Signature
|
|
1012
|
+
*
|
|
1013
|
+
* ```c
|
|
1014
|
+
* uint32_t kreuzberg_error_code_plugin(void);
|
|
1015
|
+
* ```
|
|
1016
|
+
*/
|
|
1017
|
+
uint32_t kreuzberg_error_code_plugin(void);
|
|
1018
|
+
|
|
1019
|
+
/**
|
|
1020
|
+
* Returns the unsupported format error code (6).
|
|
1021
|
+
*
|
|
1022
|
+
* # C Signature
|
|
1023
|
+
*
|
|
1024
|
+
* ```c
|
|
1025
|
+
* uint32_t kreuzberg_error_code_unsupported_format(void);
|
|
1026
|
+
* ```
|
|
1027
|
+
*/
|
|
1028
|
+
uint32_t kreuzberg_error_code_unsupported_format(void);
|
|
1029
|
+
|
|
1030
|
+
/**
|
|
1031
|
+
* Returns the internal error code (7).
|
|
1032
|
+
*
|
|
1033
|
+
* # C Signature
|
|
1034
|
+
*
|
|
1035
|
+
* ```c
|
|
1036
|
+
* uint32_t kreuzberg_error_code_internal(void);
|
|
1037
|
+
* ```
|
|
1038
|
+
*/
|
|
1039
|
+
uint32_t kreuzberg_error_code_internal(void);
|
|
1040
|
+
|
|
1041
|
+
/**
|
|
1042
|
+
* Returns the total count of valid error codes.
|
|
1043
|
+
*
|
|
1044
|
+
* Currently 8 error codes (0-7). This helps bindings validate error codes.
|
|
1045
|
+
*
|
|
1046
|
+
* # C Signature
|
|
1047
|
+
*
|
|
1048
|
+
* ```c
|
|
1049
|
+
* uint32_t kreuzberg_error_code_count(void);
|
|
1050
|
+
* ```
|
|
1051
|
+
*/
|
|
1052
|
+
uint32_t kreuzberg_error_code_count(void);
|
|
1053
|
+
|
|
1054
|
+
/**
|
|
1055
|
+
* Returns the name of an error code as a C string.
|
|
1056
|
+
*
|
|
1057
|
+
* # Arguments
|
|
1058
|
+
*
|
|
1059
|
+
* - `code`: Numeric error code (0-7)
|
|
1060
|
+
*
|
|
1061
|
+
* # Returns
|
|
1062
|
+
*
|
|
1063
|
+
* Pointer to a null-terminated C string with the error name (e.g., "validation", "ocr").
|
|
1064
|
+
* Returns a pointer to "unknown" if the code is invalid.
|
|
1065
|
+
*
|
|
1066
|
+
* The returned pointer is valid for the lifetime of the program and should not be freed.
|
|
1067
|
+
*
|
|
1068
|
+
* # Examples
|
|
1069
|
+
*
|
|
1070
|
+
* ```c
|
|
1071
|
+
* const char* name = kreuzberg_error_code_name(0);
|
|
1072
|
+
* printf("%s\n", name); // prints: validation
|
|
1073
|
+
* ```
|
|
1074
|
+
*
|
|
1075
|
+
* # C Signature
|
|
1076
|
+
*
|
|
1077
|
+
* ```c
|
|
1078
|
+
* const char* kreuzberg_error_code_name(uint32_t code);
|
|
1079
|
+
* ```
|
|
1080
|
+
*/
|
|
1081
|
+
const char *kreuzberg_error_code_name(uint32_t code);
|
|
1082
|
+
|
|
1083
|
+
/**
|
|
1084
|
+
* Returns the description of an error code as a C string.
|
|
1085
|
+
*
|
|
1086
|
+
* # Arguments
|
|
1087
|
+
*
|
|
1088
|
+
* - `code`: Numeric error code (0-7)
|
|
1089
|
+
*
|
|
1090
|
+
* # Returns
|
|
1091
|
+
*
|
|
1092
|
+
* Pointer to a null-terminated C string with a description (e.g., "Input validation error").
|
|
1093
|
+
* Returns a pointer to "Unknown error code" if the code is invalid.
|
|
1094
|
+
*
|
|
1095
|
+
* The returned pointer is valid for the lifetime of the program and should not be freed.
|
|
1096
|
+
*
|
|
1097
|
+
* # C Signature
|
|
1098
|
+
*
|
|
1099
|
+
* ```c
|
|
1100
|
+
* const char* kreuzberg_error_code_description(uint32_t code);
|
|
1101
|
+
* ```
|
|
1102
|
+
*/
|
|
1103
|
+
const char *kreuzberg_error_code_description(uint32_t code);
|
|
1104
|
+
|
|
1105
|
+
/**
|
|
1106
|
+
* Retrieves detailed error information from the thread-local error storage.
|
|
1107
|
+
*
|
|
1108
|
+
* Returns structured error details including message, code, type, and source location.
|
|
1109
|
+
* This function queries the error state captured by FFI functions and provides
|
|
1110
|
+
* comprehensive error information for binding implementations.
|
|
1111
|
+
*
|
|
1112
|
+
* # Returns
|
|
1113
|
+
*
|
|
1114
|
+
* A `CErrorDetails` structure with the following characteristics:
|
|
1115
|
+
* - All non-NULL string pointers must be freed with `kreuzberg_free_string()`
|
|
1116
|
+
* - NULL pointers indicate the field is not available
|
|
1117
|
+
* - `error_code` is a numeric code (0-7)
|
|
1118
|
+
* - `source_line` is 0 if unknown
|
|
1119
|
+
* - `is_panic` is 1 if error originated from a panic, 0 otherwise
|
|
1120
|
+
*
|
|
1121
|
+
* # Thread Safety
|
|
1122
|
+
*
|
|
1123
|
+
* This function is thread-safe. Each thread has its own error storage.
|
|
1124
|
+
*
|
|
1125
|
+
* # Example (C)
|
|
1126
|
+
*
|
|
1127
|
+
* ```c
|
|
1128
|
+
* CErrorDetails details = kreuzberg_get_error_details();
|
|
1129
|
+
* printf("Error: %s (code=%u, type=%s)\n", details.message, details.error_code, details.error_type);
|
|
1130
|
+
* if (details.source_file != NULL) {
|
|
1131
|
+
* printf(" at %s:%u in %s\n", details.source_file, details.source_line, details.source_function);
|
|
1132
|
+
* }
|
|
1133
|
+
* kreuzberg_free_string(details.message);
|
|
1134
|
+
* kreuzberg_free_string(details.error_type);
|
|
1135
|
+
* if (details.source_file != NULL) kreuzberg_free_string(details.source_file);
|
|
1136
|
+
* if (details.source_function != NULL) kreuzberg_free_string(details.source_function);
|
|
1137
|
+
* if (details.context_info != NULL) kreuzberg_free_string(details.context_info);
|
|
1138
|
+
* ```
|
|
1139
|
+
*
|
|
1140
|
+
* # C Signature
|
|
1141
|
+
*
|
|
1142
|
+
* ```c
|
|
1143
|
+
* typedef struct {
|
|
1144
|
+
* char* message;
|
|
1145
|
+
* uint32_t error_code;
|
|
1146
|
+
* char* error_type;
|
|
1147
|
+
* char* source_file;
|
|
1148
|
+
* char* source_function;
|
|
1149
|
+
* uint32_t source_line;
|
|
1150
|
+
* char* context_info;
|
|
1151
|
+
* int is_panic;
|
|
1152
|
+
* } CErrorDetails;
|
|
1153
|
+
*
|
|
1154
|
+
* CErrorDetails kreuzberg_get_error_details(void);
|
|
1155
|
+
* ```
|
|
1156
|
+
*/
|
|
1157
|
+
struct CErrorDetails kreuzberg_get_error_details(void);
|
|
1158
|
+
|
|
1159
|
+
/**
|
|
1160
|
+
* Classifies an error based on the error message string.
|
|
1161
|
+
*
|
|
1162
|
+
* Analyzes an error message and attempts to classify it into one of the standard
|
|
1163
|
+
* Kreuzberg error codes (0-7). This is useful for converting error messages from
|
|
1164
|
+
* external libraries or system calls into Kreuzberg error categories.
|
|
1165
|
+
*
|
|
1166
|
+
* # Arguments
|
|
1167
|
+
*
|
|
1168
|
+
* - `error_message`: Pointer to a null-terminated C string with the error message
|
|
1169
|
+
*
|
|
1170
|
+
* # Returns
|
|
1171
|
+
*
|
|
1172
|
+
* Numeric error code (0-7) indicating the most likely error classification.
|
|
1173
|
+
* Returns 7 (Internal) if the message cannot be reliably classified.
|
|
1174
|
+
*
|
|
1175
|
+
* # Classification Rules
|
|
1176
|
+
*
|
|
1177
|
+
* The classifier looks for common keywords and patterns:
|
|
1178
|
+
* - **0 (Validation)**: "invalid", "validation", "parameter", "constraint", "format mismatch"
|
|
1179
|
+
* - **1 (Parsing)**: "parse", "parsing", "corrupt", "unexpected", "malformed", "invalid format"
|
|
1180
|
+
* - **2 (OCR)**: "ocr", "tesseract", "recognition", "optical"
|
|
1181
|
+
* - **3 (MissingDependency)**: "not found", "missing", "dependency", "not installed", "unavailable"
|
|
1182
|
+
* - **4 (Io)**: "io", "file", "read", "write", "permission", "access", "disk", "exists"
|
|
1183
|
+
* - **5 (Plugin)**: "plugin", "loader", "registry", "extension"
|
|
1184
|
+
* - **6 (UnsupportedFormat)**: "unsupported", "unknown format", "MIME type"
|
|
1185
|
+
*
|
|
1186
|
+
* # Thread Safety
|
|
1187
|
+
*
|
|
1188
|
+
* This function is thread-safe and has no side effects.
|
|
1189
|
+
*
|
|
1190
|
+
* # Example (C)
|
|
1191
|
+
*
|
|
1192
|
+
* ```c
|
|
1193
|
+
* uint32_t code = kreuzberg_classify_error("Failed to open file: permission denied");
|
|
1194
|
+
* if (code == kreuzberg_error_code_io()) {
|
|
1195
|
+
* printf("This is an I/O error\n");
|
|
1196
|
+
* }
|
|
1197
|
+
* ```
|
|
1198
|
+
*
|
|
1199
|
+
* # Safety
|
|
1200
|
+
*
|
|
1201
|
+
* - `error_message` must be a valid null-terminated C string or NULL
|
|
1202
|
+
* - `error_message` must remain valid for the duration of the function call
|
|
1203
|
+
*
|
|
1204
|
+
* # C Signature
|
|
1205
|
+
*
|
|
1206
|
+
* ```c
|
|
1207
|
+
* uint32_t kreuzberg_classify_error(const char* error_message);
|
|
1208
|
+
* ```
|
|
1209
|
+
*/
|
|
1210
|
+
uint32_t kreuzberg_classify_error(const char *error_message);
|
|
1211
|
+
|
|
1212
|
+
/**
|
|
1213
|
+
* Extract text and metadata from a file (synchronous).
|
|
1214
|
+
*
|
|
1215
|
+
* # Safety
|
|
1216
|
+
*
|
|
1217
|
+
* - `file_path` must be a valid null-terminated C string
|
|
1218
|
+
* - The returned pointer must be freed with `kreuzberg_free_result`
|
|
1219
|
+
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
1220
|
+
*
|
|
1221
|
+
* # Example (C)
|
|
1222
|
+
*
|
|
1223
|
+
* ```c
|
|
1224
|
+
* const char* path = "/path/to/document.pdf";
|
|
1225
|
+
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
1226
|
+
* if (result != NULL && result->success) {
|
|
1227
|
+
* printf("Content: %s\n", result->content);
|
|
1228
|
+
* printf("MIME: %s\n", result->mime_type);
|
|
1229
|
+
* kreuzberg_free_result(result);
|
|
1230
|
+
* } else {
|
|
1231
|
+
* const char* error = kreuzberg_last_error();
|
|
1232
|
+
* printf("Error: %s\n", error);
|
|
1233
|
+
* }
|
|
1234
|
+
* ```
|
|
1235
|
+
*/
|
|
1236
|
+
struct CExtractionResult *kreuzberg_extract_file_sync(const char *file_path);
|
|
1237
|
+
|
|
1238
|
+
/**
|
|
1239
|
+
* Extract text and metadata from a file with custom configuration (synchronous).
|
|
1240
|
+
*
|
|
1241
|
+
* # Safety
|
|
1242
|
+
*
|
|
1243
|
+
* - `file_path` must be a valid null-terminated C string
|
|
1244
|
+
* - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
1245
|
+
* - The returned pointer must be freed with `kreuzberg_free_result`
|
|
1246
|
+
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
1247
|
+
*
|
|
1248
|
+
* # Example (C)
|
|
1249
|
+
*
|
|
1250
|
+
* ```c
|
|
1251
|
+
* const char* path = "/path/to/document.pdf";
|
|
1252
|
+
* const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
|
|
1253
|
+
* CExtractionResult* result = kreuzberg_extract_file_sync_with_config(path, config);
|
|
1254
|
+
* if (result != NULL && result->success) {
|
|
1255
|
+
* printf("Content: %s\n", result->content);
|
|
1256
|
+
* kreuzberg_free_result(result);
|
|
1257
|
+
* }
|
|
1258
|
+
* ```
|
|
1259
|
+
*/
|
|
1260
|
+
struct CExtractionResult *kreuzberg_extract_file_sync_with_config(const char *file_path,
|
|
1261
|
+
const char *config_json);
|
|
1262
|
+
|
|
1263
|
+
/**
|
|
1264
|
+
* Extract text and metadata from byte array (synchronous).
|
|
1265
|
+
*
|
|
1266
|
+
* # Safety
|
|
1267
|
+
*
|
|
1268
|
+
* - `data` must be a valid pointer to a byte array of length `data_len`
|
|
1269
|
+
* - `mime_type` must be a valid null-terminated C string
|
|
1270
|
+
* - The returned pointer must be freed with `kreuzberg_free_result`
|
|
1271
|
+
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
1272
|
+
*
|
|
1273
|
+
* # Example (C)
|
|
1274
|
+
*
|
|
1275
|
+
* ```c
|
|
1276
|
+
* const uint8_t* data = ...; // Document bytes
|
|
1277
|
+
* size_t len = ...; // Length of data
|
|
1278
|
+
* const char* mime = "application/pdf";
|
|
1279
|
+
* CExtractionResult* result = kreuzberg_extract_bytes_sync(data, len, mime);
|
|
1280
|
+
* if (result != NULL && result->success) {
|
|
1281
|
+
* printf("Content: %s\n", result->content);
|
|
1282
|
+
* kreuzberg_free_result(result);
|
|
1283
|
+
* } else {
|
|
1284
|
+
* const char* error = kreuzberg_last_error();
|
|
1285
|
+
* printf("Error: %s\n", error);
|
|
1286
|
+
* }
|
|
1287
|
+
* ```
|
|
1288
|
+
*/
|
|
1289
|
+
struct CExtractionResult *kreuzberg_extract_bytes_sync(const uint8_t *data,
|
|
1290
|
+
uintptr_t data_len,
|
|
1291
|
+
const char *mime_type);
|
|
1292
|
+
|
|
1293
|
+
/**
|
|
1294
|
+
* Extract text and metadata from byte array with custom configuration (synchronous).
|
|
1295
|
+
*
|
|
1296
|
+
* # Safety
|
|
1297
|
+
*
|
|
1298
|
+
* - `data` must be a valid pointer to a byte array of length `data_len`
|
|
1299
|
+
* - `mime_type` must be a valid null-terminated C string
|
|
1300
|
+
* - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
1301
|
+
* - The returned pointer must be freed with `kreuzberg_free_result`
|
|
1302
|
+
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
1303
|
+
*
|
|
1304
|
+
* # Example (C)
|
|
1305
|
+
*
|
|
1306
|
+
* ```c
|
|
1307
|
+
* const uint8_t* data = ...; // Document bytes
|
|
1308
|
+
* size_t len = ...; // Length of data
|
|
1309
|
+
* const char* mime = "application/pdf";
|
|
1310
|
+
* const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
|
|
1311
|
+
* CExtractionResult* result = kreuzberg_extract_bytes_sync_with_config(data, len, mime, config);
|
|
1312
|
+
* if (result != NULL && result->success) {
|
|
1313
|
+
* printf("Content: %s\n", result->content);
|
|
1314
|
+
* kreuzberg_free_result(result);
|
|
1315
|
+
* }
|
|
1316
|
+
* ```
|
|
1317
|
+
*/
|
|
1318
|
+
struct CExtractionResult *kreuzberg_extract_bytes_sync_with_config(const uint8_t *data,
|
|
1319
|
+
uintptr_t data_len,
|
|
1320
|
+
const char *mime_type,
|
|
1321
|
+
const char *config_json);
|
|
1322
|
+
|
|
1323
|
+
/**
|
|
1324
|
+
* Batch extract text and metadata from multiple files (synchronous).
|
|
1325
|
+
*
|
|
1326
|
+
* # Safety
|
|
1327
|
+
*
|
|
1328
|
+
* - `file_paths` must be a valid pointer to an array of null-terminated C strings
|
|
1329
|
+
* - `count` must be the number of file paths in the array
|
|
1330
|
+
* - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
1331
|
+
* - The returned pointer must be freed with `kreuzberg_free_batch_result`
|
|
1332
|
+
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
1333
|
+
*
|
|
1334
|
+
* # Critical Memory Management
|
|
1335
|
+
*
|
|
1336
|
+
* This function has special memory management requirements due to the need to allocate
|
|
1337
|
+
* an array of result pointers:
|
|
1338
|
+
*
|
|
1339
|
+
* 1. Results are collected in a Vec<*mut CExtractionResult>
|
|
1340
|
+
* 2. The vec is converted to a boxed slice (changes allocation metadata)
|
|
1341
|
+
* 3. The boxed slice pointer is cast to *mut *mut CExtractionResult
|
|
1342
|
+
* 4. This pointer is stored in CBatchResult
|
|
1343
|
+
* 5. Deallocation must reverse this process using slice_from_raw_parts
|
|
1344
|
+
*
|
|
1345
|
+
* The Go segfault issue was caused by incorrect deallocation in the memory module.
|
|
1346
|
+
* This allocation pattern must be perfectly mirrored in the free function.
|
|
1347
|
+
*/
|
|
1348
|
+
struct CBatchResult *kreuzberg_batch_extract_files_sync(const char *const *file_paths,
|
|
1349
|
+
uintptr_t count,
|
|
1350
|
+
const char *config_json);
|
|
1351
|
+
|
|
1352
|
+
/**
|
|
1353
|
+
* Batch extract text and metadata from multiple byte arrays (synchronous).
|
|
1354
|
+
*
|
|
1355
|
+
* # Safety
|
|
1356
|
+
*
|
|
1357
|
+
* - `items` must be a valid pointer to an array of CBytesWithMime structures
|
|
1358
|
+
* - `count` must be the number of items in the array
|
|
1359
|
+
* - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
1360
|
+
* - The returned pointer must be freed with `kreuzberg_free_batch_result`
|
|
1361
|
+
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
1362
|
+
*
|
|
1363
|
+
* # Critical Memory Management
|
|
1364
|
+
*
|
|
1365
|
+
* This function shares the same critical memory management pattern as
|
|
1366
|
+
* `kreuzberg_batch_extract_files_sync`. See that function's documentation
|
|
1367
|
+
* for details on the Box/Vec/slice allocation pattern.
|
|
1368
|
+
*/
|
|
1369
|
+
struct CBatchResult *kreuzberg_batch_extract_bytes_sync(const struct CBytesWithMime *items,
|
|
1370
|
+
uintptr_t count,
|
|
1371
|
+
const char *config_json);
|
|
1372
|
+
|
|
1373
|
+
/**
|
|
1374
|
+
* Free a batch result returned by batch extraction functions.
|
|
1375
|
+
*
|
|
1376
|
+
* # Safety
|
|
1377
|
+
*
|
|
1378
|
+
* - `batch_result` must be a pointer previously returned by a batch extraction function
|
|
1379
|
+
* - `batch_result` can be NULL (no-op)
|
|
1380
|
+
* - `batch_result` must not be used after this call
|
|
1381
|
+
* - All individual results in the batch will be freed automatically
|
|
1382
|
+
*
|
|
1383
|
+
* # Memory Layout
|
|
1384
|
+
*
|
|
1385
|
+
* CRITICAL: The results array is allocated as `Box<[*mut CExtractionResult]>` (boxed slice),
|
|
1386
|
+
* NOT as `Vec<*mut CExtractionResult>`. We must use `Box::from_raw` with a slice pointer,
|
|
1387
|
+
* not `Vec::from_raw_parts`, to avoid Box/Vec mismatch that causes segfaults.
|
|
1388
|
+
*
|
|
1389
|
+
* # Example (C)
|
|
1390
|
+
*
|
|
1391
|
+
* ```c
|
|
1392
|
+
* CBatchResult* batch = kreuzberg_batch_extract_files_sync(paths, count, NULL);
|
|
1393
|
+
* // Use batch...
|
|
1394
|
+
* kreuzberg_free_batch_result(batch);
|
|
1395
|
+
* // batch is now invalid
|
|
1396
|
+
* ```
|
|
1397
|
+
*/
|
|
1398
|
+
void kreuzberg_free_batch_result(struct CBatchResult *batch_result);
|
|
1399
|
+
|
|
1400
|
+
/**
|
|
1401
|
+
* Free a string returned by Kreuzberg functions.
|
|
1402
|
+
*
|
|
1403
|
+
* # Safety
|
|
1404
|
+
*
|
|
1405
|
+
* - `s` must be a string previously returned by a Kreuzberg function
|
|
1406
|
+
* - `s` can be NULL (no-op)
|
|
1407
|
+
* - `s` must not be used after this call
|
|
1408
|
+
*
|
|
1409
|
+
* # Example (C)
|
|
1410
|
+
*
|
|
1411
|
+
* ```c
|
|
1412
|
+
* char* str = kreuzberg_detect_mime_type_from_path("document.pdf");
|
|
1413
|
+
* kreuzberg_free_string(str);
|
|
1414
|
+
* // str is now invalid
|
|
1415
|
+
* ```
|
|
1416
|
+
*/
|
|
1417
|
+
void kreuzberg_free_string(char *s);
|
|
1418
|
+
|
|
1419
|
+
/**
|
|
1420
|
+
* Clone a null-terminated string using Rust's allocator.
|
|
1421
|
+
*
|
|
1422
|
+
* # Safety
|
|
1423
|
+
*
|
|
1424
|
+
* - `s` must be a valid null-terminated UTF-8 string
|
|
1425
|
+
* - Returned pointer must be freed with `kreuzberg_free_string`
|
|
1426
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1427
|
+
*/
|
|
1428
|
+
char *kreuzberg_clone_string(const char *s);
|
|
1429
|
+
|
|
1430
|
+
/**
|
|
1431
|
+
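* Free an extraction result returned by a single-document extraction function (e.g. `kreuzberg_extract_file_sync`, `kreuzberg_extract_bytes_sync`, or their `_with_config` variants).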
|
|
1432
|
+
*
|
|
1433
|
+
* # Safety
|
|
1434
|
+
*
|
|
1435
|
+
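* - `result` must be a pointer previously returned by a single-document extraction function (e.g. `kreuzberg_extract_file_sync`, `kreuzberg_extract_bytes_sync`, or their `_with_config` variants)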
|
|
1436
|
+
* - `result` can be NULL (no-op)
|
|
1437
|
+
* - `result` must not be used after this call
|
|
1438
|
+
* - All string fields within the result will be freed automatically
|
|
1439
|
+
*
|
|
1440
|
+
* # Memory Layout
|
|
1441
|
+
*
|
|
1442
|
+
* This function frees all 12 string fields in CExtractionResult:
|
|
1443
|
+
* 1. content
|
|
1444
|
+
* 2. mime_type
|
|
1445
|
+
* 3. language
|
|
1446
|
+
* 4. date
|
|
1447
|
+
* 5. subject
|
|
1448
|
+
* 6. tables_json
|
|
1449
|
+
* 7. detected_languages_json
|
|
1450
|
+
* 8. metadata_json
|
|
1451
|
+
* 9. chunks_json
|
|
1452
|
+
* 10. images_json
|
|
1453
|
+
* 11. page_structure_json (FIXED: was missing before PR #3)
|
|
1454
|
+
* 12. pages_json (FIXED: was missing before PR #3)
|
|
1455
|
+
*
|
|
1456
|
+
* # Example (C)
|
|
1457
|
+
*
|
|
1458
|
+
* ```c
|
|
1459
|
+
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
1460
|
+
* // Use result...
|
|
1461
|
+
* kreuzberg_free_result(result);
|
|
1462
|
+
* // result is now invalid
|
|
1463
|
+
* ```
|
|
1464
|
+
*/
|
|
1465
|
+
void kreuzberg_free_result(struct CExtractionResult *result);
|
|
1466
|
+
|
|
1467
|
+
/**
|
|
1468
|
+
* Detect MIME type from a file path.
|
|
1469
|
+
*
|
|
1470
|
+
* # Safety
|
|
1471
|
+
*
|
|
1472
|
+
* - `file_path` must be a valid null-terminated C string
|
|
1473
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
1474
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1475
|
+
*/
|
|
1476
|
+
char *kreuzberg_detect_mime_type(const char *file_path, bool check_exists);
|
|
1477
|
+
|
|
1478
|
+
/**
|
|
1479
|
+
* Validate that a MIME type is supported by Kreuzberg.
|
|
1480
|
+
*
|
|
1481
|
+
* # Safety
|
|
1482
|
+
*
|
|
1483
|
+
* - `mime_type` must be a valid null-terminated C string
|
|
1484
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
1485
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1486
|
+
*/
|
|
1487
|
+
char *kreuzberg_validate_mime_type(const char *mime_type);
|
|
1488
|
+
|
|
1489
|
+
/**
|
|
1490
|
+
* Detect MIME type from raw bytes.
|
|
1491
|
+
*
|
|
1492
|
+
* # Safety
|
|
1493
|
+
*
|
|
1494
|
+
* - `bytes` must point to a valid buffer of at least `len` bytes
|
|
1495
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
1496
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1497
|
+
*
|
|
1498
|
+
* # Example (C)
|
|
1499
|
+
*
|
|
1500
|
+
* ```c
|
|
1501
|
+
* uint8_t data[512];
|
|
1502
|
+
* // ... read data ...
|
|
1503
|
+
* char* mime = kreuzberg_detect_mime_type_from_bytes(data, 512);
|
|
1504
|
+
* if (mime != NULL) {
|
|
1505
|
+
* printf("Detected MIME type: %s\n", mime);
|
|
1506
|
+
* kreuzberg_free_string(mime);
|
|
1507
|
+
* }
|
|
1508
|
+
* ```
|
|
1509
|
+
*/
|
|
1510
|
+
char *kreuzberg_detect_mime_type_from_bytes(const uint8_t *bytes, uintptr_t len);
|
|
1511
|
+
|
|
1512
|
+
/**
|
|
1513
|
+
* Detect MIME type from file path (checks extension and reads file content).
|
|
1514
|
+
*
|
|
1515
|
+
* # Safety
|
|
1516
|
+
*
|
|
1517
|
+
* - `file_path` must be a valid null-terminated C string
|
|
1518
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
1519
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1520
|
+
*
|
|
1521
|
+
* # Example (C)
|
|
1522
|
+
*
|
|
1523
|
+
* ```c
|
|
1524
|
+
* char* mime = kreuzberg_detect_mime_type_from_path("document.pdf");
|
|
1525
|
+
* if (mime == NULL) {
|
|
1526
|
+
* const char* error = kreuzberg_last_error();
|
|
1527
|
+
* printf("Failed to detect MIME type: %s\n", error);
|
|
1528
|
+
* } else {
|
|
1529
|
+
* printf("MIME type: %s\n", mime);
|
|
1530
|
+
* kreuzberg_free_string(mime);
|
|
1531
|
+
* }
|
|
1532
|
+
* ```
|
|
1533
|
+
*/
|
|
1534
|
+
char *kreuzberg_detect_mime_type_from_path(const char *file_path);
|
|
1535
|
+
|
|
1536
|
+
/**
|
|
1537
|
+
* Get file extensions for a MIME type.
|
|
1538
|
+
*
|
|
1539
|
+
* Returns a JSON array of file extensions (e.g., ["pdf"] for "application/pdf").
|
|
1540
|
+
*
|
|
1541
|
+
* # Safety
|
|
1542
|
+
*
|
|
1543
|
+
* - `mime_type` must be a valid null-terminated C string
|
|
1544
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
1545
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1546
|
+
*
|
|
1547
|
+
* # Example (C)
|
|
1548
|
+
*
|
|
1549
|
+
* ```c
|
|
1550
|
+
* char* extensions = kreuzberg_get_extensions_for_mime("application/pdf");
|
|
1551
|
+
* if (extensions != NULL) {
|
|
1552
|
+
* printf("Extensions: %s\n", extensions);
|
|
1553
|
+
* kreuzberg_free_string(extensions);
|
|
1554
|
+
* }
|
|
1555
|
+
* ```
|
|
1556
|
+
*/
|
|
1557
|
+
char *kreuzberg_get_extensions_for_mime(const char *mime_type);
|
|
1558
|
+
|
|
1559
|
+
/**
|
|
1560
|
+
* Register a custom DocumentExtractor via FFI callback.
|
|
1561
|
+
*
|
|
1562
|
+
* # Safety
|
|
1563
|
+
*
|
|
1564
|
+
* - `name` must be a valid null-terminated C string
|
|
1565
|
+
* - `callback` must be a valid function pointer that:
|
|
1566
|
+
* - Does not store the content, mime_type, or config_json pointers
|
|
1567
|
+
* - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
1568
|
+
* - The returned string must be freeable by kreuzberg_free_string
|
|
1569
|
+
* - `mime_types` must be a valid null-terminated C string containing comma-separated MIME types
|
|
1570
|
+
* - `priority` determines the order of selection (higher priority preferred)
|
|
1571
|
+
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1572
|
+
*
|
|
1573
|
+
* # Example (C)
|
|
1574
|
+
*
|
|
1575
|
+
* ```c
|
|
1576
|
+
* char* my_extractor(const uint8_t* content, size_t len, const char* mime_type, const char* config) {
|
|
1577
|
+
* // Extract content from bytes, return JSON ExtractionResult
|
|
1578
|
+
* return kreuzberg_clone_string("{\"content\":\"extracted text\",\"mime_type\":\"text/plain\",\"metadata\":{}}");
|
|
1579
|
+
* }
|
|
1580
|
+
*
|
|
1581
|
+
* bool success = kreuzberg_register_document_extractor(
|
|
1582
|
+
* "my-extractor",
|
|
1583
|
+
* my_extractor,
|
|
1584
|
+
* "application/x-custom,text/x-custom",
|
|
1585
|
+
* 100
|
|
1586
|
+
* );
|
|
1587
|
+
* if (!success) {
|
|
1588
|
+
* const char* error = kreuzberg_last_error();
|
|
1589
|
+
* printf("Failed to register: %s\n", error);
|
|
1590
|
+
* }
|
|
1591
|
+
* ```
|
|
1592
|
+
*/
|
|
1593
|
+
bool kreuzberg_register_document_extractor(const char *name,
|
|
1594
|
+
DocumentExtractorCallback callback,
|
|
1595
|
+
const char *mime_types,
|
|
1596
|
+
int32_t priority);
|
|
1597
|
+
|
|
1598
|
+
/**
|
|
1599
|
+
* Unregister a DocumentExtractor by name.
|
|
1600
|
+
*
|
|
1601
|
+
* # Safety
|
|
1602
|
+
*
|
|
1603
|
+
* - `name` must be a valid null-terminated C string
|
|
1604
|
+
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1605
|
+
*
|
|
1606
|
+
* # Example (C)
|
|
1607
|
+
*
|
|
1608
|
+
* ```c
|
|
1609
|
+
* bool success = kreuzberg_unregister_document_extractor("my-extractor");
|
|
1610
|
+
* if (!success) {
|
|
1611
|
+
* const char* error = kreuzberg_last_error();
|
|
1612
|
+
* printf("Failed to unregister: %s\n", error);
|
|
1613
|
+
* }
|
|
1614
|
+
* ```
|
|
1615
|
+
*/
|
|
1616
|
+
bool kreuzberg_unregister_document_extractor(const char *name);
|
|
1617
|
+
|
|
1618
|
+
/**
|
|
1619
|
+
* List all registered DocumentExtractors as a JSON array of names.
|
|
1620
|
+
*
|
|
1621
|
+
* # Safety
|
|
1622
|
+
*
|
|
1623
|
+
* - Returned string must be freed with `kreuzberg_free_string`.
|
|
1624
|
+
* - Returns NULL on error (check `kreuzberg_last_error`).
|
|
1625
|
+
*/
|
|
1626
|
+
char *kreuzberg_list_document_extractors(void);
|
|
1627
|
+
|
|
1628
|
+
/**
|
|
1629
|
+
* Clear all registered DocumentExtractors.
|
|
1630
|
+
*
|
|
1631
|
+
* # Safety
|
|
1632
|
+
*
|
|
1633
|
+
* - Removes all registered extractors. Subsequent extractions will use only built-in extractors.
|
|
1634
|
+
* - Returns true on success, false on error.
|
|
1635
|
+
*
|
|
1636
|
+
* # Example (C)
|
|
1637
|
+
*
|
|
1638
|
+
* ```c
|
|
1639
|
+
* bool success = kreuzberg_clear_document_extractors();
|
|
1640
|
+
* if (!success) {
|
|
1641
|
+
* const char* error = kreuzberg_last_error();
|
|
1642
|
+
* printf("Failed to clear document extractors: %s\n", error);
|
|
1643
|
+
* }
|
|
1644
|
+
* ```
|
|
1645
|
+
*/
|
|
1646
|
+
bool kreuzberg_clear_document_extractors(void);
|
|
1647
|
+
|
|
1648
|
+
/**
|
|
1649
|
+
* Register a custom OCR backend via FFI callback.
|
|
1650
|
+
*
|
|
1651
|
+
* # Safety
|
|
1652
|
+
*
|
|
1653
|
+
* - `name` must be a valid null-terminated C string
|
|
1654
|
+
* - `callback` must be a valid function pointer that:
|
|
1655
|
+
* - Does not store the image_bytes pointer
|
|
1656
|
+
* - Returns a null-terminated UTF-8 string or NULL on error
|
|
1657
|
+
* - The returned string must be freeable by kreuzberg_free_string
|
|
1658
|
+
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1659
|
+
*
|
|
1660
|
+
* # Example (C)
|
|
1661
|
+
*
|
|
1662
|
+
* ```c
|
|
1663
|
+
* char* my_ocr_backend(const uint8_t* image_bytes, size_t image_length, const char* config_json) {
|
|
1664
|
+
* // Implement OCR logic here
|
|
1665
|
+
* // Return allocated string with result, or NULL on error
|
|
1666
|
+
* return kreuzberg_clone_string("Extracted text");
|
|
1667
|
+
* }
|
|
1668
|
+
*
|
|
1669
|
+
* bool success = kreuzberg_register_ocr_backend("my-ocr", my_ocr_backend);
|
|
1670
|
+
* if (!success) {
|
|
1671
|
+
* const char* error = kreuzberg_last_error();
|
|
1672
|
+
* printf("Failed to register: %s\n", error);
|
|
1673
|
+
* }
|
|
1674
|
+
* ```
|
|
1675
|
+
*/
|
|
1676
|
+
bool kreuzberg_register_ocr_backend(const char *name, OcrBackendCallback callback);
|
|
1677
|
+
|
|
1678
|
+
/**
|
|
1679
|
+
* Register a custom OCR backend with explicit language support via FFI callback.
|
|
1680
|
+
*
|
|
1681
|
+
* # Safety
|
|
1682
|
+
*
|
|
1683
|
+
* - `languages_json` must be a null-terminated JSON array of language codes or NULL
|
|
1684
|
+
* - See `kreuzberg_register_ocr_backend` for additional safety notes.
|
|
1685
|
+
*/
|
|
1686
|
+
bool kreuzberg_register_ocr_backend_with_languages(const char *name,
|
|
1687
|
+
OcrBackendCallback callback,
|
|
1688
|
+
const char *languages_json);
|
|
1689
|
+
|
|
1690
|
+
/**
|
|
1691
|
+
* Unregister an OCR backend by name.
|
|
1692
|
+
*
|
|
1693
|
+
* # Safety
|
|
1694
|
+
*
|
|
1695
|
+
* - `name` must be a valid null-terminated C string
|
|
1696
|
+
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1697
|
+
*
|
|
1698
|
+
* # Example (C)
|
|
1699
|
+
*
|
|
1700
|
+
* ```c
|
|
1701
|
+
* bool success = kreuzberg_unregister_ocr_backend("custom-ocr");
|
|
1702
|
+
* if (!success) {
|
|
1703
|
+
* const char* error = kreuzberg_last_error();
|
|
1704
|
+
* printf("Failed to unregister: %s\n", error);
|
|
1705
|
+
* }
|
|
1706
|
+
* ```
|
|
1707
|
+
*/
|
|
1708
|
+
bool kreuzberg_unregister_ocr_backend(const char *name);
|
|
1709
|
+
|
|
1710
|
+
/**
|
|
1711
|
+
* List all registered OCR backends as a JSON array of names.
|
|
1712
|
+
*
|
|
1713
|
+
* # Safety
|
|
1714
|
+
*
|
|
1715
|
+
* - Returned string must be freed with `kreuzberg_free_string`.
|
|
1716
|
+
* - Returns NULL on error (check `kreuzberg_last_error`).
|
|
1717
|
+
*
|
|
1718
|
+
* # Example (C)
|
|
1719
|
+
*
|
|
1720
|
+
* ```c
|
|
1721
|
+
* char* backends = kreuzberg_list_ocr_backends();
|
|
1722
|
+
* if (backends == NULL) {
|
|
1723
|
+
* const char* error = kreuzberg_last_error();
|
|
1724
|
+
* printf("Failed to list backends: %s\n", error);
|
|
1725
|
+
* } else {
|
|
1726
|
+
* printf("OCR backends: %s\n", backends);
|
|
1727
|
+
* kreuzberg_free_string(backends);
|
|
1728
|
+
* }
|
|
1729
|
+
* ```
|
|
1730
|
+
*/
|
|
1731
|
+
char *kreuzberg_list_ocr_backends(void);
|
|
1732
|
+
|
|
1733
|
+
/**
|
|
1734
|
+
* Clear all registered OCR backends.
|
|
1735
|
+
*
|
|
1736
|
+
* # Safety
|
|
1737
|
+
*
|
|
1738
|
+
* - Removes all registered OCR backends. Subsequent extractions will use only built-in backends.
|
|
1739
|
+
* - Returns true on success, false on error.
|
|
1740
|
+
*
|
|
1741
|
+
* # Example (C)
|
|
1742
|
+
*
|
|
1743
|
+
* ```c
|
|
1744
|
+
* bool success = kreuzberg_clear_ocr_backends();
|
|
1745
|
+
* if (!success) {
|
|
1746
|
+
* const char* error = kreuzberg_last_error();
|
|
1747
|
+
* printf("Failed to clear OCR backends: %s\n", error);
|
|
1748
|
+
* }
|
|
1749
|
+
* ```
|
|
1750
|
+
*/
|
|
1751
|
+
bool kreuzberg_clear_ocr_backends(void);
|
|
1752
|
+
|
|
1753
|
+
/**
|
|
1754
|
+
* Get supported languages for an OCR backend.
|
|
1755
|
+
*
|
|
1756
|
+
* Returns a JSON array of supported language codes for the given backend.
|
|
1757
|
+
* Supported backends: "easyocr", "paddleocr", "tesseract"
|
|
1758
|
+
*
|
|
1759
|
+
* # Safety
|
|
1760
|
+
*
|
|
1761
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
1762
|
+
* - Returns NULL if backend not found or on error (check `kreuzberg_last_error`)
|
|
1763
|
+
*
|
|
1764
|
+
* # Example (C)
|
|
1765
|
+
*
|
|
1766
|
+
* ```c
|
|
1767
|
+
* char* languages = kreuzberg_get_ocr_languages("easyocr");
|
|
1768
|
+
* if (languages != NULL) {
|
|
1769
|
+
* printf("EasyOCR languages: %s\n", languages);
|
|
1770
|
+
* kreuzberg_free_string(languages);
|
|
1771
|
+
* }
|
|
1772
|
+
* ```
|
|
1773
|
+
*/
|
|
1774
|
+
char *kreuzberg_get_ocr_languages(const char *backend);
|
|
1775
|
+
|
|
1776
|
+
/**
|
|
1777
|
+
* Check if a language is supported by an OCR backend.
|
|
1778
|
+
*
|
|
1779
|
+
* Returns 1 (true) if the language is supported, 0 (false) otherwise.
|
|
1780
|
+
*
|
|
1781
|
+
* # Arguments
|
|
1782
|
+
*
|
|
1783
|
+
* * `backend` - Backend name (e.g., "easyocr", "paddleocr", "tesseract")
|
|
1784
|
+
* * `language` - Language code to check
|
|
1785
|
+
*
|
|
1786
|
+
* # Returns
|
|
1787
|
+
*
|
|
1788
|
+
* 1 if supported, 0 if not supported or backend not found.
|
|
1789
|
+
*
|
|
1790
|
+
* # Example (C)
|
|
1791
|
+
*
|
|
1792
|
+
* ```c
|
|
1793
|
+
* int is_supported = kreuzberg_is_language_supported("easyocr", "en");
|
|
1794
|
+
* if (is_supported) {
|
|
1795
|
+
* printf("English is supported by EasyOCR\n");
|
|
1796
|
+
* }
|
|
1797
|
+
* ```
|
|
1798
|
+
*
|
|
1799
|
+
* # Safety
|
|
1800
|
+
*
|
|
1801
|
+
* - `backend` and `language` must be valid pointers to valid UTF-8 C strings.
|
|
1802
|
+
* - Both pointers are checked for NULL; the function returns 0 if either is NULL.
|
|
1803
|
+
* - The C strings must remain valid for the duration of the function call.
|
|
1804
|
+
*/
|
|
1805
|
+
int32_t kreuzberg_is_language_supported(const char *backend, const char *language);
|
|
1806
|
+
|
|
1807
|
+
/**
|
|
1808
|
+
* Get list of all registered OCR backends with language support.
|
|
1809
|
+
*
|
|
1810
|
+
* Returns a JSON object mapping backend names to language counts.
|
|
1811
|
+
* Example: `{"easyocr": 80, "paddleocr": 14, "tesseract": 100}`
|
|
1812
|
+
*
|
|
1813
|
+
* # Safety
|
|
1814
|
+
*
|
|
1815
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
1816
|
+
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1817
|
+
*
|
|
1818
|
+
* # Example (C)
|
|
1819
|
+
*
|
|
1820
|
+
* ```c
|
|
1821
|
+
* char* backends = kreuzberg_list_ocr_backends_with_languages();
|
|
1822
|
+
* if (backends != NULL) {
|
|
1823
|
+
* printf("Available backends: %s\n", backends);
|
|
1824
|
+
* kreuzberg_free_string(backends);
|
|
1825
|
+
* }
|
|
1826
|
+
* ```
|
|
1827
|
+
*/
|
|
1828
|
+
char *kreuzberg_list_ocr_backends_with_languages(void);
|
|
1829
|
+
|
|
1830
|
+
/**
|
|
1831
|
+
* Register a custom PostProcessor via FFI callback.
|
|
1832
|
+
*
|
|
1833
|
+
* # Safety
|
|
1834
|
+
*
|
|
1835
|
+
* - `name` must be a valid null-terminated C string
|
|
1836
|
+
* - `callback` must be a valid function pointer that:
|
|
1837
|
+
* - Does not store the result_json pointer
|
|
1838
|
+
* - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
1839
|
+
* - The returned string must be freeable by kreuzberg_free_string
|
|
1840
|
+
* - `priority` determines the order of execution (higher priority runs first)
|
|
1841
|
+
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1842
|
+
*
|
|
1843
|
+
* # Example (C)
|
|
1844
|
+
*
|
|
1845
|
+
* ```c
|
|
1846
|
+
* char* my_post_processor(const char* result_json) {
|
|
1847
|
+
* // Parse result_json, modify it, return JSON string
|
|
1848
|
+
* return strdup("{\"content\":\"PROCESSED\"}");
|
|
1849
|
+
* }
|
|
1850
|
+
*
|
|
1851
|
+
* bool success = kreuzberg_register_post_processor("my-processor", my_post_processor, 100);
|
|
1852
|
+
* if (!success) {
|
|
1853
|
+
* const char* error = kreuzberg_last_error();
|
|
1854
|
+
* printf("Failed to register: %s\n", error);
|
|
1855
|
+
* }
|
|
1856
|
+
* ```
|
|
1857
|
+
*/
|
|
1858
|
+
bool kreuzberg_register_post_processor(const char *name,
|
|
1859
|
+
PostProcessorCallback callback,
|
|
1860
|
+
int32_t priority);
|
|
1861
|
+
|
|
1862
|
+
/**
|
|
1863
|
+
* Register a custom PostProcessor with an explicit processing stage.
|
|
1864
|
+
*
|
|
1865
|
+
* # Safety
|
|
1866
|
+
*
|
|
1867
|
+
* - `name` must be a valid null-terminated C string
|
|
1868
|
+
* - `stage` must be a valid null-terminated C string containing "early", "middle", or "late"
|
|
1869
|
+
* - `callback` must be a valid function pointer that:
|
|
1870
|
+
* - Does not store the result_json pointer
|
|
1871
|
+
* - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
1872
|
+
* - The returned string must be freeable by kreuzberg_free_string
|
|
1873
|
+
* - `priority` determines the order of execution within the stage (higher priority runs first)
|
|
1874
|
+
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1875
|
+
*/
|
|
1876
|
+
bool kreuzberg_register_post_processor_with_stage(const char *name,
|
|
1877
|
+
PostProcessorCallback callback,
|
|
1878
|
+
int32_t priority,
|
|
1879
|
+
const char *stage);
|
|
1880
|
+
|
|
1881
|
+
/**
|
|
1882
|
+
* Unregister a PostProcessor by name.
|
|
1883
|
+
*
|
|
1884
|
+
* # Safety
|
|
1885
|
+
*
|
|
1886
|
+
* - `name` must be a valid null-terminated C string
|
|
1887
|
+
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1888
|
+
*
|
|
1889
|
+
* # Example (C)
|
|
1890
|
+
*
|
|
1891
|
+
* ```c
|
|
1892
|
+
* bool success = kreuzberg_unregister_post_processor("my-processor");
|
|
1893
|
+
* if (!success) {
|
|
1894
|
+
* const char* error = kreuzberg_last_error();
|
|
1895
|
+
* printf("Failed to unregister: %s\n", error);
|
|
1896
|
+
* }
|
|
1897
|
+
* ```
|
|
1898
|
+
*/
|
|
1899
|
+
bool kreuzberg_unregister_post_processor(const char *name);
|
|
1900
|
+
|
|
1901
|
+
/**
|
|
1902
|
+
* Clear all registered PostProcessors.
|
|
1903
|
+
*
|
|
1904
|
+
* # Safety
|
|
1905
|
+
*
|
|
1906
|
+
* - Removes all registered processors. Subsequent extractions will run without them.
|
|
1907
|
+
* - Returns true on success, false on error.
|
|
1908
|
+
*/
|
|
1909
|
+
bool kreuzberg_clear_post_processors(void);
|
|
1910
|
+
|
|
1911
|
+
/**
|
|
1912
|
+
* List all registered PostProcessors as a JSON array of names.
|
|
1913
|
+
*
|
|
1914
|
+
* # Safety
|
|
1915
|
+
*
|
|
1916
|
+
* - Returned string must be freed with `kreuzberg_free_string`.
|
|
1917
|
+
* - Returns NULL on error (check `kreuzberg_last_error`).
|
|
1918
|
+
*/
|
|
1919
|
+
char *kreuzberg_list_post_processors(void);
|
|
1920
|
+
|
|
1921
|
+
/**
|
|
1922
|
+
* Register a custom Validator via FFI callback.
|
|
1923
|
+
*
|
|
1924
|
+
* # Safety
|
|
1925
|
+
*
|
|
1926
|
+
* - `name` must be a valid null-terminated C string
|
|
1927
|
+
* - `callback` must be a valid function pointer that:
|
|
1928
|
+
* - Does not store the result_json pointer
|
|
1929
|
+
* - Returns a null-terminated UTF-8 string (error message) if validation fails
|
|
1930
|
+
* - Returns NULL if validation passes
|
|
1931
|
+
* - The returned string must be freeable by kreuzberg_free_string
|
|
1932
|
+
* - `priority` determines the order of validation (higher priority runs first)
|
|
1933
|
+
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1934
|
+
*
|
|
1935
|
+
* # Example (C)
|
|
1936
|
+
*
|
|
1937
|
+
* ```c
|
|
1938
|
+
* char* my_validator(const char* result_json) {
|
|
1939
|
+
* // Parse result_json, validate it
|
|
1940
|
+
* // Return error message if validation fails, NULL if passes
|
|
1941
|
+
* if (invalid) {
|
|
1942
|
+
* return strdup("Validation failed: content too short");
|
|
1943
|
+
* }
|
|
1944
|
+
* return NULL;
|
|
1945
|
+
* }
|
|
1946
|
+
*
|
|
1947
|
+
* bool success = kreuzberg_register_validator("my-validator", my_validator, 100);
|
|
1948
|
+
* if (!success) {
|
|
1949
|
+
* const char* error = kreuzberg_last_error();
|
|
1950
|
+
* printf("Failed to register: %s\n", error);
|
|
1951
|
+
* }
|
|
1952
|
+
* ```
|
|
1953
|
+
*/
|
|
1954
|
+
bool kreuzberg_register_validator(const char *name, ValidatorCallback callback, int32_t priority);
|
|
1955
|
+
|
|
1956
|
+
/**
|
|
1957
|
+
* Unregister a Validator by name.
|
|
1958
|
+
*
|
|
1959
|
+
* # Safety
|
|
1960
|
+
*
|
|
1961
|
+
* - `name` must be a valid null-terminated C string
|
|
1962
|
+
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
1963
|
+
*
|
|
1964
|
+
* # Example (C)
|
|
1965
|
+
*
|
|
1966
|
+
* ```c
|
|
1967
|
+
* bool success = kreuzberg_unregister_validator("my-validator");
|
|
1968
|
+
* if (!success) {
|
|
1969
|
+
* const char* error = kreuzberg_last_error();
|
|
1970
|
+
* printf("Failed to unregister: %s\n", error);
|
|
1971
|
+
* }
|
|
1972
|
+
* ```
|
|
1973
|
+
*/
|
|
1974
|
+
bool kreuzberg_unregister_validator(const char *name);
|
|
1975
|
+
|
|
1976
|
+
/**
|
|
1977
|
+
* Clear all registered Validators.
|
|
1978
|
+
*
|
|
1979
|
+
* # Safety
|
|
1980
|
+
*
|
|
1981
|
+
* - Removes all validators. Subsequent extractions will skip custom validation.
|
|
1982
|
+
* - Returns true on success, false on error.
|
|
1983
|
+
*/
|
|
1984
|
+
bool kreuzberg_clear_validators(void);
|
|
1985
|
+
|
|
1986
|
+
/**
|
|
1987
|
+
* List all registered Validators as a JSON array of names.
|
|
1988
|
+
*
|
|
1989
|
+
* # Safety
|
|
1990
|
+
*
|
|
1991
|
+
* - Returned string must be freed with `kreuzberg_free_string`.
|
|
1992
|
+
* - Returns NULL on error (check `kreuzberg_last_error`).
|
|
1993
|
+
*/
|
|
1994
|
+
char *kreuzberg_list_validators(void);
|
|
1995
|
+
|
|
1996
|
+
/**
|
|
1997
|
+
* Get page count from extraction result.
|
|
1998
|
+
*
|
|
1999
|
+
* Returns the total number of pages/slides/sheets detected in the document.
|
|
2000
|
+
*
|
|
2001
|
+
* # Arguments
|
|
2002
|
+
*
|
|
2003
|
+
* * `result` - Pointer to an ExtractionResult structure
|
|
2004
|
+
*
|
|
2005
|
+
* # Returns
|
|
2006
|
+
*
|
|
2007
|
+
* The page count (>= 0) if successful, or -1 on error (check `kreuzberg_last_error`).
|
|
2008
|
+
*
|
|
2009
|
+
* # Safety
|
|
2010
|
+
*
|
|
2011
|
+
* - `result` must be a valid pointer to an ExtractionResult
|
|
2012
|
+
* - `result` cannot be NULL
|
|
2013
|
+
*
|
|
2014
|
+
* # Example (C)
|
|
2015
|
+
*
|
|
2016
|
+
* ```c
|
|
2017
|
+
* ExtractionResult* result = kreuzberg_extract_file("document.pdf", NULL);
|
|
2018
|
+
* if (result != NULL) {
|
|
2019
|
+
* int page_count = kreuzberg_result_get_page_count(result);
|
|
2020
|
+
* if (page_count >= 0) {
|
|
2021
|
+
* printf("Document has %d pages\n", page_count);
|
|
2022
|
+
* }
|
|
2023
|
+
* kreuzberg_result_free(result);
|
|
2024
|
+
* }
|
|
2025
|
+
* ```
|
|
2026
|
+
*/
|
|
2027
|
+
int32_t kreuzberg_result_get_page_count(const ExtractionResult *result);
|
|
2028
|
+
|
|
2029
|
+
/**
|
|
2030
|
+
* Get chunk count from extraction result.
|
|
2031
|
+
*
|
|
2032
|
+
* Returns the number of text chunks when chunking is enabled, or 0 if chunking
|
|
2033
|
+
* was not performed.
|
|
2034
|
+
*
|
|
2035
|
+
* # Arguments
|
|
2036
|
+
*
|
|
2037
|
+
* * `result` - Pointer to an ExtractionResult structure
|
|
2038
|
+
*
|
|
2039
|
+
* # Returns
|
|
2040
|
+
*
|
|
2041
|
+
* The chunk count (>= 0) if successful, or -1 on error (check `kreuzberg_last_error`).
|
|
2042
|
+
*
|
|
2043
|
+
* # Safety
|
|
2044
|
+
*
|
|
2045
|
+
* - `result` must be a valid pointer to an ExtractionResult
|
|
2046
|
+
* - `result` cannot be NULL
|
|
2047
|
+
*
|
|
2048
|
+
* # Example (C)
|
|
2049
|
+
*
|
|
2050
|
+
* ```c
|
|
2051
|
+
* ExtractionResult* result = kreuzberg_extract_file("document.pdf", config);
|
|
2052
|
+
* if (result != NULL) {
|
|
2053
|
+
* int chunk_count = kreuzberg_result_get_chunk_count(result);
|
|
2054
|
+
* if (chunk_count >= 0) {
|
|
2055
|
+
* printf("Document has %d chunks\n", chunk_count);
|
|
2056
|
+
* }
|
|
2057
|
+
* kreuzberg_result_free(result);
|
|
2058
|
+
* }
|
|
2059
|
+
* ```
|
|
2060
|
+
*/
|
|
2061
|
+
int32_t kreuzberg_result_get_chunk_count(const ExtractionResult *result);
|
|
2062
|
+
|
|
2063
|
+
/**
|
|
2064
|
+
* Get detected language from extraction result.
|
|
2065
|
+
*
|
|
2066
|
+
* Returns the primary detected language as an ISO 639 language code.
|
|
2067
|
+
* If multiple languages were detected, returns the primary one.
|
|
2068
|
+
*
|
|
2069
|
+
* # Arguments
|
|
2070
|
+
*
|
|
2071
|
+
* * `result` - Pointer to an ExtractionResult structure
|
|
2072
|
+
*
|
|
2073
|
+
* # Returns
|
|
2074
|
+
*
|
|
2075
|
+
* A pointer to a C string containing the language code (e.g., "en", "de"),
|
|
2076
|
+
* or NULL if no language was detected or on error (check `kreuzberg_last_error`).
|
|
2077
|
+
*
|
|
2078
|
+
* The returned pointer must be freed with `kreuzberg_free_string()`.
|
|
2079
|
+
*
|
|
2080
|
+
* # Safety
|
|
2081
|
+
*
|
|
2082
|
+
* - `result` must be a valid pointer to an ExtractionResult
|
|
2083
|
+
* - `result` cannot be NULL
|
|
2084
|
+
* - The returned pointer (if non-NULL) must be freed with `kreuzberg_free_string`
|
|
2085
|
+
*
|
|
2086
|
+
* # Example (C)
|
|
2087
|
+
*
|
|
2088
|
+
* ```c
|
|
2089
|
+
* ExtractionResult* result = kreuzberg_extract_file("document.pdf", NULL);
|
|
2090
|
+
* if (result != NULL) {
|
|
2091
|
+
* char* language = kreuzberg_result_get_detected_language(result);
|
|
2092
|
+
* if (language != NULL) {
|
|
2093
|
+
* printf("Detected language: %s\n", language);
|
|
2094
|
+
* kreuzberg_free_string(language);
|
|
2095
|
+
* }
|
|
2096
|
+
* kreuzberg_result_free(result);
|
|
2097
|
+
* }
|
|
2098
|
+
* ```
|
|
2099
|
+
*/
|
|
2100
|
+
char *kreuzberg_result_get_detected_language(const ExtractionResult *result);
|
|
2101
|
+
|
|
2102
|
+
/**
|
|
2103
|
+
* Get a metadata field by name.
|
|
2104
|
+
*
|
|
2105
|
+
* Retrieves a metadata field from the extraction result and returns its value
|
|
2106
|
+
* as a JSON string. Supports nested fields with dot notation (e.g., "format.pages").
|
|
2107
|
+
*
|
|
2108
|
+
* # Arguments
|
|
2109
|
+
*
|
|
2110
|
+
* * `result` - Pointer to an ExtractionResult structure
|
|
2111
|
+
* * `field_name` - Null-terminated C string with the field name
|
|
2112
|
+
*
|
|
2113
|
+
* # Returns
|
|
2114
|
+
*
|
|
2115
|
+
* A CMetadataField structure containing:
|
|
2116
|
+
* - `name`: The field name (caller should not free)
|
|
2117
|
+
* - `json_value`: Pointer to field value as JSON string (must free with `kreuzberg_free_string`),
|
|
2118
|
+
* or NULL if field doesn't exist
|
|
2119
|
+
* - `is_null`: 1 if field doesn't exist, 0 if it does
|
|
2120
|
+
*
|
|
2121
|
+
* # Safety
|
|
2122
|
+
*
|
|
2123
|
+
* - `result` must be a valid pointer to an ExtractionResult
|
|
2124
|
+
* - `field_name` must be a valid null-terminated C string
|
|
2125
|
+
* - Neither parameter can be NULL
|
|
2126
|
+
* - The returned `json_value` (if non-NULL) must be freed with `kreuzberg_free_string`
|
|
2127
|
+
*
|
|
2128
|
+
* # Example (C)
|
|
2129
|
+
*
|
|
2130
|
+
* ```c
|
|
2131
|
+
* ExtractionResult* result = kreuzberg_extract_file("document.pdf", NULL);
|
|
2132
|
+
* if (result != NULL) {
|
|
2133
|
+
* CMetadataField title_field = kreuzberg_result_get_metadata_field(result, "title");
|
|
2134
|
+
* if (!title_field.is_null) {
|
|
2135
|
+
* printf("Title: %s\n", title_field.json_value);
|
|
2136
|
+
* kreuzberg_free_string(title_field.json_value);
|
|
2137
|
+
* }
|
|
2138
|
+
*
|
|
2139
|
+
* CMetadataField author_field = kreuzberg_result_get_metadata_field(result, "authors");
|
|
2140
|
+
* if (!author_field.is_null) {
|
|
2141
|
+
* printf("Authors: %s\n", author_field.json_value);
|
|
2142
|
+
* kreuzberg_free_string(author_field.json_value);
|
|
2143
|
+
* }
|
|
2144
|
+
*
|
|
2145
|
+
* kreuzberg_result_free(result);
|
|
2146
|
+
* }
|
|
2147
|
+
* ```
|
|
2148
|
+
*/
|
|
2149
|
+
struct CMetadataField kreuzberg_result_get_metadata_field(const ExtractionResult *result,
|
|
2150
|
+
const char *field_name);
|
|
2151
|
+
|
|
2152
|
+
/**
|
|
2153
|
+
* Create a new result pool with specified initial capacity.
|
|
2154
|
+
*
|
|
2155
|
+
* Pre-allocates storage for `capacity` results to reduce allocation overhead.
|
|
2156
|
+
* Pool automatically grows if capacity is exceeded.
|
|
2157
|
+
*
|
|
2158
|
+
* # Arguments
|
|
2159
|
+
*
|
|
2160
|
+
* * `capacity` - Initial capacity (number of results to pre-allocate storage for)
|
|
2161
|
+
*
|
|
2162
|
+
* # Returns
|
|
2163
|
+
*
|
|
2164
|
+
* Pointer to allocated pool, or NULL on allocation failure (check `kreuzberg_last_error`).
|
|
2165
|
+
*
|
|
2166
|
+
* # Memory Management
|
|
2167
|
+
*
|
|
2168
|
+
* Caller must free the returned pool with `kreuzberg_result_pool_free()`.
|
|
2169
|
+
*
|
|
2170
|
+
* # Example (C)
|
|
2171
|
+
*
|
|
2172
|
+
* ```c
|
|
2173
|
+
* struct ResultPool *pool = kreuzberg_result_pool_new(100);
|
|
2174
|
+
* if (pool == NULL) {
|
|
2175
|
+
* fprintf(stderr, "Failed to create pool: %s\n", kreuzberg_last_error());
|
|
2176
|
+
* return;
|
|
2177
|
+
* }
|
|
2178
|
+
* // Use pool...
|
|
2179
|
+
* kreuzberg_result_pool_free(pool);
|
|
2180
|
+
* ```
|
|
2181
|
+
*/
|
|
2182
|
+
struct ResultPool *kreuzberg_result_pool_new(uintptr_t capacity);
|
|
2183
|
+
|
|
2184
|
+
/**
|
|
2185
|
+
* Reset pool by clearing all results.
|
|
2186
|
+
*
|
|
2187
|
+
* Removes all results from the pool but retains allocated capacity.
|
|
2188
|
+
* After reset, pool can be reused for new extractions.
|
|
2189
|
+
*
|
|
2190
|
+
* # Arguments
|
|
2191
|
+
*
|
|
2192
|
+
* * `pool` - Pointer to result pool
|
|
2193
|
+
*
|
|
2194
|
+
* # Safety
|
|
2195
|
+
*
|
|
2196
|
+
* - `pool` must be a valid pointer returned by `kreuzberg_result_pool_new()`
|
|
2197
|
+
* - `pool` cannot be NULL
|
|
2198
|
+
* - All result pointers obtained from this pool become invalid after reset
|
|
2199
|
+
* - Must not be called concurrently with extractions using same pool
|
|
2200
|
+
*
|
|
2201
|
+
* # Example (C)
|
|
2202
|
+
*
|
|
2203
|
+
* ```c
|
|
2204
|
+
* struct ResultPool *pool = kreuzberg_result_pool_new(100);
|
|
2205
|
+
*
|
|
2206
|
+
* // Process batch 1
|
|
2207
|
+
* for (int i = 0; i < 50; i++) {
|
|
2208
|
+
* kreuzberg_extract_file_into_pool(files[i], NULL, pool);
|
|
2209
|
+
* }
|
|
2210
|
+
*
|
|
2211
|
+
* // Reset and reuse
|
|
2212
|
+
* kreuzberg_result_pool_reset(pool);
|
|
2213
|
+
*
|
|
2214
|
+
* // Process batch 2
|
|
2215
|
+
* for (int i = 0; i < 50; i++) {
|
|
2216
|
+
* kreuzberg_extract_file_into_pool(other_files[i], NULL, pool);
|
|
2217
|
+
* }
|
|
2218
|
+
*
|
|
2219
|
+
* kreuzberg_result_pool_free(pool);
|
|
2220
|
+
* ```
|
|
2221
|
+
*/
|
|
2222
|
+
void kreuzberg_result_pool_reset(struct ResultPool *pool);
|
|
2223
|
+
|
|
2224
|
+
/**
|
|
2225
|
+
* Free result pool and all contained results.
|
|
2226
|
+
*
|
|
2227
|
+
* Releases all memory associated with the pool. All result pointers
|
|
2228
|
+
* obtained from this pool become invalid.
|
|
2229
|
+
*
|
|
2230
|
+
* # Arguments
|
|
2231
|
+
*
|
|
2232
|
+
* * `pool` - Pointer to result pool
|
|
2233
|
+
*
|
|
2234
|
+
* # Safety
|
|
2235
|
+
*
|
|
2236
|
+
* - `pool` must be a valid pointer returned by `kreuzberg_result_pool_new()`
|
|
2237
|
+
* - `pool` can be NULL (no-op)
|
|
2238
|
+
* - All result pointers from this pool become invalid after free
|
|
2239
|
+
* - Must not be called twice on same pool (double-free)
|
|
2240
|
+
* - Must not be called concurrently with other pool operations
|
|
2241
|
+
*
|
|
2242
|
+
* # Example (C)
|
|
2243
|
+
*
|
|
2244
|
+
* ```c
|
|
2245
|
+
* struct ResultPool *pool = kreuzberg_result_pool_new(100);
|
|
2246
|
+
* // Use pool...
|
|
2247
|
+
* kreuzberg_result_pool_free(pool);
|
|
2248
|
+
* pool = NULL; // Prevent double-free
|
|
2249
|
+
* ```
|
|
2250
|
+
*/
|
|
2251
|
+
void kreuzberg_result_pool_free(struct ResultPool *pool);
|
|
2252
|
+
|
|
2253
|
+
/**
|
|
2254
|
+
* Get statistics about pool usage and efficiency.
|
|
2255
|
+
*
|
|
2256
|
+
* Returns metrics about current pool state, allocation counts, and memory usage.
|
|
2257
|
+
*
|
|
2258
|
+
* # Arguments
|
|
2259
|
+
*
|
|
2260
|
+
* * `pool` - Pointer to result pool
|
|
2261
|
+
*
|
|
2262
|
+
* # Returns
|
|
2263
|
+
*
|
|
2264
|
+
* Statistics structure with current metrics, or zeroed structure on error.
|
|
2265
|
+
*
|
|
2266
|
+
* # Safety
|
|
2267
|
+
*
|
|
2268
|
+
* - `pool` must be a valid pointer returned by `kreuzberg_result_pool_new()`
|
|
2269
|
+
* - `pool` cannot be NULL
|
|
2270
|
+
*
|
|
2271
|
+
* # Example (C)
|
|
2272
|
+
*
|
|
2273
|
+
* ```c
|
|
2274
|
+
* CResultPoolStats stats = kreuzberg_result_pool_stats(pool);
|
|
2275
|
+
* printf("Pool: %zu/%zu results, %zu allocations, %zu bytes\n",
|
|
2276
|
+
* stats.current_count, stats.capacity,
|
|
2277
|
+
* stats.total_allocations, stats.estimated_memory_bytes);
|
|
2278
|
+
*
|
|
2279
|
+
* if (stats.growth_events > 0) {
|
|
2280
|
+
* printf("Warning: Pool grew %zu times (consider larger initial capacity)\n",
|
|
2281
|
+
* stats.growth_events);
|
|
2282
|
+
* }
|
|
2283
|
+
* ```
|
|
2284
|
+
*/
|
|
2285
|
+
struct CResultPoolStats kreuzberg_result_pool_stats(const struct ResultPool *pool);
|
|
2286
|
+
|
|
2287
|
+
/**
|
|
2288
|
+
* Extract file and store result in pool.
|
|
2289
|
+
*
|
|
2290
|
+
* Extracts document content and adds result to pool. Returns borrowed reference
|
|
2291
|
+
* to result that remains valid until pool is reset or freed.
|
|
2292
|
+
*
|
|
2293
|
+
* # Arguments
|
|
2294
|
+
*
|
|
2295
|
+
* * `file_path` - Null-terminated UTF-8 file path
|
|
2296
|
+
* * `config_json` - Optional JSON configuration string (NULL for defaults)
|
|
2297
|
+
* * `pool` - Pointer to result pool
|
|
2298
|
+
*
|
|
2299
|
+
* # Returns
|
|
2300
|
+
*
|
|
2301
|
+
* Borrowed pointer to extraction result view, or NULL on error (check `kreuzberg_last_error`).
|
|
2302
|
+
* Result remains valid until pool is reset or freed.
|
|
2303
|
+
*
|
|
2304
|
+
* # Safety
|
|
2305
|
+
*
|
|
2306
|
+
* - `file_path` must be valid null-terminated UTF-8 string
|
|
2307
|
+
* - `config_json` must be valid null-terminated UTF-8 if not NULL
|
|
2308
|
+
* - `pool` must be valid pointer returned by `kreuzberg_result_pool_new()`
|
|
2309
|
+
* - No argument may be NULL except `config_json`, which is optional
|
|
2310
|
+
* - Returned pointer is borrowed from pool (do not free separately)
|
|
2311
|
+
* - Returned pointer becomes invalid when pool is reset or freed
|
|
2312
|
+
*
|
|
2313
|
+
* # Example (C)
|
|
2314
|
+
*
|
|
2315
|
+
* ```c
|
|
2316
|
+
* struct ResultPool *pool = kreuzberg_result_pool_new(100);
|
|
2317
|
+
*
|
|
2318
|
+
* const CExtractionResultView* result = kreuzberg_extract_file_into_pool(
|
|
2319
|
+
* "document.pdf", NULL, pool
|
|
2320
|
+
* );
|
|
2321
|
+
*
|
|
2322
|
+
* if (result != NULL) {
|
|
2323
|
+
* // Access result fields
|
|
2324
|
+
* printf("Content length: %zu\n", result->content_len);
|
|
2325
|
+
* printf("MIME type: %.*s\n",
|
|
2326
|
+
* (int)result->mime_type_len,
|
|
2327
|
+
* result->mime_type_ptr);
|
|
2328
|
+
* }
|
|
2329
|
+
*
|
|
2330
|
+
* // Result remains valid until pool is reset/freed
|
|
2331
|
+
* kreuzberg_result_pool_free(pool);
|
|
2332
|
+
* ```
|
|
2333
|
+
*/
|
|
2334
|
+
const struct CExtractionResultView *kreuzberg_extract_file_into_pool(const char *file_path,
|
|
2335
|
+
const char *config_json,
|
|
2336
|
+
struct ResultPool *pool);
|
|
2337
|
+
|
|
2338
|
+
/**
|
|
2339
|
+
* Extract file into pool and get zero-copy view.
|
|
2340
|
+
*
|
|
2341
|
+
* Convenience function that combines extraction and view creation.
|
|
2342
|
+
* Equivalent to `kreuzberg_extract_file_into_pool()` followed by
|
|
2343
|
+
* `kreuzberg_get_result_view()`.
|
|
2344
|
+
*
|
|
2345
|
+
* # Arguments
|
|
2346
|
+
*
|
|
2347
|
+
* Same as `kreuzberg_extract_file_into_pool()`
|
|
2348
|
+
*
|
|
2349
|
+
* # Returns
|
|
2350
|
+
*
|
|
2351
|
+
* Zero-copy view of result, or zeroed view on error.
|
|
2352
|
+
*
|
|
2353
|
+
* # Safety
|
|
2354
|
+
*
|
|
2355
|
+
* Same requirements as `kreuzberg_extract_file_into_pool()`.
|
|
2356
|
+
* View is valid until pool is reset or freed.
|
|
2357
|
+
*/
|
|
2358
|
+
struct CExtractionResultView kreuzberg_extract_file_into_pool_view(const char *file_path,
|
|
2359
|
+
const char *config_json,
|
|
2360
|
+
struct ResultPool *pool);
|
|
2361
|
+
|
|
2362
|
+
/**
|
|
2363
|
+
* Get a zero-copy view of an extraction result.
|
|
2364
|
+
*
|
|
2365
|
+
* Creates a view structure with direct pointers to result data without allocation.
|
|
2366
|
+
* The view is valid only while the source `result` remains valid.
|
|
2367
|
+
*
|
|
2368
|
+
* # Arguments
|
|
2369
|
+
*
|
|
2370
|
+
* * `result` - Pointer to an ExtractionResult structure
|
|
2371
|
+
* * `out_view` - Pointer to a CExtractionResultView structure to populate
|
|
2372
|
+
*
|
|
2373
|
+
* # Returns
|
|
2374
|
+
*
|
|
2375
|
+
* 0 on success, -1 on error (check `kreuzberg_last_error`).
|
|
2376
|
+
*
|
|
2377
|
+
* # Safety
|
|
2378
|
+
*
|
|
2379
|
+
* - `result` must be a valid pointer to an ExtractionResult
|
|
2380
|
+
* - `out_view` must be a valid pointer to writable memory
|
|
2381
|
+
* - Neither parameter can be NULL
|
|
2382
|
+
* - The returned view is valid ONLY while `result` is not freed
|
|
2383
|
+
* - Caller MUST NOT use the view after calling `kreuzberg_result_free(result)`
|
|
2384
|
+
*
|
|
2385
|
+
* # Lifetime Safety
|
|
2386
|
+
*
|
|
2387
|
+
* ```text
|
|
2388
|
+
* ExtractionResult lifetime: |-------------------------------------|
|
|
2389
|
+
* View lifetime: |----------------------|
|
|
2390
|
+
* SAFE FREE → INVALID
|
|
2391
|
+
* ```
|
|
2392
|
+
*
|
|
2393
|
+
* # Example (C)
|
|
2394
|
+
*
|
|
2395
|
+
* ```c
|
|
2396
|
+
* ExtractionResult* result = kreuzberg_extract_file("document.pdf", NULL);
|
|
2397
|
+
* if (result != NULL) {
|
|
2398
|
+
* CExtractionResultView view;
|
|
2399
|
+
* if (kreuzberg_get_result_view(result, &view) == 0) {
|
|
2400
|
+
* // Direct access to content without copying
|
|
2401
|
+
* printf("Content length: %zu bytes\n", view.content_len);
|
|
2402
|
+
* printf("MIME type: %.*s\n", (int)view.mime_type_len, view.mime_type_ptr);
|
|
2403
|
+
* printf("Tables: %zu, Chunks: %zu\n", view.table_count, view.chunk_count);
|
|
2404
|
+
*
|
|
2405
|
+
* // No need to free the view (no allocations)
|
|
2406
|
+
* }
|
|
2407
|
+
*
|
|
2408
|
+
* kreuzberg_result_free(result); // After this, view is INVALID
|
|
2409
|
+
* }
|
|
2410
|
+
* ```
|
|
2411
|
+
*/
|
|
2412
|
+
int32_t kreuzberg_get_result_view(const ExtractionResult *result,
|
|
2413
|
+
struct CExtractionResultView *out_view);
|
|
2414
|
+
|
|
2415
|
+
/**
|
|
2416
|
+
* Get direct access to content from a result view.
|
|
2417
|
+
*
|
|
2418
|
+
* Helper function to retrieve content as a slice without copying.
|
|
2419
|
+
*
|
|
2420
|
+
* # Arguments
|
|
2421
|
+
*
|
|
2422
|
+
* * `view` - Pointer to a CExtractionResultView structure
|
|
2423
|
+
* * `out_ptr` - Pointer to receive the content pointer
|
|
2424
|
+
* * `out_len` - Pointer to receive the content length
|
|
2425
|
+
*
|
|
2426
|
+
* # Returns
|
|
2427
|
+
*
|
|
2428
|
+
* 0 on success, -1 on error (check `kreuzberg_last_error`).
|
|
2429
|
+
*
|
|
2430
|
+
* # Safety
|
|
2431
|
+
*
|
|
2432
|
+
* - `view` must be a valid pointer to a CExtractionResultView
|
|
2433
|
+
* - `out_ptr` and `out_len` must be valid writable pointers
|
|
2434
|
+
* - The returned content pointer is valid only while the source ExtractionResult is valid
|
|
2435
|
+
*
|
|
2436
|
+
* # Example (C)
|
|
2437
|
+
*
|
|
2438
|
+
* ```c
|
|
2439
|
+
* const uint8_t* content;
|
|
2440
|
+
* uintptr_t content_len;
|
|
2441
|
+
* if (kreuzberg_view_get_content(&view, &content, &content_len) == 0) {
|
|
2442
|
+
* // Process content directly without copying
|
|
2443
|
+
* fwrite(content, 1, content_len, stdout);
|
|
2444
|
+
* }
|
|
2445
|
+
* ```
|
|
2446
|
+
*/
|
|
2447
|
+
int32_t kreuzberg_view_get_content(const struct CExtractionResultView *view,
|
|
2448
|
+
const uint8_t **out_ptr,
|
|
2449
|
+
uintptr_t *out_len);
|
|
2450
|
+
|
|
2451
|
+
/**
|
|
2452
|
+
* Get direct access to MIME type from a result view.
|
|
2453
|
+
*
|
|
2454
|
+
* # Arguments
|
|
2455
|
+
*
|
|
2456
|
+
* * `view` - Pointer to a CExtractionResultView structure
|
|
2457
|
+
* * `out_ptr` - Pointer to receive the MIME type pointer
|
|
2458
|
+
* * `out_len` - Pointer to receive the MIME type length
|
|
2459
|
+
*
|
|
2460
|
+
* # Returns
|
|
2461
|
+
*
|
|
2462
|
+
* 0 on success, -1 on error (check `kreuzberg_last_error`).
|
|
2463
|
+
*
|
|
2464
|
+
* # Safety
|
|
2465
|
+
*
|
|
2466
|
+
* - `view` must be a valid pointer to a CExtractionResultView
|
|
2467
|
+
* - `out_ptr` and `out_len` must be valid writable pointers
|
|
2468
|
+
* - The returned MIME type pointer is valid only while the source ExtractionResult is valid
|
|
2469
|
+
*
|
|
2470
|
+
* # Example (C)
|
|
2471
|
+
*
|
|
2472
|
+
* ```c
|
|
2473
|
+
* const uint8_t* mime_type;
|
|
2474
|
+
* uintptr_t mime_len;
|
|
2475
|
+
* if (kreuzberg_view_get_mime_type(&view, &mime_type, &mime_len) == 0) {
|
|
2476
|
+
* printf("MIME: %.*s\n", (int)mime_len, mime_type);
|
|
2477
|
+
* }
|
|
2478
|
+
* ```
|
|
2479
|
+
*/
|
|
2480
|
+
int32_t kreuzberg_view_get_mime_type(const struct CExtractionResultView *view,
|
|
2481
|
+
const uint8_t **out_ptr,
|
|
2482
|
+
uintptr_t *out_len);
|
|
2483
|
+
|
|
2484
|
+
/**
|
|
2485
|
+
* Intern a string and return pointer to shared C string.
|
|
2486
|
+
*
|
|
2487
|
+
* If the string has already been interned, returns pointer to existing allocation.
|
|
2488
|
+
* Otherwise, creates new allocation. Pointer remains valid until all references
|
|
2489
|
+
* are freed with `kreuzberg_free_interned_string()`.
|
|
2490
|
+
*
|
|
2491
|
+
* # Arguments
|
|
2492
|
+
*
|
|
2493
|
+
* * `s` - Null-terminated UTF-8 string to intern
|
|
2494
|
+
*
|
|
2495
|
+
* # Returns
|
|
2496
|
+
*
|
|
2497
|
+
* Pointer to interned C string, or NULL on error (invalid UTF-8, allocation failure).
|
|
2498
|
+
* Caller must eventually free with `kreuzberg_free_interned_string()`.
|
|
2499
|
+
*
|
|
2500
|
+
* # Reference Counting
|
|
2501
|
+
*
|
|
2502
|
+
* Multiple calls with the same string return the same pointer but increment
|
|
2503
|
+
* an internal reference count. The string is freed only when all references
|
|
2504
|
+
* are released.
|
|
2505
|
+
*
|
|
2506
|
+
* # Thread Safety
|
|
2507
|
+
*
|
|
2508
|
+
* Thread-safe. Multiple threads can call concurrently.
|
|
2509
|
+
*
|
|
2510
|
+
* # Safety
|
|
2511
|
+
*
|
|
2512
|
+
* - `s` must be valid null-terminated UTF-8 string
|
|
2513
|
+
* - `s` cannot be NULL
|
|
2514
|
+
* - Returned pointer must not be modified
|
|
2515
|
+
* - Caller must call `kreuzberg_free_interned_string()` for each `kreuzberg_intern_string()` call
|
|
2516
|
+
*
|
|
2517
|
+
* # Example (C)
|
|
2518
|
+
*
|
|
2519
|
+
* ```c
|
|
2520
|
+
* const char* mime1 = kreuzberg_intern_string("application/pdf");
|
|
2521
|
+
* const char* mime2 = kreuzberg_intern_string("application/pdf");
|
|
2522
|
+
*
|
|
2523
|
+
* // Same string = same pointer (memory shared)
|
|
2524
|
+
* assert(mime1 == mime2);
|
|
2525
|
+
*
|
|
2526
|
+
* // Free each reference
|
|
2527
|
+
* kreuzberg_free_interned_string(mime1);
|
|
2528
|
+
* kreuzberg_free_interned_string(mime2);
|
|
2529
|
+
* ```
|
|
2530
|
+
*/
|
|
2531
|
+
const char *kreuzberg_intern_string(const char *s);
|
|
2532
|
+
|
|
2533
|
+
/**
|
|
2534
|
+
* Free an interned string reference.
|
|
2535
|
+
*
|
|
2536
|
+
* Decrements reference count for the interned string. If reference count
|
|
2537
|
+
* reaches zero, the string is freed from the intern table.
|
|
2538
|
+
*
|
|
2539
|
+
* # Arguments
|
|
2540
|
+
*
|
|
2541
|
+
* * `s` - Pointer returned by `kreuzberg_intern_string()`
|
|
2542
|
+
*
|
|
2543
|
+
* # Safety
|
|
2544
|
+
*
|
|
2545
|
+
* - `s` must be a pointer returned by `kreuzberg_intern_string()`
|
|
2546
|
+
* - `s` can be NULL (no-op)
|
|
2547
|
+
* - Must not be called more times on a pointer than `kreuzberg_intern_string()` returned it (an extra call is a double-free)
|
|
2548
|
+
* - Pointer becomes invalid after last reference is freed
|
|
2549
|
+
*
|
|
2550
|
+
* # Example (C)
|
|
2551
|
+
*
|
|
2552
|
+
* ```c
|
|
2553
|
+
* const char* mime = kreuzberg_intern_string("application/pdf");
|
|
2554
|
+
* // Use mime...
|
|
2555
|
+
* kreuzberg_free_interned_string(mime);
|
|
2556
|
+
* // Don't use mime after this point
|
|
2557
|
+
* ```
|
|
2558
|
+
*/
|
|
2559
|
+
void kreuzberg_free_interned_string(const char *s);
|
|
2560
|
+
|
|
2561
|
+
/**
|
|
2562
|
+
* Get statistics about string interning efficiency.
|
|
2563
|
+
*
|
|
2564
|
+
* Returns metrics about unique strings, cache hits/misses, and memory savings.
|
|
2565
|
+
*
|
|
2566
|
+
* # Returns
|
|
2567
|
+
*
|
|
2568
|
+
* Statistics structure with current metrics.
|
|
2569
|
+
*
|
|
2570
|
+
* # Example (C)
|
|
2571
|
+
*
|
|
2572
|
+
* ```c
|
|
2573
|
+
* CStringInternStats stats = kreuzberg_string_intern_stats();
|
|
2574
|
+
* printf("Interned: %zu unique strings\n", stats.unique_count);
|
|
2575
|
+
* printf("Requests: %zu total (%zu hits, %zu misses)\n",
|
|
2576
|
+
* stats.total_requests, stats.cache_hits, stats.cache_misses);
|
|
2577
|
+
* printf("Memory saved: %zu bytes\n", stats.estimated_memory_saved);
|
|
2578
|
+
* printf("Hit rate: %.1f%%\n",
|
|
2579
|
+
* stats.total_requests ? 100.0 * stats.cache_hits / stats.total_requests : 0.0);
|
|
2580
|
+
* ```
|
|
2581
|
+
*/
|
|
2582
|
+
struct CStringInternStats kreuzberg_string_intern_stats(void);
|
|
2583
|
+
|
|
2584
|
+
/**
|
|
2585
|
+
* Reset the intern table, freeing all interned strings.
|
|
2586
|
+
*
|
|
2587
|
+
* **WARNING**: This invalidates all pointers returned by `kreuzberg_intern_string()`.
|
|
2588
|
+
* Only use during shutdown or testing.
|
|
2589
|
+
*
|
|
2590
|
+
* # Safety
|
|
2591
|
+
*
|
|
2592
|
+
* - Must not be called while any interned string pointers are in use
|
|
2593
|
+
* - All existing interned pointers become invalid
|
|
2594
|
+
* - Safe to call from any thread, but a reset that overlaps intern operations on other threads can invalidate the pointers those calls return
|
|
2595
|
+
*/
|
|
2596
|
+
void kreuzberg_string_intern_reset(void);
|
|
2597
|
+
|
|
2598
|
+
/**
|
|
2599
|
+
* Get the last error message from a failed operation.
|
|
2600
|
+
*
|
|
2601
|
+
* # Safety
|
|
2602
|
+
*
|
|
2603
|
+
* - Returns a library-owned string; the caller does not need to (and should not) free it
|
|
2604
|
+
* - Returns NULL if no error has occurred
|
|
2605
|
+
* - The returned string is valid until the next Kreuzberg function call on the same thread
|
|
2606
|
+
*
|
|
2607
|
+
* # Example (C)
|
|
2608
|
+
*
|
|
2609
|
+
* ```c
|
|
2610
|
+
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
2611
|
+
* if (result == NULL) {
|
|
2612
|
+
* const char* error = kreuzberg_last_error();
|
|
2613
|
+
* printf("Error: %s\n", error != NULL ? error : "(no error message)");
|
|
2614
|
+
* }
|
|
2615
|
+
* ```
|
|
2616
|
+
*/
|
|
2617
|
+
const char *kreuzberg_last_error(void);
|
|
2618
|
+
|
|
2619
|
+
/**
|
|
2620
|
+
* Get the error code for the last error.
|
|
2621
|
+
*
|
|
2622
|
+
* Returns the error code as an `int32_t`. Error codes are defined in the ErrorCode enum:
|
|
2623
|
+
* - 0: Success (no error)
|
|
2624
|
+
* - 1: GenericError
|
|
2625
|
+
* - 2: Panic
|
|
2626
|
+
* - 3: InvalidArgument
|
|
2627
|
+
* - 4: IoError
|
|
2628
|
+
* - 5: ParsingError
|
|
2629
|
+
* - 6: OcrError
|
|
2630
|
+
* - 7: MissingDependency
|
|
2631
|
+
*
|
|
2632
|
+
* # Safety
|
|
2633
|
+
*
|
|
2634
|
+
* This function is thread-safe and always safe to call.
|
|
2635
|
+
*
|
|
2636
|
+
* # Example (C)
|
|
2637
|
+
*
|
|
2638
|
+
* ```c
|
|
2639
|
+
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
2640
|
+
* if (result == NULL) {
|
|
2641
|
+
* int32_t code = kreuzberg_last_error_code();
|
|
2642
|
+
* if (code == 2) {
|
|
2643
|
+
* // A panic occurred
|
|
2644
|
+
* }
|
|
2645
|
+
* }
|
|
2646
|
+
* ```
|
|
2647
|
+
*/
|
|
2648
|
+
int32_t kreuzberg_last_error_code(void);
|
|
2649
|
+
|
|
2650
|
+
/**
|
|
2651
|
+
* Get the panic context for the last error (if it was a panic).
|
|
2652
|
+
*
|
|
2653
|
+
* Returns a JSON object with panic details including:
|
|
2654
|
+
* - file: Source file where panic occurred
|
|
2655
|
+
* - line: Line number in source file
|
|
2656
|
+
* - function: Name of the function that panicked
|
|
2657
|
+
* - message: Panic message
|
|
2658
|
+
* - timestamp_secs: Unix timestamp when panic occurred
|
|
2659
|
+
*
|
|
2660
|
+
* # Safety
|
|
2661
|
+
*
|
|
2662
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
2663
|
+
* - Returns NULL if the last error was not a panic or no error has occurred
|
|
2664
|
+
*
|
|
2665
|
+
* # Example (C)
|
|
2666
|
+
*
|
|
2667
|
+
* ```c
|
|
2668
|
+
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
2669
|
+
* if (result == NULL && kreuzberg_last_error_code() == 2) {
|
|
2670
|
+
* char* context = kreuzberg_last_panic_context();
|
|
2671
|
+
* if (context != NULL) {
|
|
2672
|
+
* printf("Panic context: %s\n", context);
|
|
2673
|
+
* kreuzberg_free_string(context);
|
|
2674
|
+
* }
|
|
2675
|
+
* }
|
|
2676
|
+
* ```
|
|
2677
|
+
*/
|
|
2678
|
+
char *kreuzberg_last_panic_context(void);
|
|
2679
|
+
|
|
2680
|
+
/**
|
|
2681
|
+
* Get the library version string.
|
|
2682
|
+
*
|
|
2683
|
+
* # Safety
|
|
2684
|
+
*
|
|
2685
|
+
* - Returns a static string that does not need to be freed
|
|
2686
|
+
* - The returned string is always valid
|
|
2687
|
+
*
|
|
2688
|
+
* # Example (C)
|
|
2689
|
+
*
|
|
2690
|
+
* ```c
|
|
2691
|
+
* const char* version = kreuzberg_version();
|
|
2692
|
+
* printf("Kreuzberg version: %s\n", version);
|
|
2693
|
+
* ```
|
|
2694
|
+
*/
|
|
2695
|
+
const char *kreuzberg_version(void);
|
|
2696
|
+
|
|
2697
|
+
/**
|
|
2698
|
+
* Validates a binarization method string.
|
|
2699
|
+
*
|
|
2700
|
+
* # Arguments
|
|
2701
|
+
*
|
|
2702
|
+
* * `method` - C string containing the binarization method (e.g., "otsu", "adaptive", "sauvola")
|
|
2703
|
+
*
|
|
2704
|
+
* # Returns
|
|
2705
|
+
*
|
|
2706
|
+
* - `1` if valid
|
|
2707
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2708
|
+
*
|
|
2709
|
+
* # Safety
|
|
2710
|
+
*
|
|
2711
|
+
* * `method` must be a valid pointer to a null-terminated UTF-8 string
|
|
2712
|
+
* * `method` cannot be NULL
|
|
2713
|
+
* * The string must be valid for the duration of the call
|
|
2714
|
+
*
|
|
2715
|
+
* # C Signature
|
|
2716
|
+
*
|
|
2717
|
+
* ```c
|
|
2718
|
+
* int32_t kreuzberg_validate_binarization_method(const char* method);
|
|
2719
|
+
* ```
|
|
2720
|
+
*/
|
|
2721
|
+
int32_t kreuzberg_validate_binarization_method(const char *method);
|
|
2722
|
+
|
|
2723
|
+
/**
|
|
2724
|
+
* Validates an OCR backend string.
|
|
2725
|
+
*
|
|
2726
|
+
* # Arguments
|
|
2727
|
+
*
|
|
2728
|
+
* * `backend` - C string containing the OCR backend (e.g., "tesseract", "easyocr", "paddleocr")
|
|
2729
|
+
*
|
|
2730
|
+
* # Returns
|
|
2731
|
+
*
|
|
2732
|
+
* - `1` if valid
|
|
2733
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2734
|
+
*
|
|
2735
|
+
* # Safety
|
|
2736
|
+
*
|
|
2737
|
+
* * `backend` must be a valid pointer to a null-terminated UTF-8 string
|
|
2738
|
+
* * `backend` cannot be NULL
|
|
2739
|
+
* * The string must be valid for the duration of the call
|
|
2740
|
+
*
|
|
2741
|
+
* # C Signature
|
|
2742
|
+
*
|
|
2743
|
+
* ```c
|
|
2744
|
+
* int32_t kreuzberg_validate_ocr_backend(const char* backend);
|
|
2745
|
+
* ```
|
|
2746
|
+
*/
|
|
2747
|
+
int32_t kreuzberg_validate_ocr_backend(const char *backend);
|
|
2748
|
+
|
|
2749
|
+
/**
|
|
2750
|
+
* Validates a language code (ISO 639-1 or 639-3 format).
|
|
2751
|
+
*
|
|
2752
|
+
* Accepts both 2-letter codes (e.g., "en", "de") and 3-letter codes (e.g., "eng", "deu").
|
|
2753
|
+
*
|
|
2754
|
+
* # Arguments
|
|
2755
|
+
*
|
|
2756
|
+
* * `code` - C string containing the language code
|
|
2757
|
+
*
|
|
2758
|
+
* # Returns
|
|
2759
|
+
*
|
|
2760
|
+
* - `1` if valid
|
|
2761
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2762
|
+
*
|
|
2763
|
+
* # Safety
|
|
2764
|
+
*
|
|
2765
|
+
* * `code` must be a valid pointer to a null-terminated UTF-8 string
|
|
2766
|
+
* * `code` cannot be NULL
|
|
2767
|
+
* * The string must be valid for the duration of the call
|
|
2768
|
+
*
|
|
2769
|
+
* # C Signature
|
|
2770
|
+
*
|
|
2771
|
+
* ```c
|
|
2772
|
+
* int32_t kreuzberg_validate_language_code(const char* code);
|
|
2773
|
+
* ```
|
|
2774
|
+
*/
|
|
2775
|
+
int32_t kreuzberg_validate_language_code(const char *code);
|
|
2776
|
+
|
|
2777
|
+
/**
|
|
2778
|
+
* Validates a token reduction level string.
|
|
2779
|
+
*
|
|
2780
|
+
* # Arguments
|
|
2781
|
+
*
|
|
2782
|
+
* * `level` - C string containing the token reduction level (e.g., "off", "light", "moderate")
|
|
2783
|
+
*
|
|
2784
|
+
* # Returns
|
|
2785
|
+
*
|
|
2786
|
+
* - `1` if valid
|
|
2787
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2788
|
+
*
|
|
2789
|
+
* # Safety
|
|
2790
|
+
*
|
|
2791
|
+
* * `level` must be a valid pointer to a null-terminated UTF-8 string
|
|
2792
|
+
* * `level` cannot be NULL
|
|
2793
|
+
* * The string must be valid for the duration of the call
|
|
2794
|
+
*
|
|
2795
|
+
* # C Signature
|
|
2796
|
+
*
|
|
2797
|
+
* ```c
|
|
2798
|
+
* int32_t kreuzberg_validate_token_reduction_level(const char* level);
|
|
2799
|
+
* ```
|
|
2800
|
+
*/
|
|
2801
|
+
int32_t kreuzberg_validate_token_reduction_level(const char *level);
|
|
2802
|
+
|
|
2803
|
+
/**
|
|
2804
|
+
* Validates a tesseract Page Segmentation Mode (PSM) value.
|
|
2805
|
+
*
|
|
2806
|
+
* # Arguments
|
|
2807
|
+
*
|
|
2808
|
+
* * `psm` - PSM value (valid range: 0-13)
|
|
2809
|
+
*
|
|
2810
|
+
* # Returns
|
|
2811
|
+
*
|
|
2812
|
+
* - `1` if valid
|
|
2813
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2814
|
+
*
|
|
2815
|
+
* # C Signature
|
|
2816
|
+
*
|
|
2817
|
+
* ```c
|
|
2818
|
+
* int32_t kreuzberg_validate_tesseract_psm(int32_t psm);
|
|
2819
|
+
* ```
|
|
2820
|
+
*/
|
|
2821
|
+
int32_t kreuzberg_validate_tesseract_psm(int32_t psm);
|
|
2822
|
+
|
|
2823
|
+
/**
|
|
2824
|
+
* Validates a tesseract OCR Engine Mode (OEM) value.
|
|
2825
|
+
*
|
|
2826
|
+
* # Arguments
|
|
2827
|
+
*
|
|
2828
|
+
* * `oem` - OEM value (valid range: 0-3)
|
|
2829
|
+
*
|
|
2830
|
+
* # Returns
|
|
2831
|
+
*
|
|
2832
|
+
* - `1` if valid
|
|
2833
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2834
|
+
*
|
|
2835
|
+
* # C Signature
|
|
2836
|
+
*
|
|
2837
|
+
* ```c
|
|
2838
|
+
* int32_t kreuzberg_validate_tesseract_oem(int32_t oem);
|
|
2839
|
+
* ```
|
|
2840
|
+
*/
|
|
2841
|
+
int32_t kreuzberg_validate_tesseract_oem(int32_t oem);
|
|
2842
|
+
|
|
2843
|
+
/**
|
|
2844
|
+
* Validates a tesseract output format string.
|
|
2845
|
+
*
|
|
2846
|
+
* # Arguments
|
|
2847
|
+
*
|
|
2848
|
+
* * `format` - C string containing the output format (e.g., "text", "markdown")
|
|
2849
|
+
*
|
|
2850
|
+
* # Returns
|
|
2851
|
+
*
|
|
2852
|
+
* - `1` if valid
|
|
2853
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2854
|
+
*
|
|
2855
|
+
* # Safety
|
|
2856
|
+
*
|
|
2857
|
+
* * `format` must be a valid pointer to a null-terminated UTF-8 string
|
|
2858
|
+
* * `format` cannot be NULL
|
|
2859
|
+
* * The string must be valid for the duration of the call
|
|
2860
|
+
*
|
|
2861
|
+
* # C Signature
|
|
2862
|
+
*
|
|
2863
|
+
* ```c
|
|
2864
|
+
* int32_t kreuzberg_validate_output_format(const char* format);
|
|
2865
|
+
* ```
|
|
2866
|
+
*/
|
|
2867
|
+
int32_t kreuzberg_validate_output_format(const char *format);
|
|
2868
|
+
|
|
2869
|
+
/**
|
|
2870
|
+
* Validates a confidence threshold value.
|
|
2871
|
+
*
|
|
2872
|
+
* Confidence thresholds must be between 0.0 and 1.0 inclusive.
|
|
2873
|
+
*
|
|
2874
|
+
* # Arguments
|
|
2875
|
+
*
|
|
2876
|
+
* * `confidence` - Confidence threshold value
|
|
2877
|
+
*
|
|
2878
|
+
* # Returns
|
|
2879
|
+
*
|
|
2880
|
+
* - `1` if valid
|
|
2881
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2882
|
+
*
|
|
2883
|
+
* # C Signature
|
|
2884
|
+
*
|
|
2885
|
+
* ```c
|
|
2886
|
+
* int32_t kreuzberg_validate_confidence(double confidence);
|
|
2887
|
+
* ```
|
|
2888
|
+
*/
|
|
2889
|
+
int32_t kreuzberg_validate_confidence(double confidence);
|
|
2890
|
+
|
|
2891
|
+
/**
|
|
2892
|
+
* Validates a DPI (dots per inch) value.
|
|
2893
|
+
*
|
|
2894
|
+
* DPI must be a positive integer, typically 72-600.
|
|
2895
|
+
*
|
|
2896
|
+
* # Arguments
|
|
2897
|
+
*
|
|
2898
|
+
* * `dpi` - DPI value
|
|
2899
|
+
*
|
|
2900
|
+
* # Returns
|
|
2901
|
+
*
|
|
2902
|
+
* - `1` if valid
|
|
2903
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2904
|
+
*
|
|
2905
|
+
* # C Signature
|
|
2906
|
+
*
|
|
2907
|
+
* ```c
|
|
2908
|
+
* int32_t kreuzberg_validate_dpi(int32_t dpi);
|
|
2909
|
+
* ```
|
|
2910
|
+
*/
|
|
2911
|
+
int32_t kreuzberg_validate_dpi(int32_t dpi);
|
|
2912
|
+
|
|
2913
|
+
/**
|
|
2914
|
+
* Validates chunking parameters.
|
|
2915
|
+
*
|
|
2916
|
+
* Checks that `max_chars > 0` and `max_overlap < max_chars`.
|
|
2917
|
+
*
|
|
2918
|
+
* # Arguments
|
|
2919
|
+
*
|
|
2920
|
+
* * `max_chars` - Maximum characters per chunk
|
|
2921
|
+
* * `max_overlap` - Maximum overlap between chunks
|
|
2922
|
+
*
|
|
2923
|
+
* # Returns
|
|
2924
|
+
*
|
|
2925
|
+
* - `1` if valid
|
|
2926
|
+
* - `0` if invalid (error message available via `kreuzberg_get_last_error_message()`)
|
|
2927
|
+
*
|
|
2928
|
+
* # C Signature
|
|
2929
|
+
*
|
|
2930
|
+
* ```c
|
|
2931
|
+
* int32_t kreuzberg_validate_chunking_params(uintptr_t max_chars, uintptr_t max_overlap);
|
|
2932
|
+
* ```
|
|
2933
|
+
*/
|
|
2934
|
+
int32_t kreuzberg_validate_chunking_params(uintptr_t max_chars, uintptr_t max_overlap);
|
|
2935
|
+
|
|
2936
|
+
/**
|
|
2937
|
+
* Returns valid binarization methods as a JSON array string.
|
|
2938
|
+
*
|
|
2939
|
+
* The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
|
|
2940
|
+
*
|
|
2941
|
+
* # Returns
|
|
2942
|
+
*
|
|
2943
|
+
* A pointer to a dynamically allocated C string containing a JSON array of valid methods.
|
|
2944
|
+
* Returns NULL if memory allocation fails (error message retrievable via `kreuzberg_last_error()`).
|
|
2945
|
+
*
|
|
2946
|
+
* # Example
|
|
2947
|
+
*
|
|
2948
|
+
* The returned JSON string looks like: `["otsu","adaptive","sauvola"]`
|
|
2949
|
+
*
|
|
2950
|
+
* # C Signature
|
|
2951
|
+
*
|
|
2952
|
+
* ```c
|
|
2953
|
+
* char* kreuzberg_get_valid_binarization_methods(void);
|
|
2954
|
+
* ```
|
|
2955
|
+
*/
|
|
2956
|
+
char *kreuzberg_get_valid_binarization_methods(void);
|
|
2957
|
+
|
|
2958
|
+
/**
|
|
2959
|
+
* Returns valid language codes as a JSON array string.
|
|
2960
|
+
*
|
|
2961
|
+
* The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
|
|
2962
|
+
*
|
|
2963
|
+
* # Returns
|
|
2964
|
+
*
|
|
2965
|
+
* A pointer to a dynamically allocated C string containing a JSON array of valid codes.
|
|
2966
|
+
* Returns NULL if memory allocation fails (error message retrievable via `kreuzberg_last_error()`).
|
|
2967
|
+
*
|
|
2968
|
+
* # C Signature
|
|
2969
|
+
*
|
|
2970
|
+
* ```c
|
|
2971
|
+
* char* kreuzberg_get_valid_language_codes(void);
|
|
2972
|
+
* ```
|
|
2973
|
+
*/
|
|
2974
|
+
char *kreuzberg_get_valid_language_codes(void);
|
|
2975
|
+
|
|
2976
|
+
/**
|
|
2977
|
+
* Returns valid OCR backends as a JSON array string.
|
|
2978
|
+
*
|
|
2979
|
+
* The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
|
|
2980
|
+
*
|
|
2981
|
+
* # Returns
|
|
2982
|
+
*
|
|
2983
|
+
* A pointer to a dynamically allocated C string containing a JSON array of valid backends.
|
|
2984
|
+
* Returns NULL if memory allocation fails (error message retrievable via `kreuzberg_last_error()`).
|
|
2985
|
+
*
|
|
2986
|
+
* # C Signature
|
|
2987
|
+
*
|
|
2988
|
+
* ```c
|
|
2989
|
+
* char* kreuzberg_get_valid_ocr_backends(void);
|
|
2990
|
+
* ```
|
|
2991
|
+
*/
|
|
2992
|
+
char *kreuzberg_get_valid_ocr_backends(void);
|
|
2993
|
+
|
|
2994
|
+
/**
|
|
2995
|
+
* Returns valid token reduction levels as a JSON array string.
|
|
2996
|
+
*
|
|
2997
|
+
* The returned string MUST be freed by the caller using `kreuzberg_free_string()`.
|
|
2998
|
+
*
|
|
2999
|
+
* # Returns
|
|
3000
|
+
*
|
|
3001
|
+
* A pointer to a dynamically allocated C string containing a JSON array of valid levels.
|
|
3002
|
+
* Returns NULL if memory allocation fails (error message retrievable via `kreuzberg_last_error()`).
|
|
3003
|
+
*
|
|
3004
|
+
* # C Signature
|
|
3005
|
+
*
|
|
3006
|
+
* ```c
|
|
3007
|
+
* char* kreuzberg_get_valid_token_reduction_levels(void);
|
|
3008
|
+
* ```
|
|
3009
|
+
*/
|
|
3010
|
+
char *kreuzberg_get_valid_token_reduction_levels(void);
|
|
3011
|
+
|
|
3012
|
+
#endif /* KREUZBERG_FFI_H */
|