kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,1087 +0,0 @@
|
|
|
1
|
-
/* Auto-generated C bindings for Kreuzberg */
|
|
2
|
-
|
|
3
|
-
#ifndef KREUZBERG_FFI_H
|
|
4
|
-
#define KREUZBERG_FFI_H
|
|
5
|
-
|
|
6
|
-
#pragma once
|
|
7
|
-
|
|
8
|
-
/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */
|
|
9
|
-
|
|
10
|
-
#include <stdarg.h>
|
|
11
|
-
#include <stdbool.h>
|
|
12
|
-
#include <stdint.h>
|
|
13
|
-
#include <stdlib.h>
|
|
14
|
-
/**
|
|
15
|
-
* Opaque type for extraction configuration.
|
|
16
|
-
* This is an opaque pointer type - callers should not access its internals.
|
|
17
|
-
*/
|
|
18
|
-
typedef struct ExtractionConfig ExtractionConfig;
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
/**
|
|
22
|
-
* C-compatible extraction result structure
|
|
23
|
-
*
|
|
24
|
-
* Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
25
|
-
* Field order: 11 pointers (8 bytes each) + 1 bool + 7 bytes padding = 96 bytes total
|
|
26
|
-
*/
|
|
27
|
-
typedef struct CExtractionResult {
|
|
28
|
-
/**
|
|
29
|
-
* Extracted text content (null-terminated UTF-8 string, must be freed with kreuzberg_free_string)
|
|
30
|
-
*/
|
|
31
|
-
char *content;
|
|
32
|
-
/**
|
|
33
|
-
* Detected MIME type (null-terminated string, must be freed with kreuzberg_free_string)
|
|
34
|
-
*/
|
|
35
|
-
char *mime_type;
|
|
36
|
-
/**
|
|
37
|
-
* Document language (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
38
|
-
*/
|
|
39
|
-
char *language;
|
|
40
|
-
/**
|
|
41
|
-
* Document date (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
42
|
-
*/
|
|
43
|
-
char *date;
|
|
44
|
-
/**
|
|
45
|
-
* Document subject (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
46
|
-
*/
|
|
47
|
-
char *subject;
|
|
48
|
-
/**
|
|
49
|
-
* Tables as JSON array (null-terminated string, or NULL if no tables, must be freed with kreuzberg_free_string)
|
|
50
|
-
*/
|
|
51
|
-
char *tables_json;
|
|
52
|
-
/**
|
|
53
|
-
* Detected languages as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
54
|
-
*/
|
|
55
|
-
char *detected_languages_json;
|
|
56
|
-
/**
|
|
57
|
-
* Metadata as JSON object (null-terminated string, or NULL if no metadata, must be freed with kreuzberg_free_string)
|
|
58
|
-
*/
|
|
59
|
-
char *metadata_json;
|
|
60
|
-
/**
|
|
61
|
-
* Text chunks as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
62
|
-
*/
|
|
63
|
-
char *chunks_json;
|
|
64
|
-
/**
|
|
65
|
-
* Extracted images as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
66
|
-
*/
|
|
67
|
-
char *images_json;
|
|
68
|
-
/**
|
|
69
|
-
* Page structure as JSON object (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
70
|
-
*/
|
|
71
|
-
char *page_structure_json;
|
|
72
|
-
/**
|
|
73
|
-
* Whether extraction was successful
|
|
74
|
-
*/
|
|
75
|
-
bool success;
|
|
76
|
-
/**
|
|
77
|
-
* Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
|
|
78
|
-
*/
|
|
79
|
-
uint8_t _padding1[7];
|
|
80
|
-
} CExtractionResult;
|
|
81
|
-
|
|
82
|
-
/**
|
|
83
|
-
* C-compatible structure for batch extraction results
|
|
84
|
-
*
|
|
85
|
-
* Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
86
|
-
* Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 bool + 7 bytes padding = 24 bytes total
|
|
87
|
-
*/
|
|
88
|
-
typedef struct CBatchResult {
|
|
89
|
-
/**
|
|
90
|
-
* Array of extraction results
|
|
91
|
-
*/
|
|
92
|
-
struct CExtractionResult **results;
|
|
93
|
-
/**
|
|
94
|
-
* Number of results
|
|
95
|
-
*/
|
|
96
|
-
uintptr_t count;
|
|
97
|
-
/**
|
|
98
|
-
* Whether batch operation was successful
|
|
99
|
-
*/
|
|
100
|
-
bool success;
|
|
101
|
-
/**
|
|
102
|
-
* Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
|
|
103
|
-
*/
|
|
104
|
-
uint8_t _padding2[7];
|
|
105
|
-
} CBatchResult;
|
|
106
|
-
|
|
107
|
-
/**
|
|
108
|
-
* C-compatible structure for passing byte array with MIME type in batch operations
|
|
109
|
-
*
|
|
110
|
-
* Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
111
|
-
* Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 pointer (8 bytes) = 24 bytes total
|
|
112
|
-
*/
|
|
113
|
-
typedef struct CBytesWithMime {
|
|
114
|
-
/**
|
|
115
|
-
* Pointer to byte data
|
|
116
|
-
*/
|
|
117
|
-
const uint8_t *data;
|
|
118
|
-
/**
|
|
119
|
-
* Length of byte data
|
|
120
|
-
*/
|
|
121
|
-
uintptr_t data_len;
|
|
122
|
-
/**
|
|
123
|
-
* MIME type as null-terminated C string
|
|
124
|
-
*/
|
|
125
|
-
const char *mime_type;
|
|
126
|
-
} CBytesWithMime;
|
|
127
|
-
|
|
128
|
-
/**
|
|
129
|
-
* Type alias for the OCR backend callback function.
|
|
130
|
-
*
|
|
131
|
-
* # Parameters
|
|
132
|
-
*
|
|
133
|
-
* - `image_bytes`: Pointer to image data
|
|
134
|
-
* - `image_length`: Length of image data in bytes
|
|
135
|
-
* - `config_json`: JSON-encoded OcrConfig (null-terminated string)
|
|
136
|
-
*
|
|
137
|
-
* # Returns
|
|
138
|
-
*
|
|
139
|
-
* Null-terminated string containing extracted text (must be freed by Rust via kreuzberg_free_string),
|
|
140
|
-
* or NULL on error.
|
|
141
|
-
*
|
|
142
|
-
* # Safety
|
|
143
|
-
*
|
|
144
|
-
* The callback must:
|
|
145
|
-
* - Not store the image_bytes pointer (it's only valid for the duration of the call)
|
|
146
|
-
* - Return a valid null-terminated UTF-8 string allocated by the caller
|
|
147
|
-
* - Return NULL on error (error message should be retrievable separately)
|
|
148
|
-
*/
|
|
149
|
-
typedef char *(*OcrBackendCallback)(const uint8_t *image_bytes,
|
|
150
|
-
uintptr_t image_length,
|
|
151
|
-
const char *config_json);
|
|
152
|
-
|
|
153
|
-
/**
|
|
154
|
-
* Type alias for the PostProcessor callback function.
|
|
155
|
-
*
|
|
156
|
-
* # Parameters
|
|
157
|
-
*
|
|
158
|
-
* - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
|
|
159
|
-
*
|
|
160
|
-
* # Returns
|
|
161
|
-
*
|
|
162
|
-
* Null-terminated JSON string containing the processed ExtractionResult
|
|
163
|
-
* (must be freed by Rust via kreuzberg_free_string), or NULL on error.
|
|
164
|
-
*
|
|
165
|
-
* # Safety
|
|
166
|
-
*
|
|
167
|
-
* The callback must:
|
|
168
|
-
* - Not store the result_json pointer (it's only valid for the duration of the call)
|
|
169
|
-
* - Return a valid null-terminated UTF-8 JSON string allocated by the caller
|
|
170
|
-
* - Return NULL on error (error message should be retrievable separately)
|
|
171
|
-
*/
|
|
172
|
-
typedef char *(*PostProcessorCallback)(const char *result_json);
|
|
173
|
-
|
|
174
|
-
/**
|
|
175
|
-
* Type alias for the DocumentExtractor callback function.
|
|
176
|
-
*
|
|
177
|
-
* # Parameters
|
|
178
|
-
*
|
|
179
|
-
* - `content`: Raw document bytes
|
|
180
|
-
* - `content_len`: Length of the content array
|
|
181
|
-
* - `mime_type`: MIME type of the document (null-terminated string)
|
|
182
|
-
* - `config_json`: JSON-encoded ExtractionConfig (null-terminated string)
|
|
183
|
-
*
|
|
184
|
-
* # Returns
|
|
185
|
-
*
|
|
186
|
-
* Null-terminated JSON string containing the ExtractionResult, or NULL on error.
|
|
187
|
-
* The returned string must be freeable by kreuzberg_free_string.
|
|
188
|
-
*
|
|
189
|
-
* # Safety
|
|
190
|
-
*
|
|
191
|
-
* The callback must:
|
|
192
|
-
* - Not store the content, mime_type, or config_json pointers (only valid during the call)
|
|
193
|
-
* - Return a valid null-terminated UTF-8 JSON string or NULL on error
|
|
194
|
-
* - The returned string must be freeable by kreuzberg_free_string
|
|
195
|
-
*/
|
|
196
|
-
typedef char *(*DocumentExtractorCallback)(const uint8_t *content,
|
|
197
|
-
uintptr_t content_len,
|
|
198
|
-
const char *mime_type,
|
|
199
|
-
const char *config_json);
|
|
200
|
-
|
|
201
|
-
/**
|
|
202
|
-
* Type alias for the Validator callback function.
|
|
203
|
-
*
|
|
204
|
-
* # Parameters
|
|
205
|
-
*
|
|
206
|
-
* - `result_json`: JSON-encoded ExtractionResult (null-terminated string)
|
|
207
|
-
*
|
|
208
|
-
* # Returns
|
|
209
|
-
*
|
|
210
|
-
* Null-terminated error message string if validation fails (must be freed by Rust
|
|
211
|
-
* via kreuzberg_free_string), or NULL if validation passes.
|
|
212
|
-
*
|
|
213
|
-
* # Safety
|
|
214
|
-
*
|
|
215
|
-
* The callback must:
|
|
216
|
-
* - Not store the result_json pointer (it's only valid for the duration of the call)
|
|
217
|
-
* - Return a valid null-terminated UTF-8 string (error message) if validation fails
|
|
218
|
-
* - Return NULL if validation passes
|
|
219
|
-
* - The returned string must be freeable by kreuzberg_free_string
|
|
220
|
-
*/
|
|
221
|
-
typedef char *(*ValidatorCallback)(const char *result_json);
|
|
222
|
-
|
|
223
|
-
/**
|
|
224
|
-
* Extract text and metadata from a file (synchronous).
|
|
225
|
-
*
|
|
226
|
-
* # Safety
|
|
227
|
-
*
|
|
228
|
-
* - `file_path` must be a valid null-terminated C string
|
|
229
|
-
* - The returned pointer must be freed with `kreuzberg_free_result`
|
|
230
|
-
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
231
|
-
*
|
|
232
|
-
* # Example (C)
|
|
233
|
-
*
|
|
234
|
-
* ```c
|
|
235
|
-
* const char* path = "/path/to/document.pdf";
|
|
236
|
-
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
237
|
-
* if (result != NULL && result->success) {
|
|
238
|
-
* printf("Content: %s\n", result->content);
|
|
239
|
-
* printf("MIME: %s\n", result->mime_type);
|
|
240
|
-
* kreuzberg_free_result(result);
|
|
241
|
-
* } else {
|
|
242
|
-
* const char* error = kreuzberg_last_error();
|
|
243
|
-
* printf("Error: %s\n", error);
|
|
244
|
-
* }
|
|
245
|
-
* ```
|
|
246
|
-
*/
|
|
247
|
-
struct CExtractionResult *kreuzberg_extract_file_sync(const char *file_path);
|
|
248
|
-
|
|
249
|
-
/**
|
|
250
|
-
* Detect MIME type from a file path.
|
|
251
|
-
*
|
|
252
|
-
* # Safety
|
|
253
|
-
*
|
|
254
|
-
* - `file_path` must be a valid null-terminated C string
|
|
255
|
-
* - The returned string must be freed with `kreuzberg_free_string`
|
|
256
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
257
|
-
*/
|
|
258
|
-
char *kreuzberg_detect_mime_type(const char *file_path, bool check_exists);
|
|
259
|
-
|
|
260
|
-
/**
|
|
261
|
-
* Validate that a MIME type is supported by Kreuzberg.
|
|
262
|
-
*
|
|
263
|
-
* # Safety
|
|
264
|
-
*
|
|
265
|
-
* - `mime_type` must be a valid null-terminated C string
|
|
266
|
-
* - The returned string must be freed with `kreuzberg_free_string`
|
|
267
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
268
|
-
*/
|
|
269
|
-
char *kreuzberg_validate_mime_type(const char *mime_type);
|
|
270
|
-
|
|
271
|
-
/**
|
|
272
|
-
* List available embedding preset names.
|
|
273
|
-
*
|
|
274
|
-
* # Safety
|
|
275
|
-
*
|
|
276
|
-
* - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
|
|
277
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
278
|
-
*/
|
|
279
|
-
char *kreuzberg_list_embedding_presets(void);
|
|
280
|
-
|
|
281
|
-
/**
|
|
282
|
-
* Get a specific embedding preset by name.
|
|
283
|
-
*
|
|
284
|
-
* # Safety
|
|
285
|
-
*
|
|
286
|
-
* - `name` must be a valid null-terminated C string
|
|
287
|
-
* - Returned string is a JSON object and must be freed with `kreuzberg_free_string`
|
|
288
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
289
|
-
*/
|
|
290
|
-
char *kreuzberg_get_embedding_preset(const char *name);
|
|
291
|
-
|
|
292
|
-
/**
|
|
293
|
-
* Extract text and metadata from a file with custom configuration (synchronous).
|
|
294
|
-
*
|
|
295
|
-
* # Safety
|
|
296
|
-
*
|
|
297
|
-
* - `file_path` must be a valid null-terminated C string
|
|
298
|
-
* - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
299
|
-
* - The returned pointer must be freed with `kreuzberg_free_result`
|
|
300
|
-
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
301
|
-
*
|
|
302
|
-
* # Example (C)
|
|
303
|
-
*
|
|
304
|
-
* ```c
|
|
305
|
-
* const char* path = "/path/to/document.pdf";
|
|
306
|
-
* const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
|
|
307
|
-
* CExtractionResult* result = kreuzberg_extract_file_sync_with_config(path, config);
|
|
308
|
-
* if (result != NULL && result->success) {
|
|
309
|
-
* printf("Content: %s\n", result->content);
|
|
310
|
-
* kreuzberg_free_result(result);
|
|
311
|
-
* }
|
|
312
|
-
* ```
|
|
313
|
-
*/
|
|
314
|
-
struct CExtractionResult *kreuzberg_extract_file_sync_with_config(const char *file_path,
|
|
315
|
-
const char *config_json);
|
|
316
|
-
|
|
317
|
-
/**
|
|
318
|
-
* Extract text and metadata from byte array (synchronous).
|
|
319
|
-
*
|
|
320
|
-
* # Safety
|
|
321
|
-
*
|
|
322
|
-
* - `data` must be a valid pointer to a byte array of length `data_len`
|
|
323
|
-
* - `mime_type` must be a valid null-terminated C string
|
|
324
|
-
* - The returned pointer must be freed with `kreuzberg_free_result`
|
|
325
|
-
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
326
|
-
*
|
|
327
|
-
* # Example (C)
|
|
328
|
-
*
|
|
329
|
-
* ```c
|
|
330
|
-
* const uint8_t* data = ...; // Document bytes
|
|
331
|
-
* size_t len = ...; // Length of data
|
|
332
|
-
* const char* mime = "application/pdf";
|
|
333
|
-
* CExtractionResult* result = kreuzberg_extract_bytes_sync(data, len, mime);
|
|
334
|
-
* if (result != NULL && result->success) {
|
|
335
|
-
* printf("Content: %s\n", result->content);
|
|
336
|
-
* kreuzberg_free_result(result);
|
|
337
|
-
* } else {
|
|
338
|
-
* const char* error = kreuzberg_last_error();
|
|
339
|
-
* printf("Error: %s\n", error);
|
|
340
|
-
* }
|
|
341
|
-
* ```
|
|
342
|
-
*/
|
|
343
|
-
struct CExtractionResult *kreuzberg_extract_bytes_sync(const uint8_t *data,
|
|
344
|
-
uintptr_t data_len,
|
|
345
|
-
const char *mime_type);
|
|
346
|
-
|
|
347
|
-
/**
|
|
348
|
-
* Extract text and metadata from byte array with custom configuration (synchronous).
|
|
349
|
-
*
|
|
350
|
-
* # Safety
|
|
351
|
-
*
|
|
352
|
-
* - `data` must be a valid pointer to a byte array of length `data_len`
|
|
353
|
-
* - `mime_type` must be a valid null-terminated C string
|
|
354
|
-
* - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
355
|
-
* - The returned pointer must be freed with `kreuzberg_free_result`
|
|
356
|
-
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
357
|
-
*
|
|
358
|
-
* # Example (C)
|
|
359
|
-
*
|
|
360
|
-
* ```c
|
|
361
|
-
* const uint8_t* data = ...; // Document bytes
|
|
362
|
-
* size_t len = ...; // Length of data
|
|
363
|
-
* const char* mime = "application/pdf";
|
|
364
|
-
* const char* config = "{\"force_ocr\": true, \"ocr\": {\"language\": \"deu\"}}";
|
|
365
|
-
* CExtractionResult* result = kreuzberg_extract_bytes_sync_with_config(data, len, mime, config);
|
|
366
|
-
* if (result != NULL && result->success) {
|
|
367
|
-
* printf("Content: %s\n", result->content);
|
|
368
|
-
* kreuzberg_free_result(result);
|
|
369
|
-
* }
|
|
370
|
-
* ```
|
|
371
|
-
*/
|
|
372
|
-
struct CExtractionResult *kreuzberg_extract_bytes_sync_with_config(const uint8_t *data,
|
|
373
|
-
uintptr_t data_len,
|
|
374
|
-
const char *mime_type,
|
|
375
|
-
const char *config_json);
|
|
376
|
-
|
|
377
|
-
/**
|
|
378
|
-
* Batch extract text and metadata from multiple files (synchronous).
|
|
379
|
-
*
|
|
380
|
-
* # Safety
|
|
381
|
-
*
|
|
382
|
-
* - `file_paths` must be a valid pointer to an array of null-terminated C strings
|
|
383
|
-
* - `count` must be the number of file paths in the array
|
|
384
|
-
* - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
385
|
-
* - The returned pointer must be freed with `kreuzberg_free_batch_result`
|
|
386
|
-
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
387
|
-
*/
|
|
388
|
-
struct CBatchResult *kreuzberg_batch_extract_files_sync(const char *const *file_paths,
|
|
389
|
-
uintptr_t count,
|
|
390
|
-
const char *config_json);
|
|
391
|
-
|
|
392
|
-
/**
|
|
393
|
-
* Batch extract text and metadata from multiple byte arrays (synchronous).
|
|
394
|
-
*
|
|
395
|
-
* # Safety
|
|
396
|
-
*
|
|
397
|
-
* - `items` must be a valid pointer to an array of CBytesWithMime structures
|
|
398
|
-
* - `count` must be the number of items in the array
|
|
399
|
-
* - `config_json` must be a valid null-terminated C string containing JSON, or NULL for default config
|
|
400
|
-
* - The returned pointer must be freed with `kreuzberg_free_batch_result`
|
|
401
|
-
* - Returns NULL on error (check `kreuzberg_last_error` for details)
|
|
402
|
-
*/
|
|
403
|
-
struct CBatchResult *kreuzberg_batch_extract_bytes_sync(const struct CBytesWithMime *items,
|
|
404
|
-
uintptr_t count,
|
|
405
|
-
const char *config_json);
|
|
406
|
-
|
|
407
|
-
/**
|
|
408
|
-
* Load an extraction configuration from a TOML/YAML/JSON file.
|
|
409
|
-
*
|
|
410
|
-
* # Safety
|
|
411
|
-
*
|
|
412
|
-
* - `file_path` must be a valid null-terminated C string
|
|
413
|
-
* - The returned string must be freed with `kreuzberg_free_string`
|
|
414
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
415
|
-
*/
|
|
416
|
-
char *kreuzberg_load_extraction_config_from_file(const char *file_path);
|
|
417
|
-
|
|
418
|
-
/**
|
|
419
|
-
* Free a batch result returned by batch extraction functions.
|
|
420
|
-
*
|
|
421
|
-
* # Safety
|
|
422
|
-
*
|
|
423
|
-
* - `batch_result` must be a pointer previously returned by a batch extraction function
|
|
424
|
-
* - `batch_result` can be NULL (no-op)
|
|
425
|
-
* - `batch_result` must not be used after this call
|
|
426
|
-
* - All results and strings within the batch result will be freed automatically
|
|
427
|
-
*/
|
|
428
|
-
void kreuzberg_free_batch_result(struct CBatchResult *batch_result);
|
|
429
|
-
|
|
430
|
-
/**
|
|
431
|
-
* Free a string returned by Kreuzberg functions.
|
|
432
|
-
*
|
|
433
|
-
* # Safety
|
|
434
|
-
*
|
|
435
|
-
* - `s` must be a string previously returned by a Kreuzberg function
|
|
436
|
-
* - `s` can be NULL (no-op)
|
|
437
|
-
* - `s` must not be used after this call
|
|
438
|
-
*
|
|
439
|
-
* # Example (C)
|
|
440
|
-
*
|
|
441
|
-
* ```c
|
|
442
|
-
* char* str = kreuzberg_clone_string("example"); // do not free fields of a result; kreuzberg_free_result owns them
|
|
443
|
-
* kreuzberg_free_string(str);
|
|
444
|
-
* // str is now invalid
|
|
445
|
-
* ```
|
|
446
|
-
*/
|
|
447
|
-
void kreuzberg_free_string(char *s);
|
|
448
|
-
|
|
449
|
-
/**
|
|
450
|
-
* Clone a null-terminated string using Rust's allocator.
|
|
451
|
-
*
|
|
452
|
-
* # Safety
|
|
453
|
-
*
|
|
454
|
-
* - `s` must be a valid null-terminated UTF-8 string
|
|
455
|
-
* - Returned pointer must be freed with `kreuzberg_free_string`
|
|
456
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
457
|
-
*/
|
|
458
|
-
char *kreuzberg_clone_string(const char *s);
|
|
459
|
-
|
|
460
|
-
/**
|
|
461
|
-
* Free an extraction result returned by a Kreuzberg extraction function (e.g. `kreuzberg_extract_file_sync`).
|
|
462
|
-
*
|
|
463
|
-
* # Safety
|
|
464
|
-
*
|
|
465
|
-
* - `result` must be a pointer previously returned by a Kreuzberg extraction function (e.g. `kreuzberg_extract_file_sync`)
|
|
466
|
-
* - `result` can be NULL (no-op)
|
|
467
|
-
* - `result` must not be used after this call
|
|
468
|
-
* - All string fields within the result will be freed automatically
|
|
469
|
-
*
|
|
470
|
-
* # Example (C)
|
|
471
|
-
*
|
|
472
|
-
* ```c
|
|
473
|
-
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
474
|
-
* // Use result...
|
|
475
|
-
* kreuzberg_free_result(result);
|
|
476
|
-
* // result is now invalid
|
|
477
|
-
* ```
|
|
478
|
-
*/
|
|
479
|
-
void kreuzberg_free_result(struct CExtractionResult *result);
|
|
480
|
-
|
|
481
|
-
/**
|
|
482
|
-
* Get the last error message from a failed operation.
|
|
483
|
-
*
|
|
484
|
-
* # Safety
|
|
485
|
-
*
|
|
486
|
-
* - Returns a static string that does not need to be freed
|
|
487
|
-
* - Returns NULL if no error has occurred
|
|
488
|
-
* - The returned string is valid until the next Kreuzberg function call on the same thread
|
|
489
|
-
*
|
|
490
|
-
* # Example (C)
|
|
491
|
-
*
|
|
492
|
-
* ```c
|
|
493
|
-
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
494
|
-
* if (result == NULL) {
|
|
495
|
-
* const char* error = kreuzberg_last_error();
|
|
496
|
-
* if (error != NULL) {
|
|
497
|
-
* printf("Error: %s\n", error);
|
|
498
|
-
* }
|
|
499
|
-
* }
|
|
500
|
-
* ```
|
|
501
|
-
*/
|
|
502
|
-
const char *kreuzberg_last_error(void);
|
|
503
|
-
|
|
504
|
-
/**
|
|
505
|
-
* Get the error code for the last error.
|
|
506
|
-
*
|
|
507
|
-
* Returns the error code as an int32_t. Error codes are defined in the ErrorCode enum:
|
|
508
|
-
* - 0: Success (no error)
|
|
509
|
-
* - 1: GenericError
|
|
510
|
-
* - 2: Panic
|
|
511
|
-
* - 3: InvalidArgument
|
|
512
|
-
* - 4: IoError
|
|
513
|
-
* - 5: ParsingError
|
|
514
|
-
* - 6: OcrError
|
|
515
|
-
* - 7: MissingDependency
|
|
516
|
-
*
|
|
517
|
-
* # Safety
|
|
518
|
-
*
|
|
519
|
-
* This function is thread-safe and always safe to call.
|
|
520
|
-
*
|
|
521
|
-
* # Example (C)
|
|
522
|
-
*
|
|
523
|
-
* ```c
|
|
524
|
-
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
525
|
-
* if (result == NULL) {
|
|
526
|
-
* int32_t code = kreuzberg_last_error_code();
|
|
527
|
-
* if (code == 2) {
|
|
528
|
-
* // A panic occurred
|
|
529
|
-
* }
|
|
530
|
-
* }
|
|
531
|
-
* ```
|
|
532
|
-
*/
|
|
533
|
-
int32_t kreuzberg_last_error_code(void);
|
|
534
|
-
|
|
535
|
-
/**
|
|
536
|
-
* Get the panic context for the last error (if it was a panic).
|
|
537
|
-
*
|
|
538
|
-
* Returns a JSON string containing panic context information, or NULL if
|
|
539
|
-
* the last error was not a panic.
|
|
540
|
-
*
|
|
541
|
-
* The JSON structure contains:
|
|
542
|
-
* - file: Source file where panic occurred
|
|
543
|
-
* - line: Line number
|
|
544
|
-
* - function: Function name
|
|
545
|
-
* - message: Panic message
|
|
546
|
-
* - timestamp_secs: Unix timestamp (seconds since epoch)
|
|
547
|
-
*
|
|
548
|
-
* # Safety
|
|
549
|
-
*
|
|
550
|
-
* The returned string must be freed with kreuzberg_free_string().
|
|
551
|
-
*
|
|
552
|
-
* # Example (C)
|
|
553
|
-
*
|
|
554
|
-
* ```c
|
|
555
|
-
* CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
556
|
-
* if (result == NULL && kreuzberg_last_error_code() == 2) {
|
|
557
|
-
* char* context = kreuzberg_last_panic_context();
|
|
558
|
-
* if (context != NULL) {
|
|
559
|
-
* printf("Panic context: %s\n", context);
|
|
560
|
-
* kreuzberg_free_string((char*)context);
|
|
561
|
-
* }
|
|
562
|
-
* }
|
|
563
|
-
* ```
|
|
564
|
-
*/
|
|
565
|
-
char *kreuzberg_last_panic_context(void);
|
|
566
|
-
|
|
567
|
-
/**
|
|
568
|
-
* Get the library version string.
|
|
569
|
-
*
|
|
570
|
-
* # Safety
|
|
571
|
-
*
|
|
572
|
-
* - Returns a static string that does not need to be freed
|
|
573
|
-
* - The returned string is always valid
|
|
574
|
-
*
|
|
575
|
-
* # Example (C)
|
|
576
|
-
*
|
|
577
|
-
* ```c
|
|
578
|
-
* const char* version = kreuzberg_version();
|
|
579
|
-
* printf("Kreuzberg version: %s\n", version);
|
|
580
|
-
* ```
|
|
581
|
-
*/
|
|
582
|
-
const char *kreuzberg_version(void);
|
|
583
|
-
|
|
584
|
-
/**
|
|
585
|
-
* Register a custom OCR backend via FFI callback.
|
|
586
|
-
*
|
|
587
|
-
* # Safety
|
|
588
|
-
*
|
|
589
|
-
* - `name` must be a valid null-terminated C string
|
|
590
|
-
* - `callback` must be a valid function pointer that:
|
|
591
|
-
* - Does not store the image_bytes pointer
|
|
592
|
-
* - Returns a null-terminated UTF-8 string or NULL on error
|
|
593
|
-
* - The returned string must be freeable by kreuzberg_free_string
|
|
594
|
-
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
595
|
-
*
|
|
596
|
-
* # Example (C)
|
|
597
|
-
*
|
|
598
|
-
* ```c
|
|
599
|
-
* char* my_ocr_backend(const uint8_t* image_bytes, size_t image_length, const char* config_json) {
|
|
600
|
-
* // Implement OCR logic here
|
|
601
|
-
* // Return allocated string with result, or NULL on error
|
|
602
|
-
* return kreuzberg_clone_string("Extracted text");
|
|
603
|
-
* }
|
|
604
|
-
*
|
|
605
|
-
* bool success = kreuzberg_register_ocr_backend("my-ocr", my_ocr_backend);
|
|
606
|
-
* if (!success) {
|
|
607
|
-
* const char* error = kreuzberg_last_error();
|
|
608
|
-
* printf("Failed to register: %s\n", error);
|
|
609
|
-
* }
|
|
610
|
-
* ```
|
|
611
|
-
*/
|
|
612
|
-
bool kreuzberg_register_ocr_backend(const char *name, OcrBackendCallback callback);
|
|
613
|
-
|
|
614
|
-
/**
|
|
615
|
-
* Register a custom OCR backend with explicit language support via FFI callback.
|
|
616
|
-
*
|
|
617
|
-
* # Safety
|
|
618
|
-
*
|
|
619
|
-
* - `languages_json` must be a null-terminated JSON array of language codes or NULL
|
|
620
|
-
* - See `kreuzberg_register_ocr_backend` for additional safety notes.
|
|
621
|
-
*/
|
|
622
|
-
bool kreuzberg_register_ocr_backend_with_languages(const char *name,
|
|
623
|
-
OcrBackendCallback callback,
|
|
624
|
-
const char *languages_json);
|
|
625
|
-
|
|
626
|
-
/**
|
|
627
|
-
* Register a custom PostProcessor via FFI callback.
|
|
628
|
-
*
|
|
629
|
-
* # Safety
|
|
630
|
-
*
|
|
631
|
-
* - `name` must be a valid null-terminated C string
|
|
632
|
-
* - `callback` must be a valid function pointer that:
|
|
633
|
-
* - Does not store the result_json pointer
|
|
634
|
-
* - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
635
|
-
* - The returned string must be freeable by kreuzberg_free_string
|
|
636
|
-
* - `priority` determines the order of execution (higher priority runs first)
|
|
637
|
-
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
638
|
-
*
|
|
639
|
-
* # Example (C)
|
|
640
|
-
*
|
|
641
|
-
* ```c
|
|
642
|
-
* char* my_post_processor(const char* result_json) {
|
|
643
|
-
* // Parse result_json, modify it, return JSON string
|
|
644
|
-
* return kreuzberg_clone_string("{\"content\":\"PROCESSED\"}");
|
|
645
|
-
* }
|
|
646
|
-
*
|
|
647
|
-
* bool success = kreuzberg_register_post_processor("my-processor", my_post_processor, 100);
|
|
648
|
-
* if (!success) {
|
|
649
|
-
* const char* error = kreuzberg_last_error();
|
|
650
|
-
* printf("Failed to register: %s\n", error);
|
|
651
|
-
* }
|
|
652
|
-
* ```
|
|
653
|
-
*/
|
|
654
|
-
bool kreuzberg_register_post_processor(const char *name,
|
|
655
|
-
PostProcessorCallback callback,
|
|
656
|
-
int32_t priority);
|
|
657
|
-
|
|
658
|
-
/**
|
|
659
|
-
* Register a custom PostProcessor with an explicit processing stage.
|
|
660
|
-
*
|
|
661
|
-
* # Safety
|
|
662
|
-
*
|
|
663
|
-
* - `name` must be a valid null-terminated C string
|
|
664
|
-
* - `stage` must be a valid null-terminated C string containing "early", "middle", or "late"
|
|
665
|
-
* - `callback` must be a valid function pointer that:
|
|
666
|
-
* - Does not store the result_json pointer
|
|
667
|
-
* - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
668
|
-
* - The returned string must be freeable by kreuzberg_free_string
|
|
669
|
-
* - `priority` determines the order of execution within the stage (higher priority runs first)
|
|
670
|
-
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
671
|
-
*/
|
|
672
|
-
bool kreuzberg_register_post_processor_with_stage(const char *name,
|
|
673
|
-
PostProcessorCallback callback,
|
|
674
|
-
int32_t priority,
|
|
675
|
-
const char *stage);
|
|
676
|
-
|
|
677
|
-
/**
|
|
678
|
-
* Unregister a PostProcessor by name.
|
|
679
|
-
*
|
|
680
|
-
* # Safety
|
|
681
|
-
*
|
|
682
|
-
* - `name` must be a valid null-terminated C string
|
|
683
|
-
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
684
|
-
*
|
|
685
|
-
* # Example (C)
|
|
686
|
-
*
|
|
687
|
-
* ```c
|
|
688
|
-
* bool success = kreuzberg_unregister_post_processor("my-processor");
|
|
689
|
-
* if (!success) {
|
|
690
|
-
* const char* error = kreuzberg_last_error();
|
|
691
|
-
* printf("Failed to unregister: %s\n", error);
|
|
692
|
-
* }
|
|
693
|
-
* ```
|
|
694
|
-
*/
|
|
695
|
-
bool kreuzberg_unregister_post_processor(const char *name);
|
|
696
|
-
|
|
697
|
-
/**
|
|
698
|
-
* Clear all registered PostProcessors.
|
|
699
|
-
*
|
|
700
|
-
* # Safety
|
|
701
|
-
*
|
|
702
|
-
* - Removes all registered processors. Subsequent extractions will run without them.
|
|
703
|
-
* - Returns true on success, false on error.
|
|
704
|
-
*/
|
|
705
|
-
bool kreuzberg_clear_post_processors(void);
|
|
706
|
-
|
|
707
|
-
/**
|
|
708
|
-
* List all registered PostProcessors as a JSON array of names.
|
|
709
|
-
*
|
|
710
|
-
* # Safety
|
|
711
|
-
*
|
|
712
|
-
* - Returned string must be freed with `kreuzberg_free_string`.
|
|
713
|
-
* - Returns NULL on error (check `kreuzberg_last_error`).
|
|
714
|
-
*/
|
|
715
|
-
char *kreuzberg_list_post_processors(void);
|
|
716
|
-
|
|
717
|
-
/**
|
|
718
|
-
* Register a custom DocumentExtractor via FFI callback.
|
|
719
|
-
*
|
|
720
|
-
* # Safety
|
|
721
|
-
*
|
|
722
|
-
* - `name` must be a valid null-terminated C string
|
|
723
|
-
* - `callback` must be a valid function pointer that:
|
|
724
|
-
* - Does not store the content, mime_type, or config_json pointers
|
|
725
|
-
* - Returns a null-terminated UTF-8 JSON string or NULL on error
|
|
726
|
-
* - The returned string must be freeable by kreuzberg_free_string
|
|
727
|
-
* - `mime_types` must be a valid null-terminated C string containing comma-separated MIME types
|
|
728
|
-
* - `priority` determines the order of selection (higher priority preferred)
|
|
729
|
-
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
730
|
-
*
|
|
731
|
-
* # Example (C)
|
|
732
|
-
*
|
|
733
|
-
* ```c
|
|
734
|
-
* char* my_extractor(const uint8_t* content, size_t len, const char* mime_type, const char* config) {
|
|
735
|
-
* // Extract content from bytes, return JSON ExtractionResult
|
|
736
|
-
* return kreuzberg_clone_string("{\"content\":\"extracted text\",\"mime_type\":\"text/plain\",\"metadata\":{}}");
|
|
737
|
-
* }
|
|
738
|
-
*
|
|
739
|
-
* bool success = kreuzberg_register_document_extractor(
|
|
740
|
-
* "my-extractor",
|
|
741
|
-
* my_extractor,
|
|
742
|
-
* "application/x-custom,text/x-custom",
|
|
743
|
-
* 100
|
|
744
|
-
* );
|
|
745
|
-
* if (!success) {
|
|
746
|
-
* const char* error = kreuzberg_last_error();
|
|
747
|
-
* printf("Failed to register: %s\n", error);
|
|
748
|
-
* }
|
|
749
|
-
* ```
|
|
750
|
-
*/
|
|
751
|
-
bool kreuzberg_register_document_extractor(const char *name,
|
|
752
|
-
DocumentExtractorCallback callback,
|
|
753
|
-
const char *mime_types,
|
|
754
|
-
int32_t priority);
|
|
755
|
-
|
|
756
|
-
/**
|
|
757
|
-
* Unregister a DocumentExtractor by name.
|
|
758
|
-
*
|
|
759
|
-
* # Safety
|
|
760
|
-
*
|
|
761
|
-
* - `name` must be a valid null-terminated C string
|
|
762
|
-
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
763
|
-
*
|
|
764
|
-
* # Example (C)
|
|
765
|
-
*
|
|
766
|
-
* ```c
|
|
767
|
-
* bool success = kreuzberg_unregister_document_extractor("my-extractor");
|
|
768
|
-
* if (!success) {
|
|
769
|
-
* const char* error = kreuzberg_last_error();
|
|
770
|
-
* printf("Failed to unregister: %s\n", error);
|
|
771
|
-
* }
|
|
772
|
-
* ```
|
|
773
|
-
*/
|
|
774
|
-
bool kreuzberg_unregister_document_extractor(const char *name);
|
|
775
|
-
|
|
776
|
-
/**
|
|
777
|
-
* List all registered DocumentExtractors as a JSON array of names.
|
|
778
|
-
*
|
|
779
|
-
* # Safety
|
|
780
|
-
*
|
|
781
|
-
* - Returned string must be freed with `kreuzberg_free_string`.
|
|
782
|
-
* - Returns NULL on error (check `kreuzberg_last_error`).
|
|
783
|
-
*/
|
|
784
|
-
char *kreuzberg_list_document_extractors(void);
|
|
785
|
-
|
|
786
|
-
/**
|
|
787
|
-
* Register a custom Validator via FFI callback.
|
|
788
|
-
*
|
|
789
|
-
* # Safety
|
|
790
|
-
*
|
|
791
|
-
* - `name` must be a valid null-terminated C string
|
|
792
|
-
* - `callback` must be a valid function pointer that:
|
|
793
|
-
* - Does not store the result_json pointer
|
|
794
|
-
* - Returns a null-terminated UTF-8 string (error message) if validation fails
|
|
795
|
-
* - Returns NULL if validation passes
|
|
796
|
-
* - The returned string must be freeable by kreuzberg_free_string
|
|
797
|
-
* - `priority` determines the order of validation (higher priority runs first)
|
|
798
|
-
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
799
|
-
*
|
|
800
|
-
* # Example (C)
|
|
801
|
-
*
|
|
802
|
-
* ```c
|
|
803
|
-
* char* my_validator(const char* result_json) {
|
|
804
|
-
* // Parse result_json, validate it
|
|
805
|
-
* // Return error message if validation fails, NULL if passes
|
|
806
|
-
* if (invalid) {
|
|
807
|
-
* return kreuzberg_clone_string("Validation failed: content too short");
|
|
808
|
-
* }
|
|
809
|
-
* return NULL;
|
|
810
|
-
* }
|
|
811
|
-
*
|
|
812
|
-
* bool success = kreuzberg_register_validator("my-validator", my_validator, 100);
|
|
813
|
-
* if (!success) {
|
|
814
|
-
* const char* error = kreuzberg_last_error();
|
|
815
|
-
* printf("Failed to register: %s\n", error);
|
|
816
|
-
* }
|
|
817
|
-
* ```
|
|
818
|
-
*/
|
|
819
|
-
bool kreuzberg_register_validator(const char *name, ValidatorCallback callback, int32_t priority);
|
|
820
|
-
|
|
821
|
-
/**
|
|
822
|
-
* Unregister a Validator by name.
|
|
823
|
-
*
|
|
824
|
-
* # Safety
|
|
825
|
-
*
|
|
826
|
-
* - `name` must be a valid null-terminated C string
|
|
827
|
-
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
828
|
-
*
|
|
829
|
-
* # Example (C)
|
|
830
|
-
*
|
|
831
|
-
* ```c
|
|
832
|
-
* bool success = kreuzberg_unregister_validator("my-validator");
|
|
833
|
-
* if (!success) {
|
|
834
|
-
* const char* error = kreuzberg_last_error();
|
|
835
|
-
* printf("Failed to unregister: %s\n", error);
|
|
836
|
-
* }
|
|
837
|
-
* ```
|
|
838
|
-
*/
|
|
839
|
-
bool kreuzberg_unregister_validator(const char *name);
|
|
840
|
-
|
|
841
|
-
/**
|
|
842
|
-
* Clear all registered Validators.
|
|
843
|
-
*
|
|
844
|
-
* # Safety
|
|
845
|
-
*
|
|
846
|
-
* - Removes all validators. Subsequent extractions will skip custom validation.
|
|
847
|
-
* - Returns true on success, false on error.
|
|
848
|
-
*/
|
|
849
|
-
bool kreuzberg_clear_validators(void);
|
|
850
|
-
|
|
851
|
-
/**
|
|
852
|
-
* List all registered Validators as a JSON array of names.
|
|
853
|
-
*
|
|
854
|
-
* # Safety
|
|
855
|
-
*
|
|
856
|
-
* - Returned string must be freed with `kreuzberg_free_string`.
|
|
857
|
-
* - Returns NULL on error (check `kreuzberg_last_error`).
|
|
858
|
-
*/
|
|
859
|
-
char *kreuzberg_list_validators(void);
|
|
860
|
-
|
|
861
|
-
/**
|
|
862
|
-
* Unregister an OCR backend by name.
|
|
863
|
-
*
|
|
864
|
-
* # Safety
|
|
865
|
-
*
|
|
866
|
-
* - `name` must be a valid null-terminated C string
|
|
867
|
-
* - Returns true on success, false on error (check kreuzberg_last_error)
|
|
868
|
-
*
|
|
869
|
-
* # Example (C)
|
|
870
|
-
*
|
|
871
|
-
* ```c
|
|
872
|
-
* bool success = kreuzberg_unregister_ocr_backend("custom-ocr");
|
|
873
|
-
* if (!success) {
|
|
874
|
-
* const char* error = kreuzberg_last_error();
|
|
875
|
-
* printf("Failed to unregister: %s\n", error);
|
|
876
|
-
* }
|
|
877
|
-
* ```
|
|
878
|
-
*/
|
|
879
|
-
bool kreuzberg_unregister_ocr_backend(const char *name);
|
|
880
|
-
|
|
881
|
-
/**
|
|
882
|
-
* List all registered OCR backends as a JSON array of names.
|
|
883
|
-
*
|
|
884
|
-
* # Safety
|
|
885
|
-
*
|
|
886
|
-
* - Returned string must be freed with `kreuzberg_free_string`.
|
|
887
|
-
* - Returns NULL on error (check `kreuzberg_last_error`).
|
|
888
|
-
*
|
|
889
|
-
* # Example (C)
|
|
890
|
-
*
|
|
891
|
-
* ```c
|
|
892
|
-
* char* backends = kreuzberg_list_ocr_backends();
|
|
893
|
-
* if (backends == NULL) {
|
|
894
|
-
* const char* error = kreuzberg_last_error();
|
|
895
|
-
* printf("Failed to list backends: %s\n", error);
|
|
896
|
-
* } else {
|
|
897
|
-
* printf("OCR backends: %s\n", backends);
|
|
898
|
-
* kreuzberg_free_string(backends);
|
|
899
|
-
* }
|
|
900
|
-
* ```
|
|
901
|
-
*/
|
|
902
|
-
char *kreuzberg_list_ocr_backends(void);
|
|
903
|
-
|
|
904
|
-
/**
|
|
905
|
-
* Clear all registered OCR backends.
|
|
906
|
-
*
|
|
907
|
-
* # Safety
|
|
908
|
-
*
|
|
909
|
-
* - Removes all registered OCR backends. Subsequent extractions will use only built-in backends.
|
|
910
|
-
* - Returns true on success, false on error.
|
|
911
|
-
*
|
|
912
|
-
* # Example (C)
|
|
913
|
-
*
|
|
914
|
-
* ```c
|
|
915
|
-
* bool success = kreuzberg_clear_ocr_backends();
|
|
916
|
-
* if (!success) {
|
|
917
|
-
* const char* error = kreuzberg_last_error();
|
|
918
|
-
* printf("Failed to clear OCR backends: %s\n", error);
|
|
919
|
-
* }
|
|
920
|
-
* ```
|
|
921
|
-
*/
|
|
922
|
-
bool kreuzberg_clear_ocr_backends(void);
|
|
923
|
-
|
|
924
|
-
/**
|
|
925
|
-
* Clear all registered DocumentExtractors.
|
|
926
|
-
*
|
|
927
|
-
* # Safety
|
|
928
|
-
*
|
|
929
|
-
* - Removes all registered extractors. Subsequent extractions will use only built-in extractors.
|
|
930
|
-
* - Returns true on success, false on error.
|
|
931
|
-
*
|
|
932
|
-
* # Example (C)
|
|
933
|
-
*
|
|
934
|
-
* ```c
|
|
935
|
-
* bool success = kreuzberg_clear_document_extractors();
|
|
936
|
-
* if (!success) {
|
|
937
|
-
* const char* error = kreuzberg_last_error();
|
|
938
|
-
* printf("Failed to clear document extractors: %s\n", error);
|
|
939
|
-
* }
|
|
940
|
-
* ```
|
|
941
|
-
*/
|
|
942
|
-
bool kreuzberg_clear_document_extractors(void);
|
|
943
|
-
|
|
944
|
-
/**
|
|
945
|
-
* Detect MIME type from raw bytes.
|
|
946
|
-
*
|
|
947
|
-
* # Safety
|
|
948
|
-
*
|
|
949
|
-
* - `bytes` must be a valid pointer to byte data
|
|
950
|
-
* - `len` must be the correct length of the byte array
|
|
951
|
-
* - The returned string must be freed with `kreuzberg_free_string`
|
|
952
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
953
|
-
*
|
|
954
|
-
* # Example (C)
|
|
955
|
-
*
|
|
956
|
-
* ```c
|
|
957
|
-
* const char* pdf_bytes = "%PDF-1.4\n";
|
|
958
|
-
* char* mime = kreuzberg_detect_mime_type_from_bytes((const uint8_t*)pdf_bytes, strlen(pdf_bytes));
|
|
959
|
-
* if (mime == NULL) {
|
|
960
|
-
* const char* error = kreuzberg_last_error();
|
|
961
|
-
* printf("Failed to detect MIME type: %s\n", error);
|
|
962
|
-
* } else {
|
|
963
|
-
* printf("MIME type: %s\n", mime);
|
|
964
|
-
* kreuzberg_free_string(mime);
|
|
965
|
-
* }
|
|
966
|
-
* ```
|
|
967
|
-
*/
|
|
968
|
-
char *kreuzberg_detect_mime_type_from_bytes(const uint8_t *bytes, uintptr_t len);
|
|
969
|
-
|
|
970
|
-
/**
|
|
971
|
-
* Detect MIME type from file path (checks extension and reads file content).
|
|
972
|
-
*
|
|
973
|
-
* # Safety
|
|
974
|
-
*
|
|
975
|
-
* - `file_path` must be a valid null-terminated C string
|
|
976
|
-
* - The returned string must be freed with `kreuzberg_free_string`
|
|
977
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
978
|
-
*
|
|
979
|
-
* # Example (C)
|
|
980
|
-
*
|
|
981
|
-
* ```c
|
|
982
|
-
* char* mime = kreuzberg_detect_mime_type_from_path("document.pdf");
|
|
983
|
-
* if (mime == NULL) {
|
|
984
|
-
* const char* error = kreuzberg_last_error();
|
|
985
|
-
* printf("Failed to detect MIME type: %s\n", error);
|
|
986
|
-
* } else {
|
|
987
|
-
* printf("MIME type: %s\n", mime);
|
|
988
|
-
* kreuzberg_free_string(mime);
|
|
989
|
-
* }
|
|
990
|
-
* ```
|
|
991
|
-
*/
|
|
992
|
-
char *kreuzberg_detect_mime_type_from_path(const char *file_path);
|
|
993
|
-
|
|
994
|
-
/**
|
|
995
|
-
* Get file extensions for a MIME type.
|
|
996
|
-
*
|
|
997
|
-
* # Safety
|
|
998
|
-
*
|
|
999
|
-
* - `mime_type` must be a valid null-terminated C string
|
|
1000
|
-
* - The returned string is a JSON array of extensions (must be freed with `kreuzberg_free_string`)
|
|
1001
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
1002
|
-
*
|
|
1003
|
-
* # Example (C)
|
|
1004
|
-
*
|
|
1005
|
-
* ```c
|
|
1006
|
-
* char* extensions = kreuzberg_get_extensions_for_mime("application/pdf");
|
|
1007
|
-
* if (extensions == NULL) {
|
|
1008
|
-
* const char* error = kreuzberg_last_error();
|
|
1009
|
-
* printf("Failed to get extensions: %s\n", error);
|
|
1010
|
-
* } else {
|
|
1011
|
-
* printf("Extensions: %s\n", extensions);
|
|
1012
|
-
* kreuzberg_free_string(extensions);
|
|
1013
|
-
* }
|
|
1014
|
-
* ```
|
|
1015
|
-
*/
|
|
1016
|
-
char *kreuzberg_get_extensions_for_mime(const char *mime_type);
|
|
1017
|
-
|
|
1018
|
-
/**
|
|
1019
|
-
* Load an ExtractionConfig from a file.
|
|
1020
|
-
*
|
|
1021
|
-
* Automatically detects the file format based on extension:
|
|
1022
|
-
* - `.toml` - TOML format
|
|
1023
|
-
* - `.yaml`, `.yml` - YAML format
|
|
1024
|
-
* - `.json` - JSON format
|
|
1025
|
-
*
|
|
1026
|
-
* # Safety
|
|
1027
|
-
*
|
|
1028
|
-
* - `path` must be a valid null-terminated C string representing a file path
|
|
1029
|
-
* - Returns a pointer to ExtractionConfig on success, NULL on error
|
|
1030
|
-
* - The returned config must be freed with `kreuzberg_free_config`
|
|
1031
|
-
* - Check `kreuzberg_last_error` on NULL return
|
|
1032
|
-
*
|
|
1033
|
-
* # Example (C)
|
|
1034
|
-
*
|
|
1035
|
-
* ```c
|
|
1036
|
-
* ExtractionConfig* config = kreuzberg_config_from_file("kreuzberg.toml");
|
|
1037
|
-
* if (config == NULL) {
|
|
1038
|
-
* const char* error = kreuzberg_last_error();
|
|
1039
|
-
* printf("Failed to load config: %s\n", error);
|
|
1040
|
-
* return 1;
|
|
1041
|
-
* }
|
|
1042
|
-
*
|
|
1043
|
-
* // Use config...
|
|
1044
|
-
* char* result = kreuzberg_extract_file_with_config_sync("document.pdf", config);
|
|
1045
|
-
*
|
|
1046
|
-
* kreuzberg_free_config(config);
|
|
1047
|
-
* ```
|
|
1048
|
-
*/
|
|
1049
|
-
ExtractionConfig *kreuzberg_config_from_file(const char *path);
|
|
1050
|
-
|
|
1051
|
-
/**
|
|
1052
|
-
* Discover and load an ExtractionConfig by searching parent directories.
|
|
1053
|
-
*
|
|
1054
|
-
* Searches the current directory and all parent directories for:
|
|
1055
|
-
* - `kreuzberg.toml`
|
|
1056
|
-
* - `kreuzberg.yaml`
|
|
1057
|
-
* - `kreuzberg.yml`
|
|
1058
|
-
* - `kreuzberg.json`
|
|
1059
|
-
*
|
|
1060
|
-
* Returns the first config file found as JSON, or NULL if none found.
|
|
1061
|
-
*
|
|
1062
|
-
* # Safety
|
|
1063
|
-
*
|
|
1064
|
-
* - The returned string must be freed with `kreuzberg_free_string`
|
|
1065
|
-
* - Returns NULL if no config found or on error (check `kreuzberg_last_error`)
|
|
1066
|
-
*
|
|
1067
|
-
* # Example (C)
|
|
1068
|
-
*
|
|
1069
|
-
* ```c
|
|
1070
|
-
* char* config_json = kreuzberg_config_discover();
|
|
1071
|
-
* if (config_json == NULL) {
|
|
1072
|
-
* const char* error = kreuzberg_last_error();
|
|
1073
|
-
* if (error != NULL && strlen(error) > 0) {
|
|
1074
|
-
* printf("Error discovering config: %s\n", error);
|
|
1075
|
-
* return 1;
|
|
1076
|
-
* }
|
|
1077
|
-
* // No config found, use defaults
|
|
1078
|
-
* printf("No config file found\n");
|
|
1079
|
-
* } else {
|
|
1080
|
-
* printf("Config: %s\n", config_json);
|
|
1081
|
-
* kreuzberg_free_string(config_json);
|
|
1082
|
-
* }
|
|
1083
|
-
* ```
|
|
1084
|
-
*/
|
|
1085
|
-
char *kreuzberg_config_discover(void);
|
|
1086
|
-
|
|
1087
|
-
#endif /* KREUZBERG_FFI_H */
|