kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,851 +0,0 @@
|
|
|
1
|
-
# kreuzberg-ffi
|
|
2
|
-
|
|
3
|
-
C Foreign Function Interface (FFI) bindings for the Kreuzberg document intelligence library.
|
|
4
|
-
|
|
5
|
-
## Overview
|
|
6
|
-
|
|
7
|
-
This crate provides a C-compatible API that bridges the high-performance Rust core of Kreuzberg with multiple programming languages and FFI systems. It is the foundation for language bindings in Java (Panama FFI), Go (cgo), C# (P/Invoke), and other languages with C interoperability.
|
|
8
|
-
|
|
9
|
-
The FFI exposes extraction functions, configuration management, plugin registration, and error handling through a stable C interface with thread-safe callbacks.
|
|
10
|
-
|
|
11
|
-
## Architecture
|
|
12
|
-
|
|
13
|
-
### FFI Bridge Layers
|
|
14
|
-
|
|
15
|
-
```
|
|
16
|
-
Language-Specific Bindings
|
|
17
|
-
↓
|
|
18
|
-
Kreuzberg FFI C Library (crates/kreuzberg-ffi) ← This crate
|
|
19
|
-
↓
|
|
20
|
-
Rust Core Library (crates/kreuzberg)
|
|
21
|
-
↓
|
|
22
|
-
Document Extraction Engines
|
|
23
|
-
```
|
|
24
|
-
|
|
25
|
-
### Binding Support
|
|
26
|
-
|
|
27
|
-
This FFI layer is consumed by:
|
|
28
|
-
|
|
29
|
-
- **Java** (packages/java): Using Java 25 Foreign Function & Memory API (Panama FFI)
|
|
30
|
-
- **Go** (packages/go): Using cgo wrapper bindings
|
|
31
|
-
- **C#** (packages/csharp): Using P/Invoke interop
|
|
32
|
-
- **Zig** and other C-compatible languages
|
|
33
|
-
|
|
34
|
-
### Key Components
|
|
35
|
-
|
|
36
|
-
- **Core Extraction** (`extract_file`, `extract_bytes`): Document text and data extraction
|
|
37
|
-
- **Batch Operations**: Parallel processing of multiple documents
|
|
38
|
-
- **MIME Detection**: File format identification
|
|
39
|
-
- **Configuration Management**: Loading and applying extraction settings
|
|
40
|
-
- **Plugin System**: OCR backend registration and callbacks
|
|
41
|
-
- **Error Handling**: Thread-local error message storage
|
|
42
|
-
- **Memory Management**: Safe pointer handling and FFI boundaries
|
|
43
|
-
|
|
44
|
-
## Installation
|
|
45
|
-
|
|
46
|
-
### Build from Source
|
|
47
|
-
|
|
48
|
-
```bash
|
|
49
|
-
cargo build --release -p kreuzberg-ffi
|
|
50
|
-
```
|
|
51
|
-
|
|
52
|
-
### Output Artifacts
|
|
53
|
-
|
|
54
|
-
After building, you will have:
|
|
55
|
-
|
|
56
|
-
- **Dynamic Library**: `target/release/libkreuzberg_ffi.{so,dylib,dll}`
|
|
57
|
-
- For loading at runtime
|
|
58
|
-
- Platform-specific extensions (`.so` Linux, `.dylib` macOS, `.dll` Windows)
|
|
59
|
-
|
|
60
|
-
- **Static Library**: `target/release/libkreuzberg_ffi.{a,lib}`
|
|
61
|
-
- For static linking into applications
|
|
62
|
-
- Platform-specific extensions (`.a` Unix, `.lib` Windows)
|
|
63
|
-
|
|
64
|
-
- **Header File**: Auto-generated via `cbindgen` during build
|
|
65
|
-
|
|
66
|
-
### Header Generation
|
|
67
|
-
|
|
68
|
-
The C header file is automatically generated during the build process via `cbindgen`:
|
|
69
|
-
|
|
70
|
-
```bash
|
|
71
|
-
cargo build --release -p kreuzberg-ffi
|
|
72
|
-
# Header is generated at build time based on #[no_mangle] functions
|
|
73
|
-
```
|
|
74
|
-
|
|
75
|
-
For manual header generation:
|
|
76
|
-
|
|
77
|
-
```bash
|
|
78
|
-
cargo build --features html,embeddings -p kreuzberg-ffi
|
|
79
|
-
```
|
|
80
|
-
|
|
81
|
-
### pkg-config File Generation
|
|
82
|
-
|
|
83
|
-
The build process automatically generates pkg-config files for library discovery:
|
|
84
|
-
|
|
85
|
-
```bash
|
|
86
|
-
cargo build --release -p kreuzberg-ffi
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
This creates two variants in `crates/kreuzberg-ffi/`:
|
|
90
|
-
- **kreuzberg-ffi.pc**: Development version (prefix points to repository)
|
|
91
|
-
- **kreuzberg-ffi-install.pc**: Installation version (prefix=/usr/local)
|
|
92
|
-
|
|
93
|
-
The development variant enables monorepo developers to use pkg-config:
|
|
94
|
-
|
|
95
|
-
```bash
|
|
96
|
-
export PKG_CONFIG_PATH="$PWD/crates/kreuzberg-ffi:$PKG_CONFIG_PATH"
|
|
97
|
-
pkg-config --cflags kreuzberg-ffi # Returns -I/path/to/repo/crates/kreuzberg-ffi
|
|
98
|
-
pkg-config --libs kreuzberg-ffi # Returns -L/path/to/repo/target/release -lkreuzberg_ffi
|
|
99
|
-
```
|
|
100
|
-
|
|
101
|
-
The installation variant is used in release artifacts for third-party use.
|
|
102
|
-
|
|
103
|
-
### Installing from Release Artifacts
|
|
104
|
-
|
|
105
|
-
Pre-built binaries are available for Linux, macOS, and Windows (MinGW) from the [releases page](https://github.com/kreuzberg-dev/kreuzberg/releases).
|
|
106
|
-
|
|
107
|
-
Each `go-ffi-{platform}.tar.gz` archive contains:
|
|
108
|
-
- `lib/`: Shared libraries (kreuzberg-ffi, pdfium, onnxruntime)
|
|
109
|
-
- `include/`: C header file (kreuzberg.h)
|
|
110
|
-
- `share/pkgconfig/`: pkg-config file for library discovery
|
|
111
|
-
- `README.md`: Installation instructions
|
|
112
|
-
|
|
113
|
-
Installation:
|
|
114
|
-
|
|
115
|
-
```bash
|
|
116
|
-
# Download and extract
|
|
117
|
-
tar -xzf go-ffi-linux-x86_64.tar.gz
|
|
118
|
-
cd kreuzberg-ffi
|
|
119
|
-
|
|
120
|
-
# System-wide installation (requires sudo)
|
|
121
|
-
sudo cp -r lib/* /usr/local/lib/
|
|
122
|
-
sudo cp -r include/* /usr/local/include/
|
|
123
|
-
sudo cp -r share/* /usr/local/share/
|
|
124
|
-
sudo ldconfig # Linux only
|
|
125
|
-
|
|
126
|
-
# Verify
|
|
127
|
-
pkg-config --modversion kreuzberg-ffi
|
|
128
|
-
```
|
|
129
|
-
|
|
130
|
-
For user-local installation or custom prefix, see the README.md included in the archive.
|
|
131
|
-
|
|
132
|
-
## Quick Start: C Example
|
|
133
|
-
|
|
134
|
-
### Basic Extraction
|
|
135
|
-
|
|
136
|
-
```c
|
|
137
|
-
#include <stdio.h>
|
|
138
|
-
#include <stdlib.h>
|
|
139
|
-
#include <string.h>
|
|
140
|
-
|
|
141
|
-
// Include the auto-generated Kreuzberg FFI header
|
|
142
|
-
#include "kreuzberg.h"
|
|
143
|
-
|
|
144
|
-
int main() {
|
|
145
|
-
// Extract text from a PDF file
|
|
146
|
-
const char* file_path = "document.pdf";
|
|
147
|
-
const char* mime_type = NULL; // Auto-detect
|
|
148
|
-
const char* config_json = "{}"; // Empty config uses defaults
|
|
149
|
-
|
|
150
|
-
// Perform extraction
|
|
151
|
-
ExtractionResult result = kreuzberg_extract_file(
|
|
152
|
-
file_path,
|
|
153
|
-
mime_type,
|
|
154
|
-
config_json
|
|
155
|
-
);
|
|
156
|
-
|
|
157
|
-
// Check for errors
|
|
158
|
-
if (result.error != NULL) {
|
|
159
|
-
fprintf(stderr, "Extraction failed: %s\n", result.error);
|
|
160
|
-
kreuzberg_free_string(result.error);
|
|
161
|
-
return 1;
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
// Process the result
|
|
165
|
-
printf("Extracted content:\n%s\n", result.content);
|
|
166
|
-
printf("MIME Type: %s\n", result.mime_type);
|
|
167
|
-
|
|
168
|
-
// Free resources
|
|
169
|
-
kreuzberg_free_extraction_result(result);
|
|
170
|
-
|
|
171
|
-
return 0;
|
|
172
|
-
}
|
|
173
|
-
```
|
|
174
|
-
|
|
175
|
-
### Compilation
|
|
176
|
-
|
|
177
|
-
```bash
|
|
178
|
-
# Link against the dynamic library
|
|
179
|
-
gcc -o extract_example extract.c \
|
|
180
|
-
-L/path/to/kreuzberg/target/release \
|
|
181
|
-
-lkreuzberg_ffi
|
|
182
|
-
```
|
|
183
|
-
|
|
184
|
-
## API Reference
|
|
185
|
-
|
|
186
|
-
### Core Extraction Functions
|
|
187
|
-
|
|
188
|
-
All functions return results via out-parameters or result structs. Error messages are accessible via `kreuzberg_get_last_error()`.
|
|
189
|
-
|
|
190
|
-
#### `ExtractionResult kreuzberg_extract_file(const char *path, const char *mime_type, const char *config_json)`
|
|
191
|
-
|
|
192
|
-
Extract text and metadata from a file.
|
|
193
|
-
|
|
194
|
-
**Parameters:**
|
|
195
|
-
- `path`: Absolute or relative file path (must exist)
|
|
196
|
-
- `mime_type`: Optional MIME type hint (e.g., "application/pdf"). Pass NULL for auto-detection.
|
|
197
|
-
- `config_json`: JSON string with extraction configuration
|
|
198
|
-
|
|
199
|
-
**Returns:**
|
|
200
|
-
- `ExtractionResult`: Contains `content` (extracted text), `mime_type`, `metadata` (JSON), and an optional `error`
|
|
201
|
-
|
|
202
|
-
**Example:**
|
|
203
|
-
|
|
204
|
-
```c
|
|
205
|
-
const char* config = "{\"use_cache\": true, \"enable_quality_processing\": true}";
|
|
206
|
-
ExtractionResult result = kreuzberg_extract_file("document.pdf", NULL, config);
|
|
207
|
-
|
|
208
|
-
if (result.error) {
|
|
209
|
-
printf("Error: %s\n", result.error);
|
|
210
|
-
} else {
|
|
211
|
-
printf("Content: %s\n", result.content);
|
|
212
|
-
printf("MIME: %s\n", result.mime_type);
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
kreuzberg_free_extraction_result(result);
|
|
216
|
-
```
|
|
217
|
-
|
|
218
|
-
#### `ExtractionResult kreuzberg_extract_bytes(const char *data, size_t data_len, const char *mime_type, const char *config_json)`
|
|
219
|
-
|
|
220
|
-
Extract from a byte buffer (e.g., file in memory).
|
|
221
|
-
|
|
222
|
-
**Parameters:**
|
|
223
|
-
- `data`: Pointer to byte buffer
|
|
224
|
-
- `data_len`: Buffer length in bytes
|
|
225
|
-
- `mime_type`: Optional MIME type hint (required if auto-detection is not desired)
|
|
226
|
-
- `config_json`: JSON extraction configuration
|
|
227
|
-
|
|
228
|
-
**Returns:**
|
|
229
|
-
- `ExtractionResult`: Same as `extract_file`
|
|
230
|
-
|
|
231
|
-
**Example:**
|
|
232
|
-
|
|
233
|
-
```c
|
|
234
|
-
// Read file into buffer
|
|
235
|
-
FILE* f = fopen("document.pdf", "rb");
|
|
236
|
-
fseek(f, 0, SEEK_END);
|
|
237
|
-
size_t size = ftell(f);
|
|
238
|
-
fseek(f, 0, SEEK_SET);
|
|
239
|
-
unsigned char* buffer = malloc(size);
|
|
240
|
-
fread(buffer, 1, size, f);
|
|
241
|
-
fclose(f);
|
|
242
|
-
|
|
243
|
-
// Extract from buffer
|
|
244
|
-
const char* config = "{}";
|
|
245
|
-
ExtractionResult result = kreuzberg_extract_bytes(
|
|
246
|
-
(const char*)buffer,
|
|
247
|
-
size,
|
|
248
|
-
"application/pdf",
|
|
249
|
-
config
|
|
250
|
-
);
|
|
251
|
-
|
|
252
|
-
if (!result.error) {
|
|
253
|
-
printf("Content: %s\n", result.content);
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
kreuzberg_free_extraction_result(result);
|
|
257
|
-
free(buffer);
|
|
258
|
-
```
|
|
259
|
-
|
|
260
|
-
#### `BatchExtractionResult kreuzberg_batch_extract_files(const char **paths, size_t paths_count, const char *config_json)`
|
|
261
|
-
|
|
262
|
-
Process multiple files in parallel.
|
|
263
|
-
|
|
264
|
-
**Parameters:**
|
|
265
|
-
- `paths`: Array of file path strings
|
|
266
|
-
- `paths_count`: Number of paths in array
|
|
267
|
-
- `config_json`: JSON extraction configuration (applied to all files)
|
|
268
|
-
|
|
269
|
-
**Returns:**
|
|
270
|
-
- `BatchExtractionResult`: Contains an array of `ExtractionResult` values and an error status
|
|
271
|
-
|
|
272
|
-
**Example:**
|
|
273
|
-
|
|
274
|
-
```c
|
|
275
|
-
const char* files[] = {"doc1.pdf", "doc2.docx", "doc3.xlsx"};
|
|
276
|
-
size_t count = 3;
|
|
277
|
-
|
|
278
|
-
BatchExtractionResult batch_result = kreuzberg_batch_extract_files(
|
|
279
|
-
files,
|
|
280
|
-
count,
|
|
281
|
-
"{}"
|
|
282
|
-
);
|
|
283
|
-
|
|
284
|
-
if (batch_result.error) {
|
|
285
|
-
printf("Batch error: %s\n", batch_result.error);
|
|
286
|
-
} else {
|
|
287
|
-
for (size_t i = 0; i < batch_result.count; i++) {
|
|
288
|
-
printf("File %zu: %s\n", i, batch_result.results[i].mime_type);
|
|
289
|
-
}
|
|
290
|
-
}
|
|
291
|
-
|
|
292
|
-
kreuzberg_free_batch_extraction_result(batch_result);
|
|
293
|
-
```
|
|
294
|
-
|
|
295
|
-
#### `char* kreuzberg_detect_mime_type(const char *path, bool use_cache)`
|
|
296
|
-
|
|
297
|
-
Identify the MIME type of a file using magic bytes and file signatures.
|
|
298
|
-
|
|
299
|
-
**Parameters:**
|
|
300
|
-
- `path`: File path to analyze
|
|
301
|
-
- `use_cache`: Whether to use cached results for the same path
|
|
302
|
-
|
|
303
|
-
**Returns:**
|
|
304
|
-
- `char*`: MIME type string (must be freed with `kreuzberg_free_string`)
|
|
305
|
-
|
|
306
|
-
**Example:**
|
|
307
|
-
|
|
308
|
-
```c
|
|
309
|
-
char* mime = kreuzberg_detect_mime_type("unknown-file", true);
|
|
310
|
-
if (mime) {
|
|
311
|
-
printf("MIME Type: %s\n", mime);
|
|
312
|
-
kreuzberg_free_string(mime);
|
|
313
|
-
}
|
|
314
|
-
```
|
|
315
|
-
|
|
316
|
-
### Configuration Management
|
|
317
|
-
|
|
318
|
-
#### `char* kreuzberg_get_default_config_json()`
|
|
319
|
-
|
|
320
|
-
Get default extraction configuration as JSON.
|
|
321
|
-
|
|
322
|
-
**Returns:**
|
|
323
|
-
- `char*`: JSON string (must be freed with `kreuzberg_free_string`)
|
|
324
|
-
|
|
325
|
-
**Example:**
|
|
326
|
-
|
|
327
|
-
```c
|
|
328
|
-
char* default_config = kreuzberg_get_default_config_json();
|
|
329
|
-
printf("Default config: %s\n", default_config);
|
|
330
|
-
kreuzberg_free_string(default_config);
|
|
331
|
-
```
|
|
332
|
-
|
|
333
|
-
#### Configuration JSON Schema
|
|
334
|
-
|
|
335
|
-
```json
|
|
336
|
-
{
|
|
337
|
-
"use_cache": true,
|
|
338
|
-
"enable_quality_processing": false,
|
|
339
|
-
"force_ocr": false,
|
|
340
|
-
"ocr": {
|
|
341
|
-
"backend": "tesseract",
|
|
342
|
-
"language": "eng",
|
|
343
|
-
"tesseract_config": {
|
|
344
|
-
"enable_table_detection": true,
|
|
345
|
-
"psm": 6,
|
|
346
|
-
"min_confidence": 50.0
|
|
347
|
-
}
|
|
348
|
-
},
|
|
349
|
-
"chunking": {
|
|
350
|
-
"max_chars": 1000,
|
|
351
|
-
"max_overlap": 200
|
|
352
|
-
},
|
|
353
|
-
"language_detection": {
|
|
354
|
-
"enabled": false,
|
|
355
|
-
"min_confidence": 0.8,
|
|
356
|
-
"detect_multiple": false
|
|
357
|
-
}
|
|
358
|
-
}
|
|
359
|
-
```
|
|
360
|
-
|
|
361
|
-
### Plugin System
|
|
362
|
-
|
|
363
|
-
#### `void kreuzberg_register_ocr_backend(const char *backend_name, OcrBackendCallback callback)`
|
|
364
|
-
|
|
365
|
-
Register a custom OCR backend implemented in the calling language.
|
|
366
|
-
|
|
367
|
-
**Parameters:**
|
|
368
|
-
- `backend_name`: Unique identifier for the backend (e.g., "custom-ocr")
|
|
369
|
-
- `callback`: Function pointer to OCR processing implementation
|
|
370
|
-
|
|
371
|
-
**Example (C):**
|
|
372
|
-
|
|
373
|
-
```c
|
|
374
|
-
// Define the callback function
|
|
375
|
-
OcrBackendResult custom_ocr_callback(
|
|
376
|
-
const char* image_data,
|
|
377
|
-
size_t image_size,
|
|
378
|
-
const char* language,
|
|
379
|
-
void* user_data
|
|
380
|
-
) {
|
|
381
|
-
// Call out to Python, Go, C#, etc. to perform OCR
|
|
382
|
-
// Implementation depends on the host language
|
|
383
|
-
|
|
384
|
-
OcrBackendResult result = {0};
|
|
385
|
-
result.content = "Extracted text from OCR";
|
|
386
|
-
result.metadata = "{}";
|
|
387
|
-
return result;
|
|
388
|
-
}
|
|
389
|
-
|
|
390
|
-
// Register the backend
|
|
391
|
-
kreuzberg_register_ocr_backend("easyocr", custom_ocr_callback);
|
|
392
|
-
|
|
393
|
-
// Now use it in extraction config:
|
|
394
|
-
const char* config = "{\"ocr\": {\"backend\": \"easyocr\", \"language\": \"eng\"}}";
|
|
395
|
-
ExtractionResult result = kreuzberg_extract_file("scanned.pdf", NULL, config);
|
|
396
|
-
```
|
|
397
|
-
|
|
398
|
-
### Error Handling
|
|
399
|
-
|
|
400
|
-
#### `char* kreuzberg_get_last_error()`
|
|
401
|
-
|
|
402
|
-
Retrieve the last error message.
|
|
403
|
-
|
|
404
|
-
**Returns:**
|
|
405
|
-
- `char*`: Error message string (NULL if no error)
|
|
406
|
-
|
|
407
|
-
Note: Error messages are thread-local and persist until the next Kreuzberg function call.
|
|
408
|
-
|
|
409
|
-
**Example:**
|
|
410
|
-
|
|
411
|
-
```c
|
|
412
|
-
ExtractionResult result = kreuzberg_extract_file("invalid.pdf", NULL, "{}");
|
|
413
|
-
|
|
414
|
-
if (result.error == NULL) {
|
|
415
|
-
// Also check global error state
|
|
416
|
-
char* error = kreuzberg_get_last_error();
|
|
417
|
-
if (error) {
|
|
418
|
-
printf("Error: %s\n", error);
|
|
419
|
-
}
|
|
420
|
-
}
|
|
421
|
-
```
|
|
422
|
-
|
|
423
|
-
### Memory Management
|
|
424
|
-
|
|
425
|
-
#### `void kreuzberg_free_string(char *ptr)`
|
|
426
|
-
|
|
427
|
-
Free a string allocated by FFI functions.
|
|
428
|
-
|
|
429
|
-
**Note:** Only use for strings returned by FFI functions, not for caller-allocated strings.
|
|
430
|
-
|
|
431
|
-
**Example:**
|
|
432
|
-
|
|
433
|
-
```c
|
|
434
|
-
char* mime = kreuzberg_detect_mime_type("file.pdf", true);
|
|
435
|
-
kreuzberg_free_string(mime);
|
|
436
|
-
|
|
437
|
-
char* error = kreuzberg_get_last_error();
|
|
438
|
-
if (error) {
|
|
439
|
-
kreuzberg_free_string(error);
|
|
440
|
-
}
|
|
441
|
-
```
|
|
442
|
-
|
|
443
|
-
#### `void kreuzberg_free_extraction_result(ExtractionResult result)`
|
|
444
|
-
|
|
445
|
-
Free memory associated with an extraction result.
|
|
446
|
-
|
|
447
|
-
**Example:**
|
|
448
|
-
|
|
449
|
-
```c
|
|
450
|
-
ExtractionResult result = kreuzberg_extract_file("doc.pdf", NULL, "{}");
|
|
451
|
-
// ... use result ...
|
|
452
|
-
kreuzberg_free_extraction_result(result);
|
|
453
|
-
```
|
|
454
|
-
|
|
455
|
-
#### `void kreuzberg_free_batch_extraction_result(BatchExtractionResult result)`
|
|
456
|
-
|
|
457
|
-
Free memory associated with a batch extraction result.
|
|
458
|
-
|
|
459
|
-
**Example:**
|
|
460
|
-
|
|
461
|
-
```c
|
|
462
|
-
BatchExtractionResult batch = kreuzberg_batch_extract_files(files, 3, "{}");
|
|
463
|
-
// ... use batch.results ...
|
|
464
|
-
kreuzberg_free_batch_extraction_result(batch);
|
|
465
|
-
```
|
|
466
|
-
|
|
467
|
-
## Type Definitions
|
|
468
|
-
|
|
469
|
-
### ExtractionResult
|
|
470
|
-
|
|
471
|
-
```c
|
|
472
|
-
typedef struct {
|
|
473
|
-
char* content; // Extracted text
|
|
474
|
-
char* mime_type; // MIME type of document
|
|
475
|
-
char* metadata; // JSON metadata string
|
|
476
|
-
char* error; // Error message (NULL if success)
|
|
477
|
-
// ... tables, images, chunks as needed
|
|
478
|
-
} ExtractionResult;
|
|
479
|
-
```
|
|
480
|
-
|
|
481
|
-
### BatchExtractionResult
|
|
482
|
-
|
|
483
|
-
```c
|
|
484
|
-
typedef struct {
|
|
485
|
-
ExtractionResult* results; // Array of extraction results
|
|
486
|
-
size_t count; // Number of results
|
|
487
|
-
char* error; // Batch-level error (NULL if success)
|
|
488
|
-
} BatchExtractionResult;
|
|
489
|
-
```
|
|
490
|
-
|
|
491
|
-
### OcrBackendCallback
|
|
492
|
-
|
|
493
|
-
```c
|
|
494
|
-
typedef OcrBackendResult (*OcrBackendCallback)(
|
|
495
|
-
const char* image_data,
|
|
496
|
-
size_t image_size,
|
|
497
|
-
const char* language,
|
|
498
|
-
void* user_data
|
|
499
|
-
);
|
|
500
|
-
```
|
|
501
|
-
|
|
502
|
-
## FFI Safety Guidelines
|
|
503
|
-
|
|
504
|
-
### Thread Safety
|
|
505
|
-
|
|
506
|
-
All FFI functions are **thread-safe**. The Kreuzberg core uses Arc, Mutex, and RwLock for safe concurrent access:
|
|
507
|
-
|
|
508
|
-
```c
|
|
509
|
-
// Safe to call from multiple threads
|
|
510
|
-
for (int i = 0; i < 10; i++) {
|
|
511
|
-
pthread_t thread;
|
|
512
|
-
pthread_create(&thread, NULL, extract_worker, (void*)(intptr_t)i);
|
|
513
|
-
}
|
|
514
|
-
```
|
|
515
|
-
|
|
516
|
-
### Pointer Validation
|
|
517
|
-
|
|
518
|
-
The FFI layer checks input pointers for NULL and reports invalid arguments through `result.error`:
|
|
519
|
-
|
|
520
|
-
```c
|
|
521
|
-
// Safe: NULL pointers are handled gracefully
|
|
522
|
-
ExtractionResult result = kreuzberg_extract_file(NULL, NULL, "{}");
|
|
523
|
-
// Returns error in result.error
|
|
524
|
-
|
|
525
|
-
// Safe: Invalid paths return errors
|
|
526
|
-
ExtractionResult result = kreuzberg_extract_file("/nonexistent", NULL, "{}");
|
|
527
|
-
// Returns error in result.error
|
|
528
|
-
```
|
|
529
|
-
|
|
530
|
-
### Memory Lifetime
|
|
531
|
-
|
|
532
|
-
- **Returned strings** must be freed with `kreuzberg_free_string()`
|
|
533
|
-
- **Result structs** must be freed with appropriate cleanup functions
|
|
534
|
-
- **Input parameters** are copied internally; caller retains ownership
|
|
535
|
-
|
|
536
|
-
```c
|
|
537
|
-
// Correct: Free returned values
|
|
538
|
-
char* mime = kreuzberg_detect_mime_type("file.pdf", true);
|
|
539
|
-
kreuzberg_free_string(mime);
|
|
540
|
-
|
|
541
|
-
// Correct: Input paths can be freed immediately after call
|
|
542
|
-
{
|
|
543
|
-
char path[256];
|
|
544
|
-
snprintf(path, sizeof(path), "document.pdf");
|
|
545
|
-
ExtractionResult result = kreuzberg_extract_file(path, NULL, "{}");
|
|
546
|
-
// path may be reused or go out of scope here; FFI has already copied it
|
|
547
|
-
}
|
|
548
|
-
|
|
549
|
-
// Correct: Free result structures
|
|
550
|
-
ExtractionResult result = kreuzberg_extract_file("doc.pdf", NULL, "{}");
|
|
551
|
-
// ... use result ...
|
|
552
|
-
kreuzberg_free_extraction_result(result);
|
|
553
|
-
```
|
|
554
|
-
|
|
555
|
-
### Error Handling Pattern
|
|
556
|
-
|
|
557
|
-
```c
|
|
558
|
-
ExtractionResult result = kreuzberg_extract_file(path, NULL, config);
|
|
559
|
-
|
|
560
|
-
// Check return value first
|
|
561
|
-
if (result.error != NULL) {
|
|
562
|
-
fprintf(stderr, "FFI Error: %s\n", result.error);
|
|
563
|
-
kreuzberg_free_extraction_result(result);
|
|
564
|
-
return;
|
|
565
|
-
}
|
|
566
|
-
|
|
567
|
-
// Process successful result
|
|
568
|
-
printf("Content: %s\n", result.content);
|
|
569
|
-
printf("MIME: %s\n", result.mime_type);
|
|
570
|
-
|
|
571
|
-
// Always cleanup
|
|
572
|
-
kreuzberg_free_extraction_result(result);
|
|
573
|
-
```
|
|
574
|
-
|
|
575
|
-
## Building from C
|
|
576
|
-
|
|
577
|
-
### Static Linking
|
|
578
|
-
|
|
579
|
-
```bash
|
|
580
|
-
# Build the library
|
|
581
|
-
cargo build --release -p kreuzberg-ffi
|
|
582
|
-
|
|
583
|
-
# Create your C program
|
|
584
|
-
gcc -c -o myapp.o myapp.c
|
|
585
|
-
|
|
586
|
-
# Link statically
|
|
587
|
-
gcc -o myapp myapp.o \
|
|
588
|
-
-L/path/to/kreuzberg/target/release \
|
|
589
|
-
-l:libkreuzberg_ffi.a \
|
|
590
|
-
-pthread -ldl # May need additional system libs
|
|
591
|
-
|
|
592
|
-
# Run
|
|
593
|
-
./myapp
|
|
594
|
-
```
|
|
595
|
-
|
|
596
|
-
### Dynamic Linking
|
|
597
|
-
|
|
598
|
-
```bash
|
|
599
|
-
# Build the library
|
|
600
|
-
cargo build --release -p kreuzberg-ffi
|
|
601
|
-
|
|
602
|
-
# Create your C program
|
|
603
|
-
gcc -c -o myapp.o myapp.c
|
|
604
|
-
|
|
605
|
-
# Link dynamically
|
|
606
|
-
gcc -o myapp myapp.o \
|
|
607
|
-
-L/path/to/kreuzberg/target/release \
|
|
608
|
-
-lkreuzberg_ffi
|
|
609
|
-
|
|
610
|
-
# Set library path and run
|
|
611
|
-
export LD_LIBRARY_PATH=/path/to/kreuzberg/target/release:$LD_LIBRARY_PATH
|
|
612
|
-
./myapp
|
|
613
|
-
```
|
|
614
|
-
|
|
615
|
-
### macOS Considerations
|
|
616
|
-
|
|
617
|
-
```bash
|
|
618
|
-
# Build
|
|
619
|
-
cargo build --release -p kreuzberg-ffi
|
|
620
|
-
|
|
621
|
-
# Link
|
|
622
|
-
gcc -o myapp myapp.o \
|
|
623
|
-
-L/path/to/kreuzberg/target/release \
|
|
624
|
-
-lkreuzberg_ffi
|
|
625
|
-
|
|
626
|
-
# Set runtime path
|
|
627
|
-
export DYLD_LIBRARY_PATH=/path/to/kreuzberg/target/release:$DYLD_LIBRARY_PATH
|
|
628
|
-
./myapp
|
|
629
|
-
```
|
|
630
|
-
|
|
631
|
-
## Language Binding Integration
|
|
632
|
-
|
|
633
|
-
### Java Integration (Panama FFI)
|
|
634
|
-
|
|
635
|
-
The FFI is wrapped in Java 25's Foreign Function & Memory API:
|
|
636
|
-
|
|
637
|
-
```java
|
|
638
|
-
// Java code that calls FFI
|
|
639
|
-
Arena arena = Arena.ofConfined();
|
|
640
|
-
MemorySegment path = arena.allocateUtf8String("document.pdf");
|
|
641
|
-
MemorySegment config = arena.allocateUtf8String("{}");
|
|
642
|
-
|
|
643
|
-
ExtractionResult result = KreuzbergFFI.extract_file(path, MemorySegment.NULL, config);
|
|
644
|
-
```
|
|
645
|
-
|
|
646
|
-
### Go Integration (cgo)
|
|
647
|
-
|
|
648
|
-
The FFI is exposed through cgo bindings:
|
|
649
|
-
|
|
650
|
-
```go
|
|
651
|
-
// Go code that calls FFI
|
|
652
|
-
C.kreuzberg_extract_file(C.CString("document.pdf"), nil, C.CString("{}"))
|
|
653
|
-
```
|
|
654
|
-
|
|
655
|
-
### C# Integration (P/Invoke)
|
|
656
|
-
|
|
657
|
-
The FFI is declared in C# through P/Invoke:
|
|
658
|
-
|
|
659
|
-
```csharp
|
|
660
|
-
[DllImport("kreuzberg_ffi", CharSet = CharSet.Ansi)]
|
|
661
|
-
private static extern IntPtr kreuzberg_extract_file(
|
|
662
|
-
string path,
|
|
663
|
-
string mimeType,
|
|
664
|
-
string configJson
|
|
665
|
-
);
|
|
666
|
-
```
|
|
667
|
-
|
|
668
|
-
## Supported Features
|
|
669
|
-
|
|
670
|
-
### Default Features
|
|
671
|
-
- `html`: HTML to Markdown conversion support
|
|
672
|
-
- `embeddings`: Text embedding extraction via fastembed-rs (requires ONNX Runtime - must be installed separately)
|
|
673
|
-
|
|
674
|
-
### System Requirements for Embeddings
|
|
675
|
-
|
|
676
|
-
If using the `embeddings` feature, ONNX Runtime must be installed on the system:
|
|
677
|
-
|
|
678
|
-
```bash
|
|
679
|
-
# macOS
|
|
680
|
-
brew install onnxruntime
|
|
681
|
-
|
|
682
|
-
# Ubuntu/Debian
|
|
683
|
-
sudo apt install libonnxruntime libonnxruntime-dev
|
|
684
|
-
|
|
685
|
-
# Windows (MSVC)
|
|
686
|
-
scoop install onnxruntime
|
|
687
|
-
# OR download from https://github.com/microsoft/onnxruntime/releases
|
|
688
|
-
```
|
|
689
|
-
|
|
690
|
-
Without ONNX Runtime, embeddings functionality will raise errors at runtime.
|
|
691
|
-
|
|
692
|
-
### Core Feature (Windows MinGW Compatibility)
|
|
693
|
-
- `core`: Minimal feature set for cross-platform compatibility
|
|
694
|
-
- Includes: `html` (HTML to Markdown conversion)
|
|
695
|
-
- Excludes: `embeddings` (ONNX Runtime not available on MinGW)
|
|
696
|
-
- Use case: Windows Go bindings with MinGW toolchain
|
|
697
|
-
|
|
698
|
-
### Platform-Specific Build Requirements
|
|
699
|
-
|
|
700
|
-
**Windows MinGW (Go bindings):**
|
|
701
|
-
|
|
702
|
-
The Windows ONNX Runtime library only provides MSVC-compatible .lib files. MinGW cannot link against these, requiring the core feature:
|
|
703
|
-
|
|
704
|
-
```bash
|
|
705
|
-
# Windows MinGW - Use core feature
|
|
706
|
-
cargo build --release -p kreuzberg-ffi --target x86_64-pc-windows-gnu --no-default-features --features core
|
|
707
|
-
|
|
708
|
-
# Windows MSVC - Full features available
|
|
709
|
-
cargo build --release -p kreuzberg-ffi --target x86_64-pc-windows-msvc
|
|
710
|
-
|
|
711
|
-
# Unix (Linux/macOS) - Full features available
|
|
712
|
-
cargo build --release -p kreuzberg-ffi
|
|
713
|
-
```
|
|
714
|
-
|
|
715
|
-
**Why MinGW Requires core Feature:**
|
|
716
|
-
- ONNX Runtime distributes Windows binaries compiled with MSVC toolchain
|
|
717
|
-
- MSVC .lib files use different name mangling and linking conventions than MinGW
|
|
718
|
-
- MinGW's GNU toolchain cannot consume MSVC import libraries
|
|
719
|
-
- The `core` feature excludes the `embeddings` dependency, which depends on ort-sys (ONNX Runtime)
|
|
720
|
-
- HTML support (via html-to-markdown-rs) is pure Rust and works on all platforms
|
|
721
|
-
|
|
722
|
-
### Building with Features
|
|
723
|
-
|
|
724
|
-
```bash
|
|
725
|
-
# Build with HTML and embeddings support (default)
|
|
726
|
-
cargo build --release -p kreuzberg-ffi
|
|
727
|
-
|
|
728
|
-
# Build with core feature only (Windows MinGW compatibility)
|
|
729
|
-
cargo build --release -p kreuzberg-ffi --no-default-features --features core
|
|
730
|
-
|
|
731
|
-
# Build without any features (minimal FFI)
|
|
732
|
-
cargo build --release -p kreuzberg-ffi --no-default-features
|
|
733
|
-
```
|
|
734
|
-
|
|
735
|
-
## Performance Characteristics
|
|
736
|
-
|
|
737
|
-
- **Single extraction**: 10-100ms (varies by file size and format)
|
|
738
|
-
- **Batch processing**: Near-linear scaling with CPU cores
|
|
739
|
-
- **OCR processing**: 100-500ms per page
|
|
740
|
-
- **Memory overhead**: ~2-5MB per extraction operation
|
|
741
|
-
- **Thread safety**: Zero synchronization overhead on single-threaded extraction
|
|
742
|
-
|
|
743
|
-
## Key Files
|
|
744
|
-
|
|
745
|
-
- `src/lib.rs`: FFI function implementations and plugin system
|
|
746
|
-
- `Cargo.toml`: Dependencies and features
|
|
747
|
-
- Generated header: Auto-created by cbindgen during build
|
|
748
|
-
|
|
749
|
-
## Building
|
|
750
|
-
|
|
751
|
-
### Development Build
|
|
752
|
-
|
|
753
|
-
```bash
|
|
754
|
-
cargo build -p kreuzberg-ffi
|
|
755
|
-
```
|
|
756
|
-
|
|
757
|
-
### Release Build
|
|
758
|
-
|
|
759
|
-
```bash
|
|
760
|
-
cargo build --release -p kreuzberg-ffi
|
|
761
|
-
```
|
|
762
|
-
|
|
763
|
-
### With All Features
|
|
764
|
-
|
|
765
|
-
```bash
|
|
766
|
-
cargo build --release -p kreuzberg-ffi --features html,embeddings
|
|
767
|
-
```
|
|
768
|
-
|
|
769
|
-
## Testing
|
|
770
|
-
|
|
771
|
-
```bash
|
|
772
|
-
# Run FFI tests
|
|
773
|
-
cargo test -p kreuzberg-ffi
|
|
774
|
-
|
|
775
|
-
# With logging
|
|
776
|
-
RUST_LOG=debug cargo test -p kreuzberg-ffi -- --nocapture
|
|
777
|
-
```
|
|
778
|
-
|
|
779
|
-
## Troubleshooting
|
|
780
|
-
|
|
781
|
-
### Library Not Found
|
|
782
|
-
|
|
783
|
-
Ensure the built library is in the linker search path:
|
|
784
|
-
|
|
785
|
-
```bash
|
|
786
|
-
# Check for built libraries
|
|
787
|
-
ls -la target/release/libkreuzberg_ffi*
|
|
788
|
-
|
|
789
|
-
# Add to library path
|
|
790
|
-
export LD_LIBRARY_PATH=target/release:$LD_LIBRARY_PATH
|
|
791
|
-
export DYLD_LIBRARY_PATH=target/release:$DYLD_LIBRARY_PATH # macOS
|
|
792
|
-
```
|
|
793
|
-
|
|
794
|
-
### Undefined Reference Errors
|
|
795
|
-
|
|
796
|
-
Ensure you're linking against the FFI library, not the core library:
|
|
797
|
-
|
|
798
|
-
```bash
|
|
799
|
-
# Correct
|
|
800
|
-
gcc -o app app.c -lkreuzberg_ffi
|
|
801
|
-
|
|
802
|
-
# Incorrect
|
|
803
|
-
gcc -o app app.c -lkreuzberg # Wrong library
|
|
804
|
-
```
|
|
805
|
-
|
|
806
|
-
### Memory Leaks
|
|
807
|
-
|
|
808
|
-
Always free returned strings and result structures:
|
|
809
|
-
|
|
810
|
-
```c
|
|
811
|
-
// Problem: Memory leak
|
|
812
|
-
char* mime = kreuzberg_detect_mime_type("file.pdf", true);
|
|
813
|
-
printf("%s\n", mime);
|
|
814
|
-
// mime not freed!
|
|
815
|
-
|
|
816
|
-
// Solution: Free the string
|
|
817
|
-
char* mime = kreuzberg_detect_mime_type("file.pdf", true);
|
|
818
|
-
printf("%s\n", mime);
|
|
819
|
-
kreuzberg_free_string(mime);
|
|
820
|
-
```
|
|
821
|
-
|
|
822
|
-
### Thread-Local Error Messages
|
|
823
|
-
|
|
824
|
-
Each thread has its own error message storage. Check both return values and `kreuzberg_get_last_error()`:
|
|
825
|
-
|
|
826
|
-
```c
|
|
827
|
-
// Safe across threads
|
|
828
|
-
#pragma omp parallel for
|
|
829
|
-
for (int i = 0; i < 10; i++) {
|
|
830
|
-
ExtractionResult result = kreuzberg_extract_file(files[i], NULL, "{}");
|
|
831
|
-
if (result.error) {
|
|
832
|
-
printf("Thread %d error: %s\n", i, result.error);
|
|
833
|
-
}
|
|
834
|
-
}
|
|
835
|
-
```
|
|
836
|
-
|
|
837
|
-
## References
|
|
838
|
-
|
|
839
|
-
- **Kreuzberg Core**: `../kreuzberg/`
|
|
840
|
-
- **C FFI Standards**: https://en.cppreference.com/w/c
|
|
841
|
-
- **cbindgen Documentation**: https://rust-lang.github.io/cbindgen/
|
|
842
|
-
- **Project Homepage**: https://kreuzberg.dev
|
|
843
|
-
- **GitHub Repository**: https://github.com/kreuzberg-dev/kreuzberg
|
|
844
|
-
|
|
845
|
-
## Contributing
|
|
846
|
-
|
|
847
|
-
We welcome contributions! Please see the main Kreuzberg repository for contribution guidelines.
|
|
848
|
-
|
|
849
|
-
## License
|
|
850
|
-
|
|
851
|
-
MIT
|