kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
//! Type definitions for C FFI compatibility.
|
|
2
|
+
//!
|
|
3
|
+
//! This module contains all C-compatible struct definitions used across the FFI boundary.
|
|
4
|
+
//! These types must maintain strict memory layout guarantees to ensure compatibility with
|
|
5
|
+
//! other languages (Java via Panama FFI, Go via cgo, C# via P/Invoke, etc.).
|
|
6
|
+
|
|
7
|
+
use std::ffi::CString;
|
|
8
|
+
use std::os::raw::c_char;
|
|
9
|
+
use std::ptr;
|
|
10
|
+
|
|
11
|
+
/// Owning guard around a raw C string pointer.
///
/// The guard is built from a [`CString`] and stores the pointer returned by
/// [`CString::into_raw`]. Unless that pointer is handed off through
/// [`CStringGuard::into_raw`], dropping the guard rebuilds the `CString` and
/// releases its memory, so early returns cannot leak the string.
///
/// # Memory Safety
///
/// - `ptr` is either null or a pointer obtained from `CString::into_raw`
/// - On drop, a non-null `ptr` is turned back into a `CString` and freed
/// - After `into_raw()` the guard holds null, so its drop is a no-op
pub struct CStringGuard {
    /// The guarded string; null once ownership has been transferred away.
    ptr: *mut c_char,
}

impl CStringGuard {
    /// Create a new guard from a `CString`, taking over its raw pointer.
    pub fn new(s: CString) -> Self {
        Self { ptr: s.into_raw() }
    }

    /// Transfer ownership of the raw pointer to the caller, preventing cleanup.
    ///
    /// The guard no longer frees the returned pointer. The caller must
    /// eventually reclaim it (for example with `CString::from_raw`);
    /// discarding the return value leaks the string.
    #[must_use = "discarding the returned pointer leaks the C string"]
    pub fn into_raw(mut self) -> *mut c_char {
        // `self` is still dropped when this function returns; leaving null
        // behind makes that drop skip the free.
        std::mem::replace(&mut self.ptr, ptr::null_mut())
    }
}

impl Drop for CStringGuard {
    /// Free the guarded string unless ownership was transferred via `into_raw`.
    fn drop(&mut self) {
        if !self.ptr.is_null() {
            // SAFETY: `ptr` is private, is only ever set from
            // `CString::into_raw` in `new`, and is nulled by `into_raw`.
            // A non-null value is therefore still owned by this guard and
            // has not been freed or handed out.
            unsafe { drop(CString::from_raw(self.ptr)) };
        }
    }
}
|
|
46
|
+
|
|
47
|
+
/// Extraction result in a C-compatible layout.
///
/// The ABI and memory layout of this struct are part of the FFI contract and
/// must remain stable.
///
/// # Memory Layout
///
/// This definition has to stay in sync with the Java side's MemoryLayout
/// declared in KreuzbergFFI.java.
/// In declaration order: 12 pointers (8 bytes each), then 1 bool, then
/// 7 bytes of padding, for 104 bytes total.
///
/// Because of `#[repr(C)]` the struct follows C's layout rules:
/// - fields are placed in declaration order
/// - padding is inserted where alignment requires it
/// - size and alignment are the same on all platforms (for 64-bit)
///
/// # Memory Management
///
/// Every pointer field is owned by the caller and must be released with
/// `kreuzberg_free_string`. The struct itself must be released with
/// `kreuzberg_free_extraction_result`.
#[repr(C)]
pub struct CExtractionResult {
    /// Extracted text content.
    /// Null-terminated UTF-8 string; free with kreuzberg_free_string.
    pub content: *mut c_char,
    /// Detected MIME type.
    /// Null-terminated string; free with kreuzberg_free_string.
    pub mime_type: *mut c_char,
    /// Document language.
    /// Null-terminated string, or NULL if not available; free with kreuzberg_free_string.
    pub language: *mut c_char,
    /// Document date.
    /// Null-terminated string, or NULL if not available; free with kreuzberg_free_string.
    pub date: *mut c_char,
    /// Document subject.
    /// Null-terminated string, or NULL if not available; free with kreuzberg_free_string.
    pub subject: *mut c_char,
    /// Tables serialized as a JSON array.
    /// Null-terminated string, or NULL if no tables; free with kreuzberg_free_string.
    pub tables_json: *mut c_char,
    /// Detected languages serialized as a JSON array.
    /// Null-terminated string, or NULL if not available; free with kreuzberg_free_string.
    pub detected_languages_json: *mut c_char,
    /// Metadata serialized as a JSON object.
    /// Null-terminated string, or NULL if no metadata; free with kreuzberg_free_string.
    pub metadata_json: *mut c_char,
    /// Text chunks serialized as a JSON array.
    /// Null-terminated string, or NULL if not available; free with kreuzberg_free_string.
    pub chunks_json: *mut c_char,
    /// Extracted images serialized as a JSON array.
    /// Null-terminated string, or NULL if not available; free with kreuzberg_free_string.
    pub images_json: *mut c_char,
    /// Page structure serialized as a JSON object.
    /// Null-terminated string, or NULL if not available; free with kreuzberg_free_string.
    pub page_structure_json: *mut c_char,
    /// Per-page content serialized as a JSON array.
    /// Null-terminated string, or NULL if not available; free with kreuzberg_free_string.
    pub pages_json: *mut c_char,
    /// Whether extraction was successful.
    pub success: bool,
    /// Explicit 7-byte tail padding matching the Java MemoryLayout, rounding
    /// the struct up to an 8-byte boundary.
    pub _padding1: [u8; 7],
}
|
|
96
|
+
|
|
97
|
+
/// Byte buffer paired with its MIME type, in a C-compatible layout, used as
/// an input item for batch operations.
///
/// # Memory Layout
///
/// This definition has to stay in sync with the Java side's MemoryLayout
/// declared in KreuzbergFFI.java.
/// In declaration order: 1 pointer (8 bytes), 1 usize (8 bytes), 1 pointer
/// (8 bytes), for 24 bytes total.
///
/// `#[repr(C)]` keeps the memory layout consistent across languages.
///
/// # Usage
///
/// Document data is handed to the batch extraction functions through this
/// struct. Ownership of both the `data` and `mime_type` pointers stays with
/// the caller.
#[repr(C)]
pub struct CBytesWithMime {
    /// Pointer to the byte data.
    pub data: *const u8,
    /// Length of the byte data.
    pub data_len: usize,
    /// MIME type as a null-terminated C string.
    pub mime_type: *const c_char,
}
|
|
119
|
+
|
|
120
|
+
/// C-compatible structure for batch extraction results
|
|
121
|
+
///
|
|
122
|
+
/// # Memory Layout
|
|
123
|
+
///
|
|
124
|
+
/// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
125
|
+
/// Field order: 1 pointer (8 bytes) + 1 usize (8 bytes) + 1 bool + 7 bytes padding = 24 bytes total
|
|
126
|
+
///
|
|
127
|
+
/// The padding ensures the struct is properly aligned for 64-bit architectures.
|
|
128
|
+
///
|
|
129
|
+
/// # Memory Management
|
|
130
|
+
///
|
|
131
|
+
/// - The `results` array must be freed using `kreuzberg_free_batch_result`
|
|
132
|
+
/// - Each individual result in the array must also be freed
|
|
133
|
+
#[repr(C)]
|
|
134
|
+
pub struct CBatchResult {
|
|
135
|
+
/// Array of extraction results
|
|
136
|
+
pub results: *mut *mut CExtractionResult,
|
|
137
|
+
/// Number of results
|
|
138
|
+
pub count: usize,
|
|
139
|
+
/// Whether batch operation was successful
|
|
140
|
+
pub success: bool,
|
|
141
|
+
/// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
|
|
142
|
+
pub _padding2: [u8; 7],
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
/// Compile-time layout assertions to ensure ABI stability.
|
|
146
|
+
///
|
|
147
|
+
/// These assertions verify that the struct layouts match the expected sizes and alignments.
|
|
148
|
+
/// If these fail at compile time, it indicates a breaking change in the memory layout.
|
|
149
|
+
#[allow(non_upper_case_globals)]
|
|
150
|
+
const _: () = {
|
|
151
|
+
const fn assert_c_extraction_result_size() {
|
|
152
|
+
const SIZE: usize = std::mem::size_of::<CExtractionResult>();
|
|
153
|
+
const _: () = assert!(SIZE == 104, "CExtractionResult size must be 104 bytes");
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
const fn assert_c_extraction_result_alignment() {
|
|
157
|
+
const ALIGN: usize = std::mem::align_of::<CExtractionResult>();
|
|
158
|
+
const _: () = assert!(ALIGN == 8, "CExtractionResult alignment must be 8 bytes");
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
const fn assert_c_batch_result_size() {
|
|
162
|
+
const SIZE: usize = std::mem::size_of::<CBatchResult>();
|
|
163
|
+
const _: () = assert!(SIZE == 24, "CBatchResult size must be 24 bytes");
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
const fn assert_c_batch_result_alignment() {
|
|
167
|
+
const ALIGN: usize = std::mem::align_of::<CBatchResult>();
|
|
168
|
+
const _: () = assert!(ALIGN == 8, "CBatchResult alignment must be 8 bytes");
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
const fn assert_c_bytes_with_mime_size() {
|
|
172
|
+
const SIZE: usize = std::mem::size_of::<CBytesWithMime>();
|
|
173
|
+
const _: () = assert!(SIZE == 24, "CBytesWithMime size must be 24 bytes");
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
const fn assert_c_bytes_with_mime_alignment() {
|
|
177
|
+
const ALIGN: usize = std::mem::align_of::<CBytesWithMime>();
|
|
178
|
+
const _: () = assert!(ALIGN == 8, "CBytesWithMime alignment must be 8 bytes");
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
let _ = assert_c_extraction_result_size;
|
|
182
|
+
let _ = assert_c_extraction_result_alignment;
|
|
183
|
+
let _ = assert_c_batch_result_size;
|
|
184
|
+
let _ = assert_c_batch_result_alignment;
|
|
185
|
+
let _ = assert_c_bytes_with_mime_size;
|
|
186
|
+
let _ = assert_c_bytes_with_mime_alignment;
|
|
187
|
+
};
|
|
188
|
+
|
|
189
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem;

    /// `CExtractionResult` occupies exactly 104 bytes.
    #[test]
    fn test_c_extraction_result_size() {
        let actual = mem::size_of::<CExtractionResult>();
        assert_eq!(actual, 104, "CExtractionResult must be exactly 104 bytes");
    }

    /// `CExtractionResult` is aligned to 8 bytes.
    #[test]
    fn test_c_extraction_result_alignment() {
        let actual = mem::align_of::<CExtractionResult>();
        assert_eq!(actual, 8, "CExtractionResult must be 8-byte aligned");
    }

    /// `CBatchResult` occupies exactly 24 bytes.
    #[test]
    fn test_c_batch_result_size() {
        let actual = mem::size_of::<CBatchResult>();
        assert_eq!(actual, 24, "CBatchResult must be exactly 24 bytes");
    }

    /// `CBatchResult` is aligned to 8 bytes.
    #[test]
    fn test_c_batch_result_alignment() {
        let actual = mem::align_of::<CBatchResult>();
        assert_eq!(actual, 8, "CBatchResult must be 8-byte aligned");
    }

    /// `CBytesWithMime` occupies exactly 24 bytes.
    #[test]
    fn test_c_bytes_with_mime_size() {
        let actual = mem::size_of::<CBytesWithMime>();
        assert_eq!(actual, 24, "CBytesWithMime must be exactly 24 bytes");
    }

    /// `CBytesWithMime` is aligned to 8 bytes.
    #[test]
    fn test_c_bytes_with_mime_alignment() {
        let actual = mem::align_of::<CBytesWithMime>();
        assert_eq!(actual, 8, "CBytesWithMime must be 8-byte aligned");
    }

    /// Dropping a `CStringGuard` that still holds its string must complete cleanly.
    #[test]
    fn test_c_string_guard_drop() {
        let owned = CString::new("test string").unwrap();
        // The test passes as long as dropping the guard does not crash.
        drop(CStringGuard::new(owned));
    }

    /// `into_raw` hands the pointer over to the caller.
    #[test]
    fn test_c_string_guard_into_raw() {
        let owned = CString::new("test string").unwrap();
        let raw = CStringGuard::new(owned).into_raw();

        assert!(!raw.is_null(), "into_raw should return a non-null pointer");

        // Ownership was transferred to us, so the string has to be reclaimed by hand.
        unsafe {
            drop(CString::from_raw(raw));
        }
    }

    /// An empty string still yields a valid, non-null pointer.
    #[test]
    fn test_c_string_guard_empty() {
        let owned = CString::new("").unwrap();
        let raw = CStringGuard::new(owned).into_raw();

        assert!(!raw.is_null(), "Empty string should still have a valid pointer");

        unsafe {
            let round_tripped = CString::from_raw(raw);
            assert_eq!(round_tripped.to_str().unwrap(), "");
        }
    }

    /// A guard whose pointer has been nulled out must not free anything on drop.
    #[test]
    fn test_c_string_guard_no_double_free() {
        let owned = CString::new("test").unwrap();
        let mut guard = CStringGuard::new(owned);

        // Mimic what into_raw does: remember the pointer, then null the guard's copy.
        let raw = guard.ptr;
        guard.ptr = std::ptr::null_mut();

        // With a null pointer inside, this drop is expected to be a no-op.
        drop(guard);

        // The allocation is still live; release it exactly once here.
        unsafe {
            drop(CString::from_raw(raw));
        }
    }

    /// Field offsets of `CExtractionResult`: twelve 8-byte pointers, then the flag.
    #[test]
    fn test_c_extraction_result_field_offsets() {
        assert_eq!(mem::offset_of!(CExtractionResult, content), 0);
        assert_eq!(mem::offset_of!(CExtractionResult, mime_type), 8);
        assert_eq!(mem::offset_of!(CExtractionResult, language), 16);
        assert_eq!(mem::offset_of!(CExtractionResult, date), 24);
        assert_eq!(mem::offset_of!(CExtractionResult, subject), 32);
        assert_eq!(mem::offset_of!(CExtractionResult, tables_json), 40);
        assert_eq!(mem::offset_of!(CExtractionResult, detected_languages_json), 48);
        assert_eq!(mem::offset_of!(CExtractionResult, metadata_json), 56);
        assert_eq!(mem::offset_of!(CExtractionResult, chunks_json), 64);
        assert_eq!(mem::offset_of!(CExtractionResult, images_json), 72);
        assert_eq!(mem::offset_of!(CExtractionResult, page_structure_json), 80);
        assert_eq!(mem::offset_of!(CExtractionResult, pages_json), 88);
        assert_eq!(mem::offset_of!(CExtractionResult, success), 96);
    }

    /// Field offsets of `CBatchResult`.
    #[test]
    fn test_c_batch_result_field_offsets() {
        assert_eq!(mem::offset_of!(CBatchResult, results), 0);
        assert_eq!(mem::offset_of!(CBatchResult, count), 8);
        assert_eq!(mem::offset_of!(CBatchResult, success), 16);
    }

    /// Field offsets of `CBytesWithMime`.
    #[test]
    fn test_c_bytes_with_mime_field_offsets() {
        assert_eq!(mem::offset_of!(CBytesWithMime, data), 0);
        assert_eq!(mem::offset_of!(CBytesWithMime, data_len), 8);
        assert_eq!(mem::offset_of!(CBytesWithMime, mime_type), 16);
    }

    /// All three FFI structs can be created from zeroed memory without panicking.
    #[test]
    fn test_structs_can_be_zeroed() {
        unsafe {
            let _zeroed_result = mem::zeroed::<CExtractionResult>();
            let _zeroed_batch = mem::zeroed::<CBatchResult>();
            let _zeroed_bytes = mem::zeroed::<CBytesWithMime>();
        }
    }
}
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
//! Utility functions for version and error reporting.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides FFI functions for:
|
|
4
|
+
//! - Getting the library version
|
|
5
|
+
//! - Retrieving error information (message, code, panic context)
|
|
6
|
+
|
|
7
|
+
use crate::ffi_panic_guard;
|
|
8
|
+
use crate::helpers::LAST_ERROR_C_STRING;
|
|
9
|
+
use crate::panic_shield::{get_last_error_code, get_last_panic_context};
|
|
10
|
+
use std::ffi::CString;
|
|
11
|
+
use std::os::raw::c_char;
|
|
12
|
+
use std::ptr;
|
|
13
|
+
|
|
14
|
+
/// Get the last error message from a failed operation.
|
|
15
|
+
///
|
|
16
|
+
/// # Safety
|
|
17
|
+
///
|
|
18
|
+
/// - Returns a static string that does not need to be freed
|
|
19
|
+
/// - Returns NULL if no error has occurred
|
|
20
|
+
/// - The returned string is valid until the next Kreuzberg function call on the same thread
|
|
21
|
+
///
|
|
22
|
+
/// # Example (C)
|
|
23
|
+
///
|
|
24
|
+
/// ```c
|
|
25
|
+
/// CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
26
|
+
/// if (result == NULL) {
|
|
27
|
+
/// const char* error = kreuzberg_last_error();
|
|
28
|
+
/// printf("Error: %s\n", error);
|
|
29
|
+
/// }
|
|
30
|
+
/// ```
|
|
31
|
+
#[unsafe(no_mangle)]
|
|
32
|
+
pub unsafe extern "C" fn kreuzberg_last_error() -> *const c_char {
|
|
33
|
+
LAST_ERROR_C_STRING.with(|last| match &*last.borrow() {
|
|
34
|
+
Some(c_str) => c_str.as_ptr(),
|
|
35
|
+
None => ptr::null(),
|
|
36
|
+
})
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/// Get the error code for the last error.
|
|
40
|
+
///
|
|
41
|
+
/// Returns the error code as an i32. Error codes are defined in ErrorCode enum:
|
|
42
|
+
/// - 0: Success (no error)
|
|
43
|
+
/// - 1: GenericError
|
|
44
|
+
/// - 2: Panic
|
|
45
|
+
/// - 3: InvalidArgument
|
|
46
|
+
/// - 4: IoError
|
|
47
|
+
/// - 5: ParsingError
|
|
48
|
+
/// - 6: OcrError
|
|
49
|
+
/// - 7: MissingDependency
|
|
50
|
+
///
|
|
51
|
+
/// # Safety
|
|
52
|
+
///
|
|
53
|
+
/// This function is thread-safe and always safe to call.
|
|
54
|
+
///
|
|
55
|
+
/// # Example (C)
|
|
56
|
+
///
|
|
57
|
+
/// ```c
|
|
58
|
+
/// CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
59
|
+
/// if (result == NULL) {
|
|
60
|
+
/// int32_t code = kreuzberg_last_error_code();
|
|
61
|
+
/// if (code == 2) {
|
|
62
|
+
/// // A panic occurred
|
|
63
|
+
/// }
|
|
64
|
+
/// }
|
|
65
|
+
/// ```
|
|
66
|
+
#[unsafe(no_mangle)]
|
|
67
|
+
pub unsafe extern "C" fn kreuzberg_last_error_code() -> i32 {
|
|
68
|
+
get_last_error_code() as i32
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
/// Get the panic context for the last error (if it was a panic).
|
|
72
|
+
///
|
|
73
|
+
/// Returns a JSON object with panic details including:
|
|
74
|
+
/// - file: Source file where panic occurred
|
|
75
|
+
/// - line: Line number in source file
|
|
76
|
+
/// - function: Name of the function that panicked
|
|
77
|
+
/// - message: Panic message
|
|
78
|
+
/// - timestamp_secs: Unix timestamp when panic occurred
|
|
79
|
+
///
|
|
80
|
+
/// # Safety
|
|
81
|
+
///
|
|
82
|
+
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
83
|
+
/// - Returns NULL if the last error was not a panic or no error has occurred
|
|
84
|
+
///
|
|
85
|
+
/// # Example (C)
|
|
86
|
+
///
|
|
87
|
+
/// ```c
|
|
88
|
+
/// CExtractionResult* result = kreuzberg_extract_file_sync(path);
|
|
89
|
+
/// if (result == NULL && kreuzberg_last_error_code() == 2) {
|
|
90
|
+
/// char* context = kreuzberg_last_panic_context();
|
|
91
|
+
/// if (context != NULL) {
|
|
92
|
+
/// printf("Panic context: %s\n", context);
|
|
93
|
+
/// kreuzberg_free_string(context);
|
|
94
|
+
/// }
|
|
95
|
+
/// }
|
|
96
|
+
/// ```
|
|
97
|
+
#[unsafe(no_mangle)]
|
|
98
|
+
pub unsafe extern "C" fn kreuzberg_last_panic_context() -> *mut c_char {
|
|
99
|
+
ffi_panic_guard!("kreuzberg_last_panic_context", {
|
|
100
|
+
match get_last_panic_context() {
|
|
101
|
+
Some(ctx) => {
|
|
102
|
+
use std::time::UNIX_EPOCH;
|
|
103
|
+
|
|
104
|
+
let timestamp_secs = ctx
|
|
105
|
+
.timestamp
|
|
106
|
+
.duration_since(UNIX_EPOCH)
|
|
107
|
+
.map(|d| d.as_secs())
|
|
108
|
+
.unwrap_or(0);
|
|
109
|
+
|
|
110
|
+
let json_value = serde_json::json!({
|
|
111
|
+
"file": ctx.file,
|
|
112
|
+
"line": ctx.line,
|
|
113
|
+
"function": ctx.function,
|
|
114
|
+
"message": ctx.message,
|
|
115
|
+
"timestamp_secs": timestamp_secs
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
match serde_json::to_string(&json_value) {
|
|
119
|
+
Ok(json) => match CString::new(json) {
|
|
120
|
+
Ok(c_str) => c_str.into_raw(),
|
|
121
|
+
Err(_) => ptr::null_mut(),
|
|
122
|
+
},
|
|
123
|
+
Err(_) => ptr::null_mut(),
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
None => ptr::null_mut(),
|
|
127
|
+
}
|
|
128
|
+
})
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/// Get the library version string.
|
|
132
|
+
///
|
|
133
|
+
/// # Safety
|
|
134
|
+
///
|
|
135
|
+
/// - Returns a static string that does not need to be freed
|
|
136
|
+
/// - The returned string is always valid
|
|
137
|
+
///
|
|
138
|
+
/// # Example (C)
|
|
139
|
+
///
|
|
140
|
+
/// ```c
|
|
141
|
+
/// const char* version = kreuzberg_version();
|
|
142
|
+
/// printf("Kreuzberg version: %s\n", version);
|
|
143
|
+
/// ```
|
|
144
|
+
#[unsafe(no_mangle)]
|
|
145
|
+
pub unsafe extern "C" fn kreuzberg_version() -> *const c_char {
|
|
146
|
+
concat!(env!("CARGO_PKG_VERSION"), "\0").as_ptr() as *const c_char
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::helpers::{clear_last_error, set_last_error};
    use std::ffi::CStr;

    /// The version pointer is non-null and holds a dotted version string.
    #[test]
    fn test_version_not_null() {
        let raw_version = unsafe { kreuzberg_version() };
        assert!(!raw_version.is_null());

        let text = unsafe { CStr::from_ptr(raw_version) }.to_str().unwrap();
        assert!(!text.is_empty());
        // A version is expected to look like "0.1.0", i.e. contain dots.
        assert!(text.contains('.'));
    }

    /// With the error state cleared, no message pointer is returned.
    #[test]
    fn test_last_error_null_when_no_error() {
        clear_last_error();
        let message_ptr = unsafe { kreuzberg_last_error() };
        assert!(message_ptr.is_null());
    }

    /// A message stored via `set_last_error` is returned unchanged.
    #[test]
    fn test_last_error_returns_message() {
        set_last_error("Test error message".to_string());
        let message_ptr = unsafe { kreuzberg_last_error() };
        assert!(!message_ptr.is_null());

        let message = unsafe { CStr::from_ptr(message_ptr) }.to_str().unwrap();
        assert_eq!(message, "Test error message");

        clear_last_error();
    }

    /// After clearing, the error code reads as 0 (success).
    #[test]
    fn test_last_error_code_success_by_default() {
        clear_last_error();
        let code = unsafe { kreuzberg_last_error_code() };
        assert_eq!(code, 0); // Success
    }

    /// Fetching the panic context must not crash; a returned string is released again.
    #[test]
    fn test_last_panic_context_null_when_no_panic() {
        let context_ptr = unsafe { kreuzberg_last_panic_context() };
        // NULL is the expected result when no panic happened; anything else is freed.
        if !context_ptr.is_null() {
            unsafe {
                crate::kreuzberg_free_string(context_ptr);
            }
        }
    }

    /// The reported error code stays inside the expected numeric range.
    #[test]
    fn test_error_code_values() {
        let code = unsafe { kreuzberg_last_error_code() };
        assert!(code >= 0);
        assert!(code < 10); // Should be within reasonable bounds
    }
}
|