kreuzberg 4.0.0.rc2 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +396 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,773 @@
|
|
|
1
|
+
//! Zero-copy result view FFI module.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides direct read-only access to ExtractionResult fields without copying.
|
|
4
|
+
//! This eliminates memory allocation and JSON serialization overhead for common
|
|
5
|
+
//! field access patterns in language bindings.
|
|
6
|
+
//!
|
|
7
|
+
//! # Safety Model
|
|
8
|
+
//!
|
|
9
|
+
//! Views are **borrowed references** to ExtractionResult data. The caller MUST ensure:
|
|
10
|
+
//! 1. The source ExtractionResult outlives all views created from it
|
|
11
|
+
//! 2. Views are not used after the source result is freed
|
|
12
|
+
//! 3. Multi-threaded access requires external synchronization
|
|
13
|
+
//!
|
|
14
|
+
//! # Performance Benefits
|
|
15
|
+
//!
|
|
16
|
+
//! Zero-copy views eliminate:
|
|
17
|
+
//! - String allocation overhead (no `CString::new()` calls)
|
|
18
|
+
//! - JSON serialization for metadata/tables/chunks
|
|
19
|
+
//! - UTF-8 validation (already validated in Rust)
|
|
20
|
+
//! - Memory copying from Rust String → C string
|
|
21
|
+
//!
|
|
22
|
+
//! Expected performance improvement: 10-20% for large documents with many fields.
|
|
23
|
+
|
|
24
|
+
use crate::{clear_last_error, set_last_error};
|
|
25
|
+
use kreuzberg::types::ExtractionResult;
|
|
26
|
+
use std::ptr;
|
|
27
|
+
|
|
28
|
+
/// Borrowed, allocation-free snapshot of an `ExtractionResult` for C callers.
///
/// Each string is exposed as a raw `(pointer, byte length)` pair that refers
/// straight into the source result's UTF-8 data. Nothing is copied, and none
/// of the strings carries a trailing NUL, so consumers must always honour the
/// accompanying length instead of scanning for a terminator.
///
/// # Lifetime Safety
///
/// Every pointer in this structure is borrowed. The caller MUST guarantee that:
/// - the source `ExtractionResult` stays alive for as long as the view is read
/// - the view is never touched again once the source has been released with
///   `kreuzberg_result_free()`
///
/// # Memory Layout
///
/// `#[repr(C)]` keeps the declaration order: six pointer/length pairs
/// (96 bytes) followed by five counters (40 bytes), i.e. 136 bytes in total
/// on 64-bit systems. A pointer documented as optional is NULL when the
/// value is absent, and its length is then 0.
///
/// # Thread Safety
///
/// Views are NOT thread-safe; concurrent access needs external synchronization.
#[repr(C)]
pub struct CExtractionResultView {
    /// Start of the extracted content (UTF-8 bytes, no trailing NUL).
    pub content_ptr: *const u8,
    /// Size of the content in bytes.
    pub content_len: usize,

    /// Start of the MIME type string (UTF-8 bytes, no trailing NUL).
    pub mime_type_ptr: *const u8,
    /// Size of the MIME type string in bytes.
    pub mime_type_len: usize,

    /// Start of the language string (UTF-8 bytes, no trailing NUL); NULL when absent.
    pub language_ptr: *const u8,
    /// Size of the language string in bytes; 0 when the pointer is NULL.
    pub language_len: usize,

    /// Start of the date string (UTF-8 bytes, no trailing NUL); NULL when absent.
    pub date_ptr: *const u8,
    /// Size of the date string in bytes; 0 when the pointer is NULL.
    pub date_len: usize,

    /// Start of the subject string (UTF-8 bytes, no trailing NUL); NULL when absent.
    pub subject_ptr: *const u8,
    /// Size of the subject string in bytes; 0 when the pointer is NULL.
    pub subject_len: usize,

    /// Start of the title string (UTF-8 bytes, no trailing NUL); NULL when absent.
    pub title_ptr: *const u8,
    /// Size of the title string in bytes; 0 when the pointer is NULL.
    pub title_len: usize,

    /// How many tables were extracted.
    pub table_count: usize,

    /// How many chunks were produced (0 when chunking is not enabled).
    pub chunk_count: usize,

    /// How many languages were detected (0 when language detection is not enabled).
    pub detected_language_count: usize,

    /// How many images were extracted (0 when there are none).
    pub image_count: usize,

    /// Total number of pages (0 when not applicable).
    pub page_count: usize,
}
|
|
94
|
+
|
|
95
|
+
/// Get a zero-copy view of an extraction result.
|
|
96
|
+
///
|
|
97
|
+
/// Creates a view structure with direct pointers to result data without allocation.
|
|
98
|
+
/// The view is valid only while the source `result` remains valid.
|
|
99
|
+
///
|
|
100
|
+
/// # Arguments
|
|
101
|
+
///
|
|
102
|
+
/// * `result` - Pointer to an ExtractionResult structure
|
|
103
|
+
/// * `out_view` - Pointer to a CExtractionResultView structure to populate
|
|
104
|
+
///
|
|
105
|
+
/// # Returns
|
|
106
|
+
///
|
|
107
|
+
/// 0 on success, -1 on error (check `kreuzberg_last_error`).
|
|
108
|
+
///
|
|
109
|
+
/// # Safety
|
|
110
|
+
///
|
|
111
|
+
/// - `result` must be a valid pointer to an ExtractionResult
|
|
112
|
+
/// - `out_view` must be a valid pointer to writable memory
|
|
113
|
+
/// - Neither parameter can be NULL
|
|
114
|
+
/// - The returned view is valid ONLY while `result` is not freed
|
|
115
|
+
/// - Caller MUST NOT use the view after calling `kreuzberg_result_free(result)`
|
|
116
|
+
///
|
|
117
|
+
/// # Lifetime Safety
|
|
118
|
+
///
|
|
119
|
+
/// ```text
|
|
120
|
+
/// ExtractionResult lifetime: |-------------------------------------|
|
|
121
|
+
/// View lifetime: |----------------------|
|
|
122
|
+
/// SAFE FREE → INVALID
|
|
123
|
+
/// ```
|
|
124
|
+
///
|
|
125
|
+
/// # Example (C)
|
|
126
|
+
///
|
|
127
|
+
/// ```c
|
|
128
|
+
/// ExtractionResult* result = kreuzberg_extract_file("document.pdf", NULL);
|
|
129
|
+
/// if (result != NULL) {
|
|
130
|
+
/// CExtractionResultView view;
|
|
131
|
+
/// if (kreuzberg_get_result_view(result, &view) == 0) {
|
|
132
|
+
/// // Direct access to content without copying
|
|
133
|
+
/// printf("Content length: %zu bytes\n", view.content_len);
|
|
134
|
+
/// printf("MIME type: %.*s\n", (int)view.mime_type_len, view.mime_type_ptr);
|
|
135
|
+
/// printf("Tables: %zu, Chunks: %zu\n", view.table_count, view.chunk_count);
|
|
136
|
+
///
|
|
137
|
+
/// // No need to free the view (no allocations)
|
|
138
|
+
/// }
|
|
139
|
+
///
|
|
140
|
+
/// kreuzberg_result_free(result); // After this, view is INVALID
|
|
141
|
+
/// }
|
|
142
|
+
/// ```
|
|
143
|
+
#[unsafe(no_mangle)]
|
|
144
|
+
pub unsafe extern "C" fn kreuzberg_get_result_view(
|
|
145
|
+
result: *const ExtractionResult,
|
|
146
|
+
out_view: *mut CExtractionResultView,
|
|
147
|
+
) -> i32 {
|
|
148
|
+
if result.is_null() {
|
|
149
|
+
set_last_error("Result cannot be NULL".to_string());
|
|
150
|
+
return -1;
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
if out_view.is_null() {
|
|
154
|
+
set_last_error("Output view cannot be NULL".to_string());
|
|
155
|
+
return -1;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
clear_last_error();
|
|
159
|
+
|
|
160
|
+
let result_ref = unsafe { &*result };
|
|
161
|
+
|
|
162
|
+
unsafe {
|
|
163
|
+
let content_bytes = result_ref.content.as_bytes();
|
|
164
|
+
(*out_view).content_ptr = content_bytes.as_ptr();
|
|
165
|
+
(*out_view).content_len = content_bytes.len();
|
|
166
|
+
|
|
167
|
+
let mime_bytes = result_ref.mime_type.as_bytes();
|
|
168
|
+
(*out_view).mime_type_ptr = mime_bytes.as_ptr();
|
|
169
|
+
(*out_view).mime_type_len = mime_bytes.len();
|
|
170
|
+
|
|
171
|
+
if let Some(ref language) = result_ref.metadata.language {
|
|
172
|
+
let lang_bytes = language.as_bytes();
|
|
173
|
+
(*out_view).language_ptr = lang_bytes.as_ptr();
|
|
174
|
+
(*out_view).language_len = lang_bytes.len();
|
|
175
|
+
} else {
|
|
176
|
+
(*out_view).language_ptr = ptr::null();
|
|
177
|
+
(*out_view).language_len = 0;
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
if let Some(ref created_at) = result_ref.metadata.created_at {
|
|
181
|
+
let created_at_bytes = created_at.as_bytes();
|
|
182
|
+
(*out_view).date_ptr = created_at_bytes.as_ptr();
|
|
183
|
+
(*out_view).date_len = created_at_bytes.len();
|
|
184
|
+
} else {
|
|
185
|
+
(*out_view).date_ptr = ptr::null();
|
|
186
|
+
(*out_view).date_len = 0;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
if let Some(ref subject) = result_ref.metadata.subject {
|
|
190
|
+
let subject_bytes = subject.as_bytes();
|
|
191
|
+
(*out_view).subject_ptr = subject_bytes.as_ptr();
|
|
192
|
+
(*out_view).subject_len = subject_bytes.len();
|
|
193
|
+
} else {
|
|
194
|
+
(*out_view).subject_ptr = ptr::null();
|
|
195
|
+
(*out_view).subject_len = 0;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
if let Some(ref title) = result_ref.metadata.title {
|
|
199
|
+
let title_bytes = title.as_bytes();
|
|
200
|
+
(*out_view).title_ptr = title_bytes.as_ptr();
|
|
201
|
+
(*out_view).title_len = title_bytes.len();
|
|
202
|
+
} else {
|
|
203
|
+
(*out_view).title_ptr = ptr::null();
|
|
204
|
+
(*out_view).title_len = 0;
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
(*out_view).table_count = result_ref.tables.len();
|
|
208
|
+
(*out_view).chunk_count = result_ref.chunks.as_ref().map_or(0, |c| c.len());
|
|
209
|
+
(*out_view).detected_language_count = result_ref.detected_languages.as_ref().map_or(0, |l| l.len());
|
|
210
|
+
(*out_view).image_count = result_ref.images.as_ref().map_or(0, |i| i.len());
|
|
211
|
+
(*out_view).page_count = result_ref.metadata.pages.as_ref().map_or(0, |p| p.total_count);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
0
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
/// Internal helper: create a zero-copy view by value (for internal use).
|
|
218
|
+
///
|
|
219
|
+
/// This is a convenience function for internal FFI modules that need to
|
|
220
|
+
/// create views without having to allocate and populate an output structure.
|
|
221
|
+
///
|
|
222
|
+
/// # Safety
|
|
223
|
+
///
|
|
224
|
+
/// - `result` must be a valid reference to ExtractionResult
|
|
225
|
+
/// - Returned view is only valid while `result` is alive
|
|
226
|
+
pub(crate) fn create_result_view(result: &ExtractionResult) -> CExtractionResultView {
|
|
227
|
+
let mut view = CExtractionResultView {
|
|
228
|
+
content_ptr: ptr::null(),
|
|
229
|
+
content_len: 0,
|
|
230
|
+
mime_type_ptr: ptr::null(),
|
|
231
|
+
mime_type_len: 0,
|
|
232
|
+
language_ptr: ptr::null(),
|
|
233
|
+
language_len: 0,
|
|
234
|
+
date_ptr: ptr::null(),
|
|
235
|
+
date_len: 0,
|
|
236
|
+
subject_ptr: ptr::null(),
|
|
237
|
+
subject_len: 0,
|
|
238
|
+
title_ptr: ptr::null(),
|
|
239
|
+
title_len: 0,
|
|
240
|
+
table_count: 0,
|
|
241
|
+
chunk_count: 0,
|
|
242
|
+
detected_language_count: 0,
|
|
243
|
+
image_count: 0,
|
|
244
|
+
page_count: 0,
|
|
245
|
+
};
|
|
246
|
+
|
|
247
|
+
let content_bytes = result.content.as_bytes();
|
|
248
|
+
view.content_ptr = content_bytes.as_ptr();
|
|
249
|
+
view.content_len = content_bytes.len();
|
|
250
|
+
|
|
251
|
+
let mime_bytes = result.mime_type.as_bytes();
|
|
252
|
+
view.mime_type_ptr = mime_bytes.as_ptr();
|
|
253
|
+
view.mime_type_len = mime_bytes.len();
|
|
254
|
+
|
|
255
|
+
if let Some(ref language) = result.metadata.language {
|
|
256
|
+
let lang_bytes = language.as_bytes();
|
|
257
|
+
view.language_ptr = lang_bytes.as_ptr();
|
|
258
|
+
view.language_len = lang_bytes.len();
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
if let Some(ref created_at) = result.metadata.created_at {
|
|
262
|
+
let created_at_bytes = created_at.as_bytes();
|
|
263
|
+
view.date_ptr = created_at_bytes.as_ptr();
|
|
264
|
+
view.date_len = created_at_bytes.len();
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
if let Some(ref subject) = result.metadata.subject {
|
|
268
|
+
let subject_bytes = subject.as_bytes();
|
|
269
|
+
view.subject_ptr = subject_bytes.as_ptr();
|
|
270
|
+
view.subject_len = subject_bytes.len();
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
if let Some(ref title) = result.metadata.title {
|
|
274
|
+
let title_bytes = title.as_bytes();
|
|
275
|
+
view.title_ptr = title_bytes.as_ptr();
|
|
276
|
+
view.title_len = title_bytes.len();
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
view.table_count = result.tables.len();
|
|
280
|
+
view.chunk_count = result.chunks.as_ref().map_or(0, |c| c.len());
|
|
281
|
+
view.detected_language_count = result.detected_languages.as_ref().map_or(0, |l| l.len());
|
|
282
|
+
view.image_count = result.images.as_ref().map_or(0, |i| i.len());
|
|
283
|
+
view.page_count = result.metadata.pages.as_ref().map_or(0, |p| p.total_count);
|
|
284
|
+
|
|
285
|
+
view
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
/// Get direct access to content from a result view.
|
|
289
|
+
///
|
|
290
|
+
/// Helper function to retrieve content as a slice without copying.
|
|
291
|
+
///
|
|
292
|
+
/// # Arguments
|
|
293
|
+
///
|
|
294
|
+
/// * `view` - Pointer to a CExtractionResultView structure
|
|
295
|
+
/// * `out_ptr` - Pointer to receive the content pointer
|
|
296
|
+
/// * `out_len` - Pointer to receive the content length
|
|
297
|
+
///
|
|
298
|
+
/// # Returns
|
|
299
|
+
///
|
|
300
|
+
/// 0 on success, -1 on error (check `kreuzberg_last_error`).
|
|
301
|
+
///
|
|
302
|
+
/// # Safety
|
|
303
|
+
///
|
|
304
|
+
/// - `view` must be a valid pointer to a CExtractionResultView
|
|
305
|
+
/// - `out_ptr` and `out_len` must be valid writable pointers
|
|
306
|
+
/// - The returned content pointer is valid only while the source ExtractionResult is valid
|
|
307
|
+
///
|
|
308
|
+
/// # Example (C)
|
|
309
|
+
///
|
|
310
|
+
/// ```c
|
|
311
|
+
/// const uint8_t* content;
|
|
312
|
+
/// size_t content_len;
|
|
313
|
+
/// if (kreuzberg_view_get_content(&view, &content, &content_len) == 0) {
|
|
314
|
+
/// // Process content directly without copying
|
|
315
|
+
/// fwrite(content, 1, content_len, stdout);
|
|
316
|
+
/// }
|
|
317
|
+
/// ```
|
|
318
|
+
#[unsafe(no_mangle)]
|
|
319
|
+
pub unsafe extern "C" fn kreuzberg_view_get_content(
|
|
320
|
+
view: *const CExtractionResultView,
|
|
321
|
+
out_ptr: *mut *const u8,
|
|
322
|
+
out_len: *mut usize,
|
|
323
|
+
) -> i32 {
|
|
324
|
+
if view.is_null() {
|
|
325
|
+
set_last_error("View cannot be NULL".to_string());
|
|
326
|
+
return -1;
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
if out_ptr.is_null() || out_len.is_null() {
|
|
330
|
+
set_last_error("Output pointers cannot be NULL".to_string());
|
|
331
|
+
return -1;
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
clear_last_error();
|
|
335
|
+
|
|
336
|
+
unsafe {
|
|
337
|
+
*out_ptr = (*view).content_ptr;
|
|
338
|
+
*out_len = (*view).content_len;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
0
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
/// Get direct access to MIME type from a result view.
|
|
345
|
+
///
|
|
346
|
+
/// # Arguments
|
|
347
|
+
///
|
|
348
|
+
/// * `view` - Pointer to a CExtractionResultView structure
|
|
349
|
+
/// * `out_ptr` - Pointer to receive the MIME type pointer
|
|
350
|
+
/// * `out_len` - Pointer to receive the MIME type length
|
|
351
|
+
///
|
|
352
|
+
/// # Returns
|
|
353
|
+
///
|
|
354
|
+
/// 0 on success, -1 on error (check `kreuzberg_last_error`).
|
|
355
|
+
///
|
|
356
|
+
/// # Safety
|
|
357
|
+
///
|
|
358
|
+
/// - `view` must be a valid pointer to a CExtractionResultView
|
|
359
|
+
/// - `out_ptr` and `out_len` must be valid writable pointers
|
|
360
|
+
/// - The returned MIME type pointer is valid only while the source ExtractionResult is valid
|
|
361
|
+
///
|
|
362
|
+
/// # Example (C)
|
|
363
|
+
///
|
|
364
|
+
/// ```c
|
|
365
|
+
/// const uint8_t* mime_type;
|
|
366
|
+
/// size_t mime_len;
|
|
367
|
+
/// if (kreuzberg_view_get_mime_type(&view, &mime_type, &mime_len) == 0) {
|
|
368
|
+
/// printf("MIME: %.*s\n", (int)mime_len, mime_type);
|
|
369
|
+
/// }
|
|
370
|
+
/// ```
|
|
371
|
+
#[unsafe(no_mangle)]
|
|
372
|
+
pub unsafe extern "C" fn kreuzberg_view_get_mime_type(
|
|
373
|
+
view: *const CExtractionResultView,
|
|
374
|
+
out_ptr: *mut *const u8,
|
|
375
|
+
out_len: *mut usize,
|
|
376
|
+
) -> i32 {
|
|
377
|
+
if view.is_null() {
|
|
378
|
+
set_last_error("View cannot be NULL".to_string());
|
|
379
|
+
return -1;
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
if out_ptr.is_null() || out_len.is_null() {
|
|
383
|
+
set_last_error("Output pointers cannot be NULL".to_string());
|
|
384
|
+
return -1;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
clear_last_error();
|
|
388
|
+
|
|
389
|
+
unsafe {
|
|
390
|
+
*out_ptr = (*view).mime_type_ptr;
|
|
391
|
+
*out_len = (*view).mime_type_len;
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
0
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use kreuzberg::types::{Metadata, PageStructure, PageUnitType};
    use std::mem;

    /// Fixture: a result with every viewed metadata string set, ten pages,
    /// two chunks, two detected languages, no tables and no images.
    fn create_test_result() -> ExtractionResult {
        let mut metadata = Metadata {
            title: Some("Test Document".to_string()),
            language: Some("en".to_string()),
            created_at: Some("2025-01-01".to_string()),
            subject: Some("Test Subject".to_string()),
            ..Default::default()
        };

        let page_structure = PageStructure {
            total_count: 10,
            unit_type: PageUnitType::Page,
            boundaries: None,
            pages: None,
        };
        metadata.pages = Some(page_structure);

        ExtractionResult {
            content: "Sample content for zero-copy testing".to_string(),
            mime_type: "text/plain".to_string(),
            metadata,
            tables: vec![],
            detected_languages: Some(vec!["en".to_string(), "de".to_string()]),
            chunks: Some(vec![
                kreuzberg::types::Chunk {
                    content: "Chunk 1".to_string(),
                    embedding: None,
                    metadata: kreuzberg::types::ChunkMetadata {
                        byte_start: 0,
                        byte_end: 7,
                        token_count: None,
                        chunk_index: 0,
                        total_chunks: 2,
                        first_page: None,
                        last_page: None,
                    },
                },
                kreuzberg::types::Chunk {
                    content: "Chunk 2".to_string(),
                    embedding: None,
                    metadata: kreuzberg::types::ChunkMetadata {
                        byte_start: 8,
                        byte_end: 15,
                        token_count: None,
                        chunk_index: 1,
                        total_chunks: 2,
                        first_page: None,
                        last_page: None,
                    },
                },
            ]),
            images: None,
            pages: None,
        }
    }

    /// A view with every pointer NULL and every length/count zero.
    fn blank_view() -> CExtractionResultView {
        // SAFETY: the view holds only raw pointers and integer fields, for
        // which the all-zero bit pattern is a valid value.
        unsafe { mem::zeroed() }
    }

    /// Populate a view for `result` through the exported entry point.
    ///
    /// Asserts that the call succeeds so no test silently runs its checks
    /// against a view that was never filled in.
    fn view_of(result: &ExtractionResult) -> CExtractionResultView {
        let mut view = blank_view();
        let result_ptr: *const ExtractionResult = result;

        // SAFETY: both pointers come from live references.
        let ret = unsafe { kreuzberg_get_result_view(result_ptr, &mut view) };
        assert_eq!(ret, 0, "Should return success");

        view
    }

    /// The struct layout is 6 (pointer, length) pairs followed by 5 counters.
    #[test]
    fn test_result_view_structure_size() {
        // Every field is one machine word wide, so derive the expected size
        // from the word size instead of hard-coding the 64-bit figure (136).
        // NOTE(review): assumes all five count fields, including `page_count`,
        // are `usize` - confirm against the struct definition.
        let word = mem::size_of::<usize>();
        let expected = (6 * 2 + 5) * word;

        assert_eq!(
            mem::size_of::<CExtractionResultView>(),
            expected,
            "View structure size should be {expected} bytes (6 ptr+len pairs + 5 counts)"
        );
    }

    /// A fully populated result yields matching pointers, lengths and counts.
    #[test]
    fn test_get_result_view_success() {
        let result = create_test_result();
        let view = view_of(&result);

        assert!(!view.content_ptr.is_null());
        assert_eq!(view.content_len, result.content.len());

        assert!(!view.mime_type_ptr.is_null());
        assert_eq!(view.mime_type_len, result.mime_type.len());

        assert!(!view.language_ptr.is_null());
        assert_eq!(view.language_len, 2);

        assert!(!view.title_ptr.is_null());
        assert_eq!(view.title_len, "Test Document".len());

        assert_eq!(view.chunk_count, 2);
        assert_eq!(view.detected_language_count, 2);
        assert_eq!(view.page_count, 10);
        assert_eq!(view.table_count, 0);
        assert_eq!(view.image_count, 0);

        // SAFETY: the view points into `result`, which is still alive.
        let content_slice = unsafe { std::slice::from_raw_parts(view.content_ptr, view.content_len) };
        assert_eq!(content_slice, result.content.as_bytes());

        // SAFETY: as above.
        let mime_slice = unsafe { std::slice::from_raw_parts(view.mime_type_ptr, view.mime_type_len) };
        assert_eq!(mime_slice, result.mime_type.as_bytes());
    }

    /// A NULL result pointer is rejected.
    #[test]
    fn test_get_result_view_null_result() {
        let mut view = blank_view();
        let ret = unsafe { kreuzberg_get_result_view(ptr::null(), &mut view) };
        assert_eq!(ret, -1, "Should return error for NULL result");
    }

    /// A NULL output pointer is rejected.
    #[test]
    fn test_get_result_view_null_output() {
        let result = create_test_result();
        let result_ptr = &result as *const ExtractionResult;
        let ret = unsafe { kreuzberg_get_result_view(result_ptr, ptr::null_mut()) };
        assert_eq!(ret, -1, "Should return error for NULL output");
    }

    /// The content getter hands back the same bytes the result owns.
    #[test]
    fn test_view_get_content() {
        let result = create_test_result();
        let view = view_of(&result);

        let mut content_ptr: *const u8 = ptr::null();
        let mut content_len: usize = 0;

        let ret = unsafe { kreuzberg_view_get_content(&view, &mut content_ptr, &mut content_len) };

        assert_eq!(ret, 0, "Should return success");
        assert!(!content_ptr.is_null());
        assert_eq!(content_len, result.content.len());

        // SAFETY: the pointer/length pair refers to `result.content`, still alive.
        let content_slice = unsafe { std::slice::from_raw_parts(content_ptr, content_len) };
        assert_eq!(content_slice, result.content.as_bytes());
    }

    /// The MIME getter hands back the same bytes the result owns.
    #[test]
    fn test_view_get_mime_type() {
        let result = create_test_result();
        let view = view_of(&result);

        let mut mime_ptr: *const u8 = ptr::null();
        let mut mime_len: usize = 0;

        let ret = unsafe { kreuzberg_view_get_mime_type(&view, &mut mime_ptr, &mut mime_len) };

        assert_eq!(ret, 0, "Should return success");
        assert!(!mime_ptr.is_null());
        assert_eq!(mime_len, result.mime_type.len());

        // SAFETY: the pointer/length pair refers to `result.mime_type`, still alive.
        let mime_slice = unsafe { std::slice::from_raw_parts(mime_ptr, mime_len) };
        assert_eq!(mime_slice, result.mime_type.as_bytes());
    }

    /// Absent optional metadata is reported as `(NULL, 0)`.
    #[test]
    fn test_view_optional_fields_null() {
        let mut result = create_test_result();
        result.metadata.language = None;
        result.metadata.title = None;

        let view = view_of(&result);

        assert!(view.language_ptr.is_null());
        assert_eq!(view.language_len, 0);

        assert!(view.title_ptr.is_null());
        assert_eq!(view.title_len, 0);
    }

    /// Using a view inside a scope shorter than the result's leaves the result intact.
    #[test]
    fn test_view_lifetime_safety_pattern() {
        let result = create_test_result();
        let expected_content = result.content.clone();

        {
            let view = view_of(&result);

            // SAFETY: `result` outlives this inner scope.
            let content_slice = unsafe { std::slice::from_raw_parts(view.content_ptr, view.content_len) };
            assert_eq!(content_slice, expected_content.as_bytes());
        }

        assert_eq!(result.content, expected_content);
    }

    /// The content pointer lies inside the result's own string buffer.
    #[test]
    fn test_zero_copy_no_allocation() {
        let result = create_test_result();
        let view = view_of(&result);

        let content_start = result.content.as_ptr() as usize;
        let content_end = content_start + result.content.len();
        let view_ptr = view.content_ptr as usize;

        assert!(
            view_ptr >= content_start && view_ptr < content_end,
            "View pointer should point into result's memory"
        );
    }

    /// The content getter rejects a NULL view.
    #[test]
    fn test_view_get_content_null_view() {
        let mut content_ptr: *const u8 = ptr::null();
        let mut content_len: usize = 0;

        let ret = unsafe { kreuzberg_view_get_content(ptr::null(), &mut content_ptr, &mut content_len) };

        assert_eq!(ret, -1, "Should return error for NULL view");
    }

    /// The content getter rejects either output pointer being NULL.
    #[test]
    fn test_view_get_content_null_outputs() {
        let result = create_test_result();
        let view = view_of(&result);

        let mut content_len: usize = 0;
        let ret = unsafe { kreuzberg_view_get_content(&view, ptr::null_mut(), &mut content_len) };
        assert_eq!(ret, -1, "Should return error for NULL out_ptr");

        let mut content_ptr: *const u8 = ptr::null();
        let ret = unsafe { kreuzberg_view_get_content(&view, &mut content_ptr, ptr::null_mut()) };
        assert_eq!(ret, -1, "Should return error for NULL out_len");
    }

    /// The MIME getter rejects a NULL view.
    #[test]
    fn test_view_get_mime_type_null_view() {
        let mut mime_ptr: *const u8 = ptr::null();
        let mut mime_len: usize = 0;

        let ret = unsafe { kreuzberg_view_get_mime_type(ptr::null(), &mut mime_ptr, &mut mime_len) };

        assert_eq!(ret, -1, "Should return error for NULL view");
    }

    /// The MIME getter rejects either output pointer being NULL.
    #[test]
    fn test_view_get_mime_type_null_outputs() {
        let result = create_test_result();
        let view = view_of(&result);

        let mut mime_len: usize = 0;
        let ret = unsafe { kreuzberg_view_get_mime_type(&view, ptr::null_mut(), &mut mime_len) };
        assert_eq!(ret, -1, "Should return error for NULL out_ptr");

        let mut mime_ptr: *const u8 = ptr::null();
        let ret = unsafe { kreuzberg_view_get_mime_type(&view, &mut mime_ptr, ptr::null_mut()) };
        assert_eq!(ret, -1, "Should return error for NULL out_len");
    }

    /// Empty content still produces a non-NULL pointer with length zero.
    #[test]
    fn test_view_empty_content() {
        let mut result = create_test_result();
        result.content = String::new();

        let view = view_of(&result);

        assert!(
            !view.content_ptr.is_null(),
            "Empty string should still have valid pointer"
        );
        assert_eq!(view.content_len, 0);
    }

    /// A 10 MiB content string is exposed in full.
    #[test]
    fn test_view_large_content() {
        let mut result = create_test_result();
        result.content = "x".repeat(10 * 1024 * 1024);
        let expected_len = result.content.len();

        let view = view_of(&result);

        assert_eq!(view.content_len, expected_len);
        assert!(!view.content_ptr.is_null());

        // SAFETY: the view points into `result.content`, still alive.
        let content_slice = unsafe { std::slice::from_raw_parts(view.content_ptr, view.content_len) };
        assert_eq!(content_slice, result.content.as_bytes());
    }

    /// Lengths are byte lengths, so multi-byte UTF-8 text round-trips intact.
    #[test]
    fn test_view_unicode_content() {
        let mut result = create_test_result();
        result.content = "Hello 世界 🌍 Привет مرحبا".to_string();
        result.metadata.title = Some("Título español 中文标题".to_string());

        let view = view_of(&result);

        // SAFETY: the view points into `result`, still alive.
        let content_slice = unsafe { std::slice::from_raw_parts(view.content_ptr, view.content_len) };
        assert_eq!(content_slice, result.content.as_bytes());

        // SAFETY: as above; the title was set to `Some` just before.
        let title_slice = unsafe { std::slice::from_raw_parts(view.title_ptr, view.title_len) };
        let title_str = std::str::from_utf8(title_slice).unwrap();
        assert_eq!(title_str, "Título español 中文标题");
    }

    /// With no optional collections or page info, every count is zero.
    #[test]
    fn test_view_all_counts_zero() {
        let result = ExtractionResult {
            content: "Minimal content".to_string(),
            mime_type: "text/plain".to_string(),
            metadata: Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
        };

        let view = view_of(&result);

        assert_eq!(view.table_count, 0);
        assert_eq!(view.chunk_count, 0);
        assert_eq!(view.detected_language_count, 0);
        assert_eq!(view.image_count, 0);
        assert_eq!(view.page_count, 0);
    }

    /// Two views of the same result agree on pointers, lengths and counts.
    #[test]
    fn test_view_multiple_views_same_result() {
        let result = create_test_result();

        let view1 = view_of(&result);
        let view2 = view_of(&result);

        assert_eq!(view1.content_ptr, view2.content_ptr);
        assert_eq!(view1.content_len, view2.content_len);
        assert_eq!(view1.mime_type_ptr, view2.mime_type_ptr);
        assert_eq!(view1.mime_type_len, view2.mime_type_len);
        assert_eq!(view1.table_count, view2.table_count);
        assert_eq!(view1.chunk_count, view2.chunk_count);
    }

    /// Distinct fields map to distinct buffers with their own lengths.
    #[test]
    fn test_view_field_isolation() {
        let result = create_test_result();
        let view = view_of(&result);

        assert_ne!(view.content_ptr, view.mime_type_ptr);

        if !view.language_ptr.is_null() && !view.title_ptr.is_null() {
            assert_ne!(view.language_ptr, view.title_ptr);
        }

        assert_eq!(view.mime_type_len, "text/plain".len());
        assert_eq!(view.language_len, "en".len());
    }
}
|