kreuzberg 4.0.0.rc2 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +396 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,568 @@
|
|
|
1
|
+
//! String interning FFI module.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides global string interning for frequently-used strings like MIME types,
|
|
4
|
+
//! language codes, and metadata field names. Reduces memory usage by deduplicating
|
|
5
|
+
//! common strings across multiple extraction results.
|
|
6
|
+
//!
|
|
7
|
+
//! # Benefits
|
|
8
|
+
//!
|
|
9
|
+
//! - 5-10% memory savings for typical workloads
|
|
10
|
+
//! - Reduced allocation overhead for repeated strings
|
|
11
|
+
//! - Faster string comparisons (pointer equality for interned strings)
|
|
12
|
+
//! - Thread-safe: table access is serialized by a global mutex; reading through a returned pointer takes no lock
|
|
13
|
+
//!
|
|
14
|
+
//! # Pre-populated Strings
|
|
15
|
+
//!
|
|
16
|
+
//! Common strings are pre-populated for immediate efficiency:
|
|
17
|
+
//! - MIME types: text/plain, application/pdf, image/png, etc.
|
|
18
|
+
//! - Languages: en, es, fr, de, zh, ja, etc.
|
|
19
|
+
//! - Encodings: UTF-8, ISO-8859-1, ASCII, Windows-1252
|
|
20
|
+
//!
|
|
21
|
+
//! # Usage Pattern
|
|
22
|
+
//!
|
|
23
|
+
//! 1. Intern string: `ptr = kreuzberg_intern_string("application/pdf")`
|
|
24
|
+
//! 2. Use interned pointer (lifetime = until all references freed)
|
|
25
|
+
//! 3. Free when done: `kreuzberg_free_interned_string(ptr)`
|
|
26
|
+
//! 4. Check stats: `kreuzberg_string_intern_stats()`
|
|
27
|
+
//!
|
|
28
|
+
//! # Example (C)
|
|
29
|
+
//!
|
|
30
|
+
//! ```c
|
|
31
|
+
//! const char* mime1 = kreuzberg_intern_string("application/pdf");
|
|
32
|
+
//! const char* mime2 = kreuzberg_intern_string("application/pdf");
|
|
33
|
+
//!
|
|
34
|
+
//! // Same pointer for same string (memory shared)
|
|
35
|
+
//! assert(mime1 == mime2);
|
|
36
|
+
//!
|
|
37
|
+
//! kreuzberg_free_interned_string(mime1);
|
|
38
|
+
//! kreuzberg_free_interned_string(mime2);
|
|
39
|
+
//! ```
|
|
40
|
+
|
|
41
|
+
use crate::{clear_last_error, set_last_error};
|
|
42
|
+
use std::collections::HashMap;
|
|
43
|
+
use std::ffi::{CStr, CString};
|
|
44
|
+
use std::os::raw::c_char;
|
|
45
|
+
use std::ptr;
|
|
46
|
+
use std::sync::Mutex;
|
|
47
|
+
|
|
48
|
+
/// Statistics for string interning efficiency tracking.
///
/// The struct is `#[repr(C)]`: the field order below is part of the C ABI and
/// must not be rearranged. All fields are plain counters, so the type is
/// cheap to copy and compare.
#[repr(C)]
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CStringInternStats {
    /// Number of unique strings currently interned
    pub unique_count: usize,

    /// Total number of intern requests
    pub total_requests: usize,

    /// Number of cache hits (string already interned)
    pub cache_hits: usize,

    /// Number of cache misses, reported as `total_requests - cache_hits`
    pub cache_misses: usize,

    /// Estimated memory saved by deduplication (bytes): for every entry,
    /// each request after the first counts as one avoided copy of the string
    /// including its NUL terminator
    pub estimated_memory_saved: usize,

    /// Total memory used by interned strings (bytes, NUL terminators included)
    pub total_memory_bytes: usize,
}
|
|
69
|
+
|
|
70
|
+
/// Interned string entry with reference counting.
///
/// One entry exists per distinct string held by the intern table. The entry
/// owns the NUL-terminated buffer whose address is handed out to callers.
#[derive(Debug)]
struct InternedString {
    /// Owned C string; its `as_ptr()` is the pointer returned to callers
    c_string: CString,

    /// Reference count: starts at 1 when the entry is created, goes up on
    /// every further intern of the same string and down on every free
    ref_count: usize,

    /// Number of intern requests seen for this string (never decremented);
    /// used for the memory savings calculation
    request_count: usize,
}
|
|
81
|
+
|
|
82
|
+
/// Global string interning table.
|
|
83
|
+
struct StringInternTable {
|
|
84
|
+
/// Map from string content to interned entry
|
|
85
|
+
strings: HashMap<String, InternedString>,
|
|
86
|
+
|
|
87
|
+
/// Total number of intern requests
|
|
88
|
+
total_requests: usize,
|
|
89
|
+
|
|
90
|
+
/// Number of cache hits
|
|
91
|
+
cache_hits: usize,
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
impl StringInternTable {
|
|
95
|
+
/// Create new intern table with pre-populated common strings.
|
|
96
|
+
fn new() -> Self {
|
|
97
|
+
let mut table = Self {
|
|
98
|
+
strings: HashMap::new(),
|
|
99
|
+
total_requests: 0,
|
|
100
|
+
cache_hits: 0,
|
|
101
|
+
};
|
|
102
|
+
|
|
103
|
+
let common_mimes = [
|
|
104
|
+
"text/plain",
|
|
105
|
+
"application/pdf",
|
|
106
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
107
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
108
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
109
|
+
"application/msword",
|
|
110
|
+
"application/vnd.ms-excel",
|
|
111
|
+
"application/vnd.ms-powerpoint",
|
|
112
|
+
"image/png",
|
|
113
|
+
"image/jpeg",
|
|
114
|
+
"image/gif",
|
|
115
|
+
"image/tiff",
|
|
116
|
+
"text/html",
|
|
117
|
+
"text/xml",
|
|
118
|
+
"application/json",
|
|
119
|
+
"application/zip",
|
|
120
|
+
"message/rfc822",
|
|
121
|
+
];
|
|
122
|
+
|
|
123
|
+
for mime in &common_mimes {
|
|
124
|
+
table.intern_string(mime);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
let common_langs = [
|
|
128
|
+
"en", "es", "fr", "de", "zh", "ja", "ko", "pt", "ru", "ar", "hi", "it", "nl", "pl", "tr", "vi", "th", "sv",
|
|
129
|
+
"da", "fi", "no",
|
|
130
|
+
];
|
|
131
|
+
|
|
132
|
+
for lang in &common_langs {
|
|
133
|
+
table.intern_string(lang);
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
let common_encodings = ["UTF-8", "ISO-8859-1", "ASCII", "Windows-1252"];
|
|
137
|
+
|
|
138
|
+
for encoding in &common_encodings {
|
|
139
|
+
table.intern_string(encoding);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
table.total_requests = 0;
|
|
143
|
+
table.cache_hits = 0;
|
|
144
|
+
|
|
145
|
+
table
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/// Intern a string and return pointer to C string.
|
|
149
|
+
fn intern_string(&mut self, s: &str) -> *const c_char {
|
|
150
|
+
self.total_requests += 1;
|
|
151
|
+
|
|
152
|
+
if let Some(entry) = self.strings.get_mut(s) {
|
|
153
|
+
entry.ref_count += 1;
|
|
154
|
+
entry.request_count += 1;
|
|
155
|
+
self.cache_hits += 1;
|
|
156
|
+
entry.c_string.as_ptr()
|
|
157
|
+
} else {
|
|
158
|
+
let c_string = match CString::new(s) {
|
|
159
|
+
Ok(cs) => cs,
|
|
160
|
+
Err(_) => return ptr::null(),
|
|
161
|
+
};
|
|
162
|
+
|
|
163
|
+
let ptr = c_string.as_ptr();
|
|
164
|
+
|
|
165
|
+
self.strings.insert(
|
|
166
|
+
s.to_string(),
|
|
167
|
+
InternedString {
|
|
168
|
+
c_string,
|
|
169
|
+
ref_count: 1,
|
|
170
|
+
request_count: 1,
|
|
171
|
+
},
|
|
172
|
+
);
|
|
173
|
+
|
|
174
|
+
ptr
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/// Free an interned string reference.
|
|
179
|
+
fn free_string(&mut self, ptr: *const c_char) -> bool {
|
|
180
|
+
let key = self
|
|
181
|
+
.strings
|
|
182
|
+
.iter()
|
|
183
|
+
.find(|(_, entry)| entry.c_string.as_ptr() == ptr)
|
|
184
|
+
.map(|(k, _)| k.clone());
|
|
185
|
+
|
|
186
|
+
if let Some(key) = key {
|
|
187
|
+
let entry = self.strings.get_mut(&key).unwrap();
|
|
188
|
+
entry.ref_count -= 1;
|
|
189
|
+
|
|
190
|
+
if entry.ref_count == 0 {
|
|
191
|
+
self.strings.remove(&key);
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
true
|
|
195
|
+
} else {
|
|
196
|
+
false
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
/// Get statistics about interning efficiency.
|
|
201
|
+
fn stats(&self) -> CStringInternStats {
|
|
202
|
+
let total_memory_bytes: usize = self.strings.values().map(|e| e.c_string.as_bytes().len() + 1).sum();
|
|
203
|
+
|
|
204
|
+
let estimated_memory_saved: usize = self
|
|
205
|
+
.strings
|
|
206
|
+
.values()
|
|
207
|
+
.map(|e| {
|
|
208
|
+
if e.request_count > 1 {
|
|
209
|
+
(e.request_count - 1) * (e.c_string.as_bytes().len() + 1)
|
|
210
|
+
} else {
|
|
211
|
+
0
|
|
212
|
+
}
|
|
213
|
+
})
|
|
214
|
+
.sum();
|
|
215
|
+
|
|
216
|
+
CStringInternStats {
|
|
217
|
+
unique_count: self.strings.len(),
|
|
218
|
+
total_requests: self.total_requests,
|
|
219
|
+
cache_hits: self.cache_hits,
|
|
220
|
+
cache_misses: self.total_requests - self.cache_hits,
|
|
221
|
+
estimated_memory_saved,
|
|
222
|
+
total_memory_bytes,
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
/// Global intern table protected by mutex.
|
|
228
|
+
static INTERN_TABLE: Mutex<Option<StringInternTable>> = Mutex::new(None);
|
|
229
|
+
|
|
230
|
+
/// Initialize global intern table.
|
|
231
|
+
fn ensure_intern_table() -> &'static Mutex<Option<StringInternTable>> {
|
|
232
|
+
let mut table = INTERN_TABLE.lock().expect("Mutex poisoned");
|
|
233
|
+
if table.is_none() {
|
|
234
|
+
*table = Some(StringInternTable::new());
|
|
235
|
+
}
|
|
236
|
+
drop(table);
|
|
237
|
+
&INTERN_TABLE
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
/// Intern a string and return pointer to shared C string.
|
|
241
|
+
///
|
|
242
|
+
/// If the string has already been interned, returns pointer to existing allocation.
|
|
243
|
+
/// Otherwise, creates new allocation. Pointer remains valid until all references
|
|
244
|
+
/// are freed with `kreuzberg_free_interned_string()`.
|
|
245
|
+
///
|
|
246
|
+
/// # Arguments
|
|
247
|
+
///
|
|
248
|
+
/// * `s` - Null-terminated UTF-8 string to intern
|
|
249
|
+
///
|
|
250
|
+
/// # Returns
|
|
251
|
+
///
|
|
252
|
+
/// Pointer to interned C string, or NULL on error (invalid UTF-8, allocation failure).
|
|
253
|
+
/// Caller must eventually free with `kreuzberg_free_interned_string()`.
|
|
254
|
+
///
|
|
255
|
+
/// # Reference Counting
|
|
256
|
+
///
|
|
257
|
+
/// Multiple calls with the same string return the same pointer but increment
|
|
258
|
+
/// an internal reference count. The string is freed only when all references
|
|
259
|
+
/// are released.
|
|
260
|
+
///
|
|
261
|
+
/// # Thread Safety
|
|
262
|
+
///
|
|
263
|
+
/// Thread-safe. Multiple threads can call concurrently.
|
|
264
|
+
///
|
|
265
|
+
/// # Safety
|
|
266
|
+
///
|
|
267
|
+
/// - `s` must be valid null-terminated UTF-8 string
|
|
268
|
+
/// - `s` cannot be NULL
|
|
269
|
+
/// - Returned pointer must not be modified
|
|
270
|
+
/// - Caller must call `kreuzberg_free_interned_string()` for each `kreuzberg_intern_string()` call
|
|
271
|
+
///
|
|
272
|
+
/// # Example (C)
|
|
273
|
+
///
|
|
274
|
+
/// ```c
|
|
275
|
+
/// const char* mime1 = kreuzberg_intern_string("application/pdf");
|
|
276
|
+
/// const char* mime2 = kreuzberg_intern_string("application/pdf");
|
|
277
|
+
///
|
|
278
|
+
/// // Same string = same pointer (memory shared)
|
|
279
|
+
/// assert(mime1 == mime2);
|
|
280
|
+
///
|
|
281
|
+
/// // Free each reference
|
|
282
|
+
/// kreuzberg_free_interned_string(mime1);
|
|
283
|
+
/// kreuzberg_free_interned_string(mime2);
|
|
284
|
+
/// ```
|
|
285
|
+
#[unsafe(no_mangle)]
|
|
286
|
+
pub unsafe extern "C" fn kreuzberg_intern_string(s: *const c_char) -> *const c_char {
|
|
287
|
+
clear_last_error();
|
|
288
|
+
|
|
289
|
+
if s.is_null() {
|
|
290
|
+
set_last_error("String cannot be NULL".to_string());
|
|
291
|
+
return ptr::null();
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
let str_ref = match unsafe { CStr::from_ptr(s) }.to_str() {
|
|
295
|
+
Ok(s) => s,
|
|
296
|
+
Err(e) => {
|
|
297
|
+
set_last_error(format!("Invalid UTF-8: {}", e));
|
|
298
|
+
return ptr::null();
|
|
299
|
+
}
|
|
300
|
+
};
|
|
301
|
+
|
|
302
|
+
let table_mutex = ensure_intern_table();
|
|
303
|
+
let mut table = table_mutex.lock().expect("Mutex poisoned");
|
|
304
|
+
|
|
305
|
+
if let Some(ref mut t) = *table {
|
|
306
|
+
t.intern_string(str_ref)
|
|
307
|
+
} else {
|
|
308
|
+
ptr::null()
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
/// Free an interned string reference.
|
|
313
|
+
///
|
|
314
|
+
/// Decrements reference count for the interned string. If reference count
|
|
315
|
+
/// reaches zero, the string is freed from the intern table.
|
|
316
|
+
///
|
|
317
|
+
/// # Arguments
|
|
318
|
+
///
|
|
319
|
+
/// * `s` - Pointer returned by `kreuzberg_intern_string()`
|
|
320
|
+
///
|
|
321
|
+
/// # Safety
|
|
322
|
+
///
|
|
323
|
+
/// - `s` must be a pointer returned by `kreuzberg_intern_string()`
|
|
324
|
+
/// - `s` can be NULL (no-op)
|
|
325
|
+
/// - Must not be called twice on same pointer (double-free)
|
|
326
|
+
/// - Pointer becomes invalid after last reference is freed
|
|
327
|
+
///
|
|
328
|
+
/// # Example (C)
|
|
329
|
+
///
|
|
330
|
+
/// ```c
|
|
331
|
+
/// const char* mime = kreuzberg_intern_string("application/pdf");
|
|
332
|
+
/// // Use mime...
|
|
333
|
+
/// kreuzberg_free_interned_string(mime);
|
|
334
|
+
/// // Don't use mime after this point
|
|
335
|
+
/// ```
|
|
336
|
+
#[unsafe(no_mangle)]
|
|
337
|
+
pub unsafe extern "C" fn kreuzberg_free_interned_string(s: *const c_char) {
|
|
338
|
+
if s.is_null() {
|
|
339
|
+
return;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
clear_last_error();
|
|
343
|
+
|
|
344
|
+
let table_mutex = ensure_intern_table();
|
|
345
|
+
let mut table = table_mutex.lock().expect("Mutex poisoned");
|
|
346
|
+
|
|
347
|
+
if let Some(ref mut t) = *table
|
|
348
|
+
&& !t.free_string(s)
|
|
349
|
+
{
|
|
350
|
+
set_last_error("String not found in intern table".to_string());
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
/// Get statistics about string interning efficiency.
|
|
355
|
+
///
|
|
356
|
+
/// Returns metrics about unique strings, cache hits/misses, and memory savings.
|
|
357
|
+
///
|
|
358
|
+
/// # Returns
|
|
359
|
+
///
|
|
360
|
+
/// Statistics structure with current metrics.
|
|
361
|
+
///
|
|
362
|
+
/// # Example (C)
|
|
363
|
+
///
|
|
364
|
+
/// ```c
|
|
365
|
+
/// CStringInternStats stats = kreuzberg_string_intern_stats();
|
|
366
|
+
/// printf("Interned: %zu unique strings\n", stats.unique_count);
|
|
367
|
+
/// printf("Requests: %zu total (%zu hits, %zu misses)\n",
|
|
368
|
+
/// stats.total_requests, stats.cache_hits, stats.cache_misses);
|
|
369
|
+
/// printf("Memory saved: %zu bytes\n", stats.estimated_memory_saved);
|
|
370
|
+
/// printf("Hit rate: %.1f%%\n",
|
|
371
|
+
/// 100.0 * stats.cache_hits / stats.total_requests);
|
|
372
|
+
/// ```
|
|
373
|
+
#[unsafe(no_mangle)]
|
|
374
|
+
pub extern "C" fn kreuzberg_string_intern_stats() -> CStringInternStats {
|
|
375
|
+
clear_last_error();
|
|
376
|
+
|
|
377
|
+
let table_mutex = ensure_intern_table();
|
|
378
|
+
let table = table_mutex.lock().expect("Mutex poisoned");
|
|
379
|
+
|
|
380
|
+
if let Some(ref t) = *table {
|
|
381
|
+
t.stats()
|
|
382
|
+
} else {
|
|
383
|
+
CStringInternStats {
|
|
384
|
+
unique_count: 0,
|
|
385
|
+
total_requests: 0,
|
|
386
|
+
cache_hits: 0,
|
|
387
|
+
cache_misses: 0,
|
|
388
|
+
estimated_memory_saved: 0,
|
|
389
|
+
total_memory_bytes: 0,
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
|
|
394
|
+
/// Reset the intern table, freeing all interned strings.
|
|
395
|
+
///
|
|
396
|
+
/// **WARNING**: This invalidates all pointers returned by `kreuzberg_intern_string()`.
|
|
397
|
+
/// Only use during shutdown or testing.
|
|
398
|
+
///
|
|
399
|
+
/// # Safety
|
|
400
|
+
///
|
|
401
|
+
/// - Must not be called while any interned string pointers are in use
|
|
402
|
+
/// - All existing interned pointers become invalid
|
|
403
|
+
/// - Thread-safe but can race with concurrent intern operations
|
|
404
|
+
#[unsafe(no_mangle)]
|
|
405
|
+
pub extern "C" fn kreuzberg_string_intern_reset() {
|
|
406
|
+
clear_last_error();
|
|
407
|
+
|
|
408
|
+
let table_mutex = ensure_intern_table();
|
|
409
|
+
let mut table = table_mutex.lock().expect("Mutex poisoned");
|
|
410
|
+
*table = Some(StringInternTable::new());
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CString;
    use std::sync::{Mutex, MutexGuard};

    /// Serializes the tests below that touch the process-global intern table.
    ///
    /// The default test harness runs tests on several threads. Without this
    /// lock, a reset or an intern/free from one test can land between another
    /// test's "before" and "after" snapshots and break its exact-count
    /// assertions (or make a counter subtraction underflow).
    static TEST_SERIAL: Mutex<()> = Mutex::new(());

    /// Acquire the serialization lock, ignoring poisoning so that one failed
    /// test does not cascade into failures of all the others.
    fn serial_guard() -> MutexGuard<'static, ()> {
        TEST_SERIAL.lock().unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Interning equal text twice yields one pointer and at least one cache hit.
    #[test]
    fn test_intern_same_string() {
        let _serial = serial_guard();

        let s1 = CString::new("test_unique_12345").unwrap();
        let s2 = CString::new("test_unique_12345").unwrap();

        unsafe {
            let stats_before = kreuzberg_string_intern_stats();

            let ptr1 = kreuzberg_intern_string(s1.as_ptr());
            let ptr2 = kreuzberg_intern_string(s2.as_ptr());

            assert_eq!(ptr1, ptr2);

            let stats = kreuzberg_string_intern_stats();
            assert!(stats.total_requests - stats_before.total_requests >= 2);
            assert!(stats.cache_hits - stats_before.cache_hits >= 1);

            kreuzberg_free_interned_string(ptr1);
            kreuzberg_free_interned_string(ptr2);
        }
    }

    /// Distinct texts get distinct pointers and each counts as a miss.
    #[test]
    fn test_intern_different_strings() {
        let _serial = serial_guard();

        let s1 = CString::new("test_unique_aaa").unwrap();
        let s2 = CString::new("test_unique_bbb").unwrap();

        unsafe {
            let stats_before = kreuzberg_string_intern_stats();

            let ptr1 = kreuzberg_intern_string(s1.as_ptr());
            let ptr2 = kreuzberg_intern_string(s2.as_ptr());

            assert_ne!(ptr1, ptr2);

            let stats = kreuzberg_string_intern_stats();
            assert!(stats.total_requests - stats_before.total_requests >= 2);
            assert!(stats.cache_misses - stats_before.cache_misses >= 2);

            kreuzberg_free_interned_string(ptr1);
            kreuzberg_free_interned_string(ptr2);
        }
    }

    /// An entry survives until its last reference is released.
    #[test]
    fn test_intern_reference_counting() {
        let _serial = serial_guard();

        let s = CString::new("test_refcount_xyz").unwrap();

        unsafe {
            let ptr1 = kreuzberg_intern_string(s.as_ptr());
            let ptr2 = kreuzberg_intern_string(s.as_ptr());
            let ptr3 = kreuzberg_intern_string(s.as_ptr());

            let stats_before = kreuzberg_string_intern_stats();
            let unique_before = stats_before.unique_count;

            kreuzberg_free_interned_string(ptr1);
            kreuzberg_free_interned_string(ptr2);

            // Two of three references released: entry must still be present.
            let stats_mid = kreuzberg_string_intern_stats();
            assert_eq!(stats_mid.unique_count, unique_before);

            kreuzberg_free_interned_string(ptr3);

            // Last reference released: entry is gone.
            let stats_after = kreuzberg_string_intern_stats();
            assert_eq!(stats_after.unique_count, unique_before - 1);
        }
    }

    /// A freshly reset table already contains common strings, so interning
    /// one of them is a hit and adds no new entry.
    #[test]
    fn test_intern_pre_populated() {
        let _serial = serial_guard();

        kreuzberg_string_intern_reset();

        let stats_initial = kreuzberg_string_intern_stats();
        assert!(stats_initial.unique_count > 0);

        let mime = CString::new("application/pdf").unwrap();

        unsafe {
            let ptr = kreuzberg_intern_string(mime.as_ptr());

            let stats = kreuzberg_string_intern_stats();
            assert_eq!(stats.unique_count, stats_initial.unique_count);
            assert_eq!(stats.cache_hits, 1);

            kreuzberg_free_interned_string(ptr);
        }
    }

    /// Each repeat request is credited with one saved copy (length + NUL).
    #[test]
    fn test_intern_memory_savings() {
        let _serial = serial_guard();

        let test_str = "test_savings_qwerty";
        let s = CString::new(test_str).unwrap();

        unsafe {
            let stats_before = kreuzberg_string_intern_stats();

            let ptr1 = kreuzberg_intern_string(s.as_ptr());
            let ptr2 = kreuzberg_intern_string(s.as_ptr());
            let ptr3 = kreuzberg_intern_string(s.as_ptr());

            let stats = kreuzberg_string_intern_stats();
            let savings_delta = stats.estimated_memory_saved - stats_before.estimated_memory_saved;
            assert!(savings_delta > 0);
            assert_eq!(savings_delta, 2 * (test_str.len() + 1));

            kreuzberg_free_interned_string(ptr1);
            kreuzberg_free_interned_string(ptr2);
            kreuzberg_free_interned_string(ptr3);
        }
    }

    /// NULL input is rejected before the table is touched, so no
    /// serialization is needed here.
    #[test]
    fn test_intern_null_string() {
        unsafe {
            let ptr = kreuzberg_intern_string(ptr::null());
            assert!(ptr.is_null());
        }
    }

    /// Freeing NULL returns immediately without touching the table.
    #[test]
    fn test_free_null_string() {
        unsafe {
            kreuzberg_free_interned_string(ptr::null());
        }
    }

    /// Counters move by at least the expected amounts for a hit/miss mix.
    #[test]
    fn test_intern_stats_format() {
        let _serial = serial_guard();

        let s1 = CString::new("test_stats_1").unwrap();
        let s2 = CString::new("test_stats_2").unwrap();

        unsafe {
            let stats_before = kreuzberg_string_intern_stats();

            let ptr1 = kreuzberg_intern_string(s1.as_ptr());
            let ptr2 = kreuzberg_intern_string(s1.as_ptr());
            let ptr3 = kreuzberg_intern_string(s2.as_ptr());

            let stats = kreuzberg_string_intern_stats();
            assert!(stats.unique_count > 0);
            assert!(stats.total_requests - stats_before.total_requests >= 3);
            assert!(stats.cache_hits - stats_before.cache_hits >= 1);
            assert!(stats.cache_misses - stats_before.cache_misses >= 2);

            kreuzberg_free_interned_string(ptr1);
            kreuzberg_free_interned_string(ptr2);
            kreuzberg_free_interned_string(ptr3);
        }
    }
}
|