kreuzberg 4.0.0.rc2 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +396 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -1,154 +1,154 @@
|
|
|
1
|
-
use std::any::Any;
|
|
2
|
-
use std::time::{SystemTime, UNIX_EPOCH};
|
|
3
|
-
|
|
4
|
-
/// Snapshot of the circumstances surrounding a captured panic.
///
/// Bundles the reported source location, the message recovered from the panic
/// payload, and a capture timestamp, so the failure can be described later —
/// for example when reporting errors across FFI boundaries.
#[derive(Debug, Clone)]
pub struct PanicContext {
    /// Path of the source file associated with the panic.
    pub file: &'static str,
    /// Line number associated with the panic.
    pub line: u32,
    /// Name of the function associated with the panic.
    pub function: &'static str,
    /// Message text recovered from the panic payload.
    pub message: String,
    /// Time at which the panic was captured.
    pub timestamp: SystemTime,
}
|
|
21
|
-
|
|
22
|
-
impl PanicContext {
|
|
23
|
-
/// Creates a new PanicContext with the given parameters.
|
|
24
|
-
///
|
|
25
|
-
/// # Arguments
|
|
26
|
-
///
|
|
27
|
-
/// * `file` - Source file path
|
|
28
|
-
/// * `line` - Line number
|
|
29
|
-
/// * `function` - Function name
|
|
30
|
-
/// * `panic_info` - The panic payload to extract message from
|
|
31
|
-
pub fn new(file: &'static str, line: u32, function: &'static str, panic_info: &dyn Any) -> Self {
|
|
32
|
-
let timestamp = std::panic::catch_unwind(SystemTime::now).unwrap_or(UNIX_EPOCH);
|
|
33
|
-
|
|
34
|
-
Self {
|
|
35
|
-
file,
|
|
36
|
-
line,
|
|
37
|
-
function,
|
|
38
|
-
message: extract_panic_message(panic_info),
|
|
39
|
-
timestamp,
|
|
40
|
-
}
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
/// Formats the panic context as a human-readable string.
|
|
44
|
-
pub fn format(&self) -> String {
|
|
45
|
-
format!(
|
|
46
|
-
"Panic at {}:{}:{} - {}",
|
|
47
|
-
self.file, self.line, self.function, self.message
|
|
48
|
-
)
|
|
49
|
-
}
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
/// Maximum number of message bytes kept from a panic payload, to prevent DoS
/// attacks via extremely large panic messages.
const MAX_PANIC_MESSAGE_LEN: usize = 4096;

/// Extracts a human-readable message from a panic payload.
///
/// Attempts to downcast the panic payload to the common payload types
/// (`String`, `&str`). Any other payload type yields the fixed text
/// `"Unknown panic payload"`.
///
/// Messages longer than `MAX_PANIC_MESSAGE_LEN` bytes are cut at the nearest
/// UTF-8 character boundary at or below that limit and then suffixed with
/// `"... [truncated]"`; the returned string can therefore be slightly longer
/// than the limit itself. The payload is only borrowed while measuring and
/// slicing, so an oversized message is never copied in full.
///
/// # Arguments
///
/// * `panic_info` - The panic payload from catch_unwind
///
/// # Returns
///
/// A string representation of the panic message (truncated if necessary)
pub fn extract_panic_message(panic_info: &dyn Any) -> String {
    // Borrow the text instead of cloning it up front: only the (bounded)
    // result is ever allocated.
    let text: &str = if let Some(owned) = panic_info.downcast_ref::<String>() {
        owned.as_str()
    } else if let Some(literal) = panic_info.downcast_ref::<&str>() {
        *literal
    } else {
        "Unknown panic payload"
    };

    if text.len() <= MAX_PANIC_MESSAGE_LEN {
        return text.to_owned();
    }

    // Step back to the closest char boundary so the slice below cannot split a
    // multi-byte character. Index 0 is always a boundary, so this terminates.
    // `is_char_boundary` is used rather than the newer `str::floor_char_boundary`
    // so the code also builds on older stable toolchains.
    let mut cut = MAX_PANIC_MESSAGE_LEN;
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }

    format!("{}... [truncated]", &text[..cut])
}
|
|
85
|
-
|
|
86
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// A `String` payload is returned verbatim.
    #[test]
    fn test_extract_panic_message_string() {
        let payload = String::from("test panic");
        assert_eq!(extract_panic_message(&payload), "test panic");
    }

    /// A `&str` payload is returned verbatim.
    #[test]
    fn test_extract_panic_message_str() {
        let payload: &str = "test panic";
        assert_eq!(extract_panic_message(&payload), "test panic");
    }

    /// Payloads that are neither `String` nor `&str` map to the fallback text.
    #[test]
    fn test_extract_panic_message_unknown() {
        let payload = 42i32;
        assert_eq!(extract_panic_message(&payload), "Unknown panic payload");
    }

    /// The formatted context mentions file, line, function and message.
    #[test]
    fn test_panic_context_format() {
        let payload = String::from("test error");
        let rendered = PanicContext::new("test.rs", 42, "test_function", &payload).format();

        for fragment in ["test.rs", "42", "test_function", "test error"] {
            assert!(rendered.contains(fragment));
        }
    }

    /// Oversized messages are shortened and carry the truncation marker.
    #[test]
    fn test_panic_message_truncation() {
        let oversized = "x".repeat(5000);
        let result = extract_panic_message(&oversized);

        assert!(result.len() <= MAX_PANIC_MESSAGE_LEN + 20);
        assert!(result.ends_with("... [truncated]"));
    }

    /// Truncation never splits a multi-byte character that straddles the limit.
    #[test]
    fn test_panic_message_truncation_utf8_boundary() {
        // 4093 ASCII bytes + a 4-byte emoji puts the limit inside the emoji.
        let mut payload = "x".repeat(4093);
        payload.push('🦀');
        payload.push_str("yyy");

        let result = extract_panic_message(&payload);

        assert!(result.ends_with("... [truncated]"));
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.contains("🦀"));
        assert!(!result.contains("yyy"));
    }

    /// Short messages come back untouched, without the truncation marker.
    #[test]
    fn test_panic_message_no_truncation_needed() {
        let payload = String::from("short");
        let result = extract_panic_message(&payload);

        assert_eq!(result, "short");
        assert!(!result.contains("[truncated]"));
    }
}
|
|
1
|
+
use std::any::Any;
|
|
2
|
+
use std::time::{SystemTime, UNIX_EPOCH};
|
|
3
|
+
|
|
4
|
+
/// Snapshot of the circumstances surrounding a captured panic.
///
/// Bundles the reported source location, the message recovered from the panic
/// payload, and a capture timestamp, so the failure can be described later —
/// for example when reporting errors across FFI boundaries.
#[derive(Debug, Clone)]
pub struct PanicContext {
    /// Path of the source file associated with the panic.
    pub file: &'static str,
    /// Line number associated with the panic.
    pub line: u32,
    /// Name of the function associated with the panic.
    pub function: &'static str,
    /// Message text recovered from the panic payload.
    pub message: String,
    /// Time at which the panic was captured.
    pub timestamp: SystemTime,
}
|
|
21
|
+
|
|
22
|
+
impl PanicContext {
|
|
23
|
+
/// Creates a new PanicContext with the given parameters.
|
|
24
|
+
///
|
|
25
|
+
/// # Arguments
|
|
26
|
+
///
|
|
27
|
+
/// * `file` - Source file path
|
|
28
|
+
/// * `line` - Line number
|
|
29
|
+
/// * `function` - Function name
|
|
30
|
+
/// * `panic_info` - The panic payload to extract message from
|
|
31
|
+
pub fn new(file: &'static str, line: u32, function: &'static str, panic_info: &dyn Any) -> Self {
|
|
32
|
+
let timestamp = std::panic::catch_unwind(SystemTime::now).unwrap_or(UNIX_EPOCH);
|
|
33
|
+
|
|
34
|
+
Self {
|
|
35
|
+
file,
|
|
36
|
+
line,
|
|
37
|
+
function,
|
|
38
|
+
message: extract_panic_message(panic_info),
|
|
39
|
+
timestamp,
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
/// Formats the panic context as a human-readable string.
|
|
44
|
+
pub fn format(&self) -> String {
|
|
45
|
+
format!(
|
|
46
|
+
"Panic at {}:{}:{} - {}",
|
|
47
|
+
self.file, self.line, self.function, self.message
|
|
48
|
+
)
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/// Maximum number of payload bytes kept from a panic message.
///
/// Bounding the size prevents DoS via extremely large panic messages.
const MAX_PANIC_MESSAGE_LEN: usize = 4096;

/// Extracts a human-readable message from a panic payload.
///
/// `String` and `&str` payloads yield their text; any other payload type
/// yields `"Unknown panic payload"`. Text longer than
/// `MAX_PANIC_MESSAGE_LEN` bytes is cut at the closest UTF-8 character
/// boundary at or below the limit and suffixed with `"... [truncated]"`.
///
/// The payload is only borrowed until the final string is built, so an
/// oversized message is never copied in full: at most the retained prefix is
/// allocated.
///
/// # Arguments
///
/// * `panic_info` - The panic payload from `catch_unwind`
///
/// # Returns
///
/// The panic message as an owned string (truncated if necessary)
pub fn extract_panic_message(panic_info: &dyn Any) -> String {
    let text: &str = if let Some(owned) = panic_info.downcast_ref::<String>() {
        owned.as_str()
    } else if let Some(literal) = panic_info.downcast_ref::<&str>() {
        *literal
    } else {
        "Unknown panic payload"
    };

    if text.len() <= MAX_PANIC_MESSAGE_LEN {
        return text.to_owned();
    }

    // Step back to the nearest character boundary so the slice below cannot
    // split a multi-byte scalar. Offset 0 is always a boundary, so the loop
    // terminates; this mirrors `str::floor_char_boundary`.
    let mut cut = MAX_PANIC_MESSAGE_LEN;
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }

    format!("{}... [truncated]", &text[..cut])
}
|
|
85
|
+
|
|
86
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Marker expected at the end of a shortened message.
    const TRUNCATION_SUFFIX: &str = "... [truncated]";

    #[test]
    fn test_extract_panic_message_string() {
        let payload = String::from("test panic");
        assert_eq!(extract_panic_message(&payload), "test panic");
    }

    #[test]
    fn test_extract_panic_message_str() {
        let payload: &str = "test panic";
        assert_eq!(extract_panic_message(&payload), "test panic");
    }

    #[test]
    fn test_extract_panic_message_unknown() {
        // A payload that is neither `String` nor `&str` gets the fallback text.
        let payload = 42i32;
        assert_eq!(extract_panic_message(&payload), "Unknown panic payload");
    }

    #[test]
    fn test_panic_context_format() {
        let payload = String::from("test error");
        let rendered = PanicContext::new("test.rs", 42, "test_function", &payload).format();

        // Every recorded component must show up in the rendered text.
        for fragment in ["test.rs", "42", "test_function", "test error"] {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn test_panic_message_truncation() {
        let payload = "x".repeat(5000);
        let extracted = extract_panic_message(&payload);

        assert!(extracted.len() <= MAX_PANIC_MESSAGE_LEN + 20);
        assert!(extracted.ends_with(TRUNCATION_SUFFIX));
    }

    #[test]
    fn test_panic_message_truncation_utf8_boundary() {
        // The crab occupies bytes 4093..4097, so the 4096-byte limit lands
        // inside it and the cut has to back up to byte 4093.
        let payload = format!("{}🦀yyy", "x".repeat(4093));

        let truncated = extract_panic_message(&payload);

        assert!(truncated.ends_with(TRUNCATION_SUFFIX));
        assert!(std::str::from_utf8(truncated.as_bytes()).is_ok());
        assert!(!truncated.contains("🦀"));
        assert!(!truncated.contains("yyy"));
    }

    #[test]
    fn test_panic_message_no_truncation_needed() {
        let payload = String::from("short");
        let extracted = extract_panic_message(&payload);

        assert_eq!(extracted, "short");
        assert!(!extracted.contains("[truncated]"));
    }
}
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
use super::error::PdfError;
|
|
2
|
+
use pdfium_render::prelude::*;
|
|
3
|
+
use std::ops::Deref;
|
|
4
|
+
use std::path::PathBuf;
|
|
5
|
+
use std::sync::{Mutex, MutexGuard, OnceLock};
|
|
6
|
+
|
|
7
|
+
/// Global singleton for the Pdfium instance.
|
|
8
|
+
///
|
|
9
|
+
/// The pdfium-render library only allows binding to the Pdfium library ONCE per process.
|
|
10
|
+
/// Subsequent calls to `Pdfium::bind_to_library()` or `Pdfium::bind_to_system_library()`
|
|
11
|
+
/// will fail with a library loading error because the dynamic library is already loaded.
|
|
12
|
+
///
|
|
13
|
+
/// Additionally, `Pdfium::new()` calls `FPDF_InitLibrary()` which must only be called once,
|
|
14
|
+
/// and when `Pdfium` is dropped, it calls `FPDF_DestroyLibrary()` which would invalidate
|
|
15
|
+
/// all subsequent PDF operations.
|
|
16
|
+
///
|
|
17
|
+
/// This singleton ensures:
|
|
18
|
+
/// 1. Library binding happens exactly once (on first access)
|
|
19
|
+
/// 2. `FPDF_InitLibrary()` is called exactly once
|
|
20
|
+
/// 3. The `Pdfium` instance is never dropped, so `FPDF_DestroyLibrary()` is never called
|
|
21
|
+
/// 4. All callers share the same `Pdfium` instance safely
|
|
22
|
+
///
|
|
23
|
+
/// CRITICAL: We use `&'static Pdfium` (a leaked reference) instead of `Pdfium` to prevent
|
|
24
|
+
/// the instance from being dropped during process exit. Without this, when Rust's runtime
|
|
25
|
+
/// cleans up static variables during process teardown, the Pdfium destructor runs and calls
|
|
26
|
+
/// `FPDF_DestroyLibrary()`, which can cause segfaults/SIGTRAP (exit code 201 on macOS) in
|
|
27
|
+
/// FFI scenarios, especially in Go tests where cgo cleanup happens in a specific order.
|
|
28
|
+
static PDFIUM_SINGLETON: OnceLock<Result<&'static Pdfium, String>> = OnceLock::new();
|
|
29
|
+
|
|
30
|
+
/// Process-wide lock that serializes every PDFium operation.
///
/// PDFium is NOT thread-safe. pdfium-render exposes a safe Rust API, but the
/// underlying C library can crash when several threads use it concurrently,
/// which is a real risk in batch mode where multiple `spawn_blocking` tasks may
/// process PDFs at the same time. Holding this mutex for the duration of each
/// operation means only one thread executes PDFium code at any moment.
///
/// The lock is a plain `std::sync::Mutex`, which is not re-entrant: a thread
/// that already holds the guard must release it before locking again.
///
/// # Performance Impact
///
/// PDFs in a batch are processed one after another instead of in parallel.
/// Other document types (text, HTML, etc.) are unaffected and still run in
/// parallel, so mixed workloads keep good overall throughput.
///
/// # Alternatives Considered
///
/// 1. **Process-based parallelism**: separate processes per PDF extraction would
///    give true parallelism, at the cost of significant complexity and overhead.
///
/// 2. **Thread-local PDFium instances**: not possible, because the library can be
///    bound only once per process (`FPDF_InitLibrary` can only be called once).
///
/// 3. **Disable batch mode for PDFs**: would require the batch orchestration to
///    detect PDF inputs and route them differently.
static PDFIUM_OPERATION_LOCK: Mutex<()> = Mutex::new(());
|
|
58
|
+
|
|
59
|
+
/// Extracts the bundled Pdfium library (when that support is compiled in) and
/// returns the directory it was extracted to.
///
/// Exactly one of the two `cfg` branches below is compiled. The fallback uses
/// the exact negation of the bundled predicate, so no feature combination can
/// leave the function without a value.
///
/// # Returns
///
/// * `Ok(Some(dir))` - bundled build: the parent directory of the extracted library
/// * `Ok(None)` - no bundled library in this build
///
/// # Errors
///
/// Returns a message when extraction fails or when the extracted path has no
/// parent directory.
fn extract_and_get_lib_dir() -> Result<Option<PathBuf>, String> {
    #[cfg(all(feature = "pdf", feature = "bundled-pdfium", not(target_arch = "wasm32")))]
    {
        let lib_path =
            crate::pdf::extract_bundled_pdfium().map_err(|e| format!("Failed to extract bundled Pdfium: {}", e))?;

        let lib_dir = lib_path.parent().ok_or_else(|| {
            format!(
                "Failed to determine Pdfium extraction directory for '{}'",
                lib_path.display()
            )
        })?;

        Ok(Some(lib_dir.to_path_buf()))
    }

    #[cfg(not(all(feature = "pdf", feature = "bundled-pdfium", not(target_arch = "wasm32"))))]
    {
        Ok(None)
    }
}
|
|
83
|
+
|
|
84
|
+
/// Bind to the Pdfium library and create bindings.
|
|
85
|
+
///
|
|
86
|
+
/// This function is only called once during singleton initialization.
|
|
87
|
+
fn create_pdfium_bindings(lib_dir: &Option<PathBuf>) -> Result<Box<dyn PdfiumLibraryBindings>, String> {
|
|
88
|
+
let _ = lib_dir;
|
|
89
|
+
|
|
90
|
+
#[cfg(all(feature = "pdf", feature = "bundled-pdfium", not(target_arch = "wasm32")))]
|
|
91
|
+
{
|
|
92
|
+
if let Some(dir) = lib_dir {
|
|
93
|
+
return Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(dir))
|
|
94
|
+
.map_err(|e| format!("Failed to bind to Pdfium library: {}", e));
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// For system library or WASM
|
|
99
|
+
Pdfium::bind_to_system_library().map_err(|e| format!("Failed to bind to system Pdfium library: {}", e))
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/// Initialize the Pdfium singleton.
|
|
103
|
+
///
|
|
104
|
+
/// This function performs the one-time initialization:
|
|
105
|
+
/// 1. Extracts bundled library if using `bundled-pdfium` feature
|
|
106
|
+
/// 2. Creates bindings to the Pdfium library
|
|
107
|
+
/// 3. Creates and leaks the `Pdfium` instance to prevent cleanup during process exit
|
|
108
|
+
///
|
|
109
|
+
/// This is only called once, on first access to the singleton.
|
|
110
|
+
///
|
|
111
|
+
/// CRITICAL: We intentionally leak the Pdfium instance using `Box::leak()` to prevent
|
|
112
|
+
/// it from being dropped during process exit. If the instance were dropped, it would call
|
|
113
|
+
/// `FPDF_DestroyLibrary()` which causes segfaults/SIGTRAP in FFI scenarios (exit code 201
|
|
114
|
+
/// on macOS), particularly visible in Go tests where cgo cleanup order matters.
|
|
115
|
+
fn initialize_pdfium() -> Result<&'static Pdfium, String> {
|
|
116
|
+
// Step 1: Extract bundled library (if applicable)
|
|
117
|
+
let lib_dir = extract_and_get_lib_dir()?;
|
|
118
|
+
|
|
119
|
+
// Step 2: Create bindings to the library
|
|
120
|
+
let bindings = create_pdfium_bindings(&lib_dir)?;
|
|
121
|
+
|
|
122
|
+
// Step 3: Create Pdfium instance (this calls FPDF_InitLibrary)
|
|
123
|
+
let pdfium = Pdfium::new(bindings);
|
|
124
|
+
|
|
125
|
+
// Step 4: Leak the instance to prevent Drop from being called during process exit
|
|
126
|
+
// This is intentional and necessary for FFI safety across language boundaries
|
|
127
|
+
Ok(Box::leak(Box::new(pdfium)))
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
/// Exclusive, scoped access to the global `Pdfium` instance.
///
/// The handle implements `Deref<Target = Pdfium>`, so it can stand in wherever
/// a `&Pdfium` is expected.
///
/// # Design
///
/// Each handle owns a guard on `PDFIUM_OPERATION_LOCK`. Dropping the handle
/// drops the guard, which releases the lock for the next thread. Together with
/// the singleton this gives:
/// - one initialization of the Pdfium library
/// - no destruction of the library during the process lifetime
/// - at most one thread inside PDFium at a time
/// - automatic unlock when the handle goes out of scope
///
/// # Thread Safety
///
/// PDFium is NOT thread-safe, so every PDFium operation is serialized through
/// this handle. That rules out parallel PDF processing, but it keeps batch
/// processing correct and crash-free.
pub(crate) struct PdfiumHandle<'a> {
    // Never read: the guard exists only to keep the PDFium lock held until the
    // handle is dropped.
    #[allow(dead_code)]
    _guard: MutexGuard<'a, ()>,
}
|
|
158
|
+
|
|
159
|
+
impl Deref for PdfiumHandle<'_> {
|
|
160
|
+
type Target = Pdfium;
|
|
161
|
+
|
|
162
|
+
fn deref(&self) -> &Self::Target {
|
|
163
|
+
// SAFETY: We only create PdfiumHandle after successfully initializing
|
|
164
|
+
// the singleton, so this unwrap is guaranteed to succeed.
|
|
165
|
+
// The Result inside is also guaranteed to be Ok because bind_pdfium()
|
|
166
|
+
// only returns PdfiumHandle on success.
|
|
167
|
+
// Since we now store &'static Pdfium, we can directly dereference it.
|
|
168
|
+
PDFIUM_SINGLETON.get().unwrap().as_ref().unwrap()
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
/// Get a handle to the Pdfium library with lazy initialization.
|
|
173
|
+
///
|
|
174
|
+
/// The first call to this function triggers initialization of the global Pdfium singleton.
|
|
175
|
+
/// This includes:
|
|
176
|
+
/// - Extracting the bundled Pdfium library (if using `bundled-pdfium` feature)
|
|
177
|
+
/// - Loading and binding to the Pdfium dynamic library
|
|
178
|
+
/// - Calling `FPDF_InitLibrary()` to initialize the library
|
|
179
|
+
///
|
|
180
|
+
/// Subsequent calls return immediately with a handle to the same singleton instance.
|
|
181
|
+
///
|
|
182
|
+
/// # Arguments
|
|
183
|
+
///
|
|
184
|
+
/// * `map_err` - Function to convert error strings into `PdfError` variants
|
|
185
|
+
/// * `context` - Context string for error messages (e.g., "text extraction")
|
|
186
|
+
///
|
|
187
|
+
/// # Returns
|
|
188
|
+
///
|
|
189
|
+
/// A `PdfiumHandle` that provides access to the global `Pdfium` instance via `Deref`.
|
|
190
|
+
/// The handle can be used anywhere a `&Pdfium` reference is expected.
|
|
191
|
+
///
|
|
192
|
+
/// # Performance
|
|
193
|
+
///
|
|
194
|
+
/// - **First call**: Performs full initialization (~8-12ms for bundled extraction + binding)
|
|
195
|
+
/// - **Subsequent calls**: Returns immediately (just fetches from `OnceLock`, ~nanoseconds)
|
|
196
|
+
///
|
|
197
|
+
/// This lazy initialization defers Pdfium setup until the first PDF is processed,
|
|
198
|
+
/// improving cold start time for non-PDF workloads.
|
|
199
|
+
///
|
|
200
|
+
/// # Thread Safety
|
|
201
|
+
///
|
|
202
|
+
/// This function is thread-safe but SERIALIZES access to PDFium:
|
|
203
|
+
/// - The `OnceLock` ensures initialization happens exactly once
|
|
204
|
+
/// - The `PDFIUM_OPERATION_LOCK` mutex ensures only one thread can access PDFium at a time
|
|
205
|
+
/// - The returned `PdfiumHandle` holds the mutex guard; when dropped, the lock is released
|
|
206
|
+
///
|
|
207
|
+
/// This serialization is necessary because PDFium is NOT thread-safe. Concurrent access
|
|
208
|
+
/// to PDFium from multiple threads causes crashes (segfaults, abort traps).
|
|
209
|
+
///
|
|
210
|
+
/// # Error Handling
|
|
211
|
+
///
|
|
212
|
+
/// If initialization fails (e.g., library not found, extraction failed), the error
|
|
213
|
+
/// is cached and returned on all subsequent calls. The process cannot recover from
|
|
214
|
+
/// a failed initialization - restart the process to retry.
|
|
215
|
+
///
|
|
216
|
+
/// # Example
|
|
217
|
+
///
|
|
218
|
+
/// ```ignore
|
|
219
|
+
/// // First call initializes the singleton
|
|
220
|
+
/// let pdfium = bind_pdfium(PdfError::TextExtractionFailed, "text extraction")?;
|
|
221
|
+
///
|
|
222
|
+
/// // Use it like a &Pdfium
|
|
223
|
+
/// let document = pdfium.load_pdf_from_byte_slice(bytes, None)?;
|
|
224
|
+
///
|
|
225
|
+
/// // Subsequent calls return immediately
|
|
226
|
+
/// let pdfium2 = bind_pdfium(PdfError::RenderingFailed, "page rendering")?;
|
|
227
|
+
/// // pdfium and pdfium2 reference the same underlying instance
|
|
228
|
+
/// ```
|
|
229
|
+
pub(crate) fn bind_pdfium(
|
|
230
|
+
map_err: fn(String) -> PdfError,
|
|
231
|
+
context: &'static str,
|
|
232
|
+
) -> Result<PdfiumHandle<'static>, PdfError> {
|
|
233
|
+
// Acquire exclusive lock on PDFium operations.
|
|
234
|
+
// This prevents concurrent access to PDFium which is NOT thread-safe.
|
|
235
|
+
// The lock is held for the duration of the PdfiumHandle's lifetime.
|
|
236
|
+
let guard = PDFIUM_OPERATION_LOCK
|
|
237
|
+
.lock()
|
|
238
|
+
.map_err(|e| map_err(format!("PDFium operation lock poisoned ({}): {}", context, e)))?;
|
|
239
|
+
|
|
240
|
+
// Initialize the singleton on first access, or get the cached result
|
|
241
|
+
let result = PDFIUM_SINGLETON.get_or_init(initialize_pdfium);
|
|
242
|
+
|
|
243
|
+
// Convert the cached Result into our return type
|
|
244
|
+
match result {
|
|
245
|
+
Ok(_) => Ok(PdfiumHandle { _guard: guard }),
|
|
246
|
+
Err(cached_error) => Err(map_err(format!(
|
|
247
|
+
"Pdfium initialization failed ({}): {}",
|
|
248
|
+
context, cached_error
|
|
249
|
+
))),
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pdf::error::PdfError;

    // NOTE: every `PdfiumHandle` owns the global, non-reentrant PDFium lock.
    // A test must never call `bind_pdfium` while it still holds a handle, or
    // the second call will not return.

    #[test]
    fn test_bind_pdfium_lazy_initialization() {
        let result = bind_pdfium(PdfError::TextExtractionFailed, "test context");
        assert!(result.is_ok(), "First bind_pdfium call should succeed");
    }

    #[test]
    fn test_bind_pdfium_multiple_calls() {
        // Each temporary result (and the handle inside it) is dropped at the
        // end of its statement, so the lock is free again for the next call.
        let first_ok = bind_pdfium(PdfError::TextExtractionFailed, "test 1").is_ok();
        let second_ok = bind_pdfium(PdfError::TextExtractionFailed, "test 2").is_ok();

        assert!(first_ok, "First call should succeed");
        assert!(second_ok, "Second call should also succeed");
    }

    #[test]
    fn test_bind_pdfium_returns_same_instance() {
        // Record the address behind the first handle, then release the handle
        // before binding again.
        let handle1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1").unwrap();
        let ptr1 = &*handle1 as *const Pdfium;
        drop(handle1);

        let handle2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2").unwrap();
        let ptr2 = &*handle2 as *const Pdfium;
        drop(handle2);

        // Both handles should dereference to the same Pdfium instance
        assert_eq!(ptr1, ptr2, "Both handles should reference the same Pdfium instance");
    }

    #[test]
    fn test_bind_pdfium_error_mapping() {
        let map_err = |msg: String| PdfError::TextExtractionFailed(msg);

        let test_error = map_err("test".to_string());
        match test_error {
            PdfError::TextExtractionFailed(msg) => {
                assert_eq!(msg, "test");
            }
            _ => panic!("Error mapping failed"),
        }
    }

    #[test]
    fn test_pdfium_handle_deref() {
        let handle = bind_pdfium(PdfError::TextExtractionFailed, "test").unwrap();

        // Test that we can use the handle like a &Pdfium by calling a method
        // that requires &Pdfium. create_new_pdf() takes &self and returns a Result.
        let result = handle.create_new_pdf();
        assert!(result.is_ok(), "Should be able to create a new PDF document");
    }
}
|