kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,80 +1,80 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'json'
|
|
4
|
-
|
|
5
|
-
module Kreuzberg
|
|
6
|
-
# ErrorContext module provides access to FFI error introspection functions.
|
|
7
|
-
#
|
|
8
|
-
# This module retrieves detailed error and panic context information from the native
|
|
9
|
-
# Rust core. It allows inspection of the last error that occurred during extraction,
|
|
10
|
-
# including panic information with file, line, function, and timestamp details.
|
|
11
|
-
module ErrorContext
|
|
12
|
-
class << self
|
|
13
|
-
# Get the error code of the last operation.
|
|
14
|
-
#
|
|
15
|
-
# Returns the error code from the last FFI call. Returns 0 (SUCCESS) if no error
|
|
16
|
-
# occurred or if introspection fails.
|
|
17
|
-
#
|
|
18
|
-
# @return [Integer] Error code constant (ERROR_CODE_* values), or 0 on success
|
|
19
|
-
#
|
|
20
|
-
# @example Check last error
|
|
21
|
-
# code = Kreuzberg::ErrorContext.last_error_code
|
|
22
|
-
# case code
|
|
23
|
-
# when Kreuzberg::ERROR_CODE_IO
|
|
24
|
-
# puts "I/O error occurred"
|
|
25
|
-
# when Kreuzberg::ERROR_CODE_PARSING
|
|
26
|
-
# puts "Parsing error occurred"
|
|
27
|
-
# else
|
|
28
|
-
# puts "Success or unknown error"
|
|
29
|
-
# end
|
|
30
|
-
def last_error_code
|
|
31
|
-
Kreuzberg._last_error_code_native
|
|
32
|
-
rescue StandardError
|
|
33
|
-
0
|
|
34
|
-
end
|
|
35
|
-
|
|
36
|
-
# Get panic context information from the last error.
|
|
37
|
-
#
|
|
38
|
-
# Returns a {Errors::PanicContext} object containing detailed information about
|
|
39
|
-
# the last panic that occurred in the Rust core. Includes file path, line number,
|
|
40
|
-
# function name, error message, and timestamp.
|
|
41
|
-
#
|
|
42
|
-
# @return [Errors::PanicContext, nil] Panic context if a panic occurred, nil otherwise
|
|
43
|
-
#
|
|
44
|
-
# @example Get panic details
|
|
45
|
-
# panic = Kreuzberg::ErrorContext.last_panic_context
|
|
46
|
-
# if panic
|
|
47
|
-
# puts "Panic at #{panic.file}:#{panic.line} in #{panic.function}"
|
|
48
|
-
# puts "Message: #{panic.message}"
|
|
49
|
-
# puts "Time: #{panic.timestamp_secs}"
|
|
50
|
-
# end
|
|
51
|
-
def last_panic_context
|
|
52
|
-
json_str = Kreuzberg._last_panic_context_json_native
|
|
53
|
-
return nil unless json_str
|
|
54
|
-
|
|
55
|
-
Errors::PanicContext.from_json(json_str)
|
|
56
|
-
rescue StandardError
|
|
57
|
-
nil
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
# Get panic context as raw JSON string.
|
|
61
|
-
#
|
|
62
|
-
# Returns the panic context information as a JSON string for raw access or
|
|
63
|
-
# custom parsing. Returns nil if no panic has occurred.
|
|
64
|
-
#
|
|
65
|
-
# @return [String, nil] JSON-serialized panic context, or nil if no panic
|
|
66
|
-
#
|
|
67
|
-
# @example Get raw JSON panic context
|
|
68
|
-
# json = Kreuzberg::ErrorContext.last_panic_context_json
|
|
69
|
-
# if json
|
|
70
|
-
# panic_data = JSON.parse(json)
|
|
71
|
-
# puts panic_data
|
|
72
|
-
# end
|
|
73
|
-
def last_panic_context_json
|
|
74
|
-
Kreuzberg._last_panic_context_json_native
|
|
75
|
-
rescue StandardError
|
|
76
|
-
nil
|
|
77
|
-
end
|
|
78
|
-
end
|
|
79
|
-
end
|
|
80
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Kreuzberg
|
|
6
|
+
# ErrorContext module provides access to FFI error introspection functions.
|
|
7
|
+
#
|
|
8
|
+
# This module retrieves detailed error and panic context information from the native
|
|
9
|
+
# Rust core. It allows inspection of the last error that occurred during extraction,
|
|
10
|
+
# including panic information with file, line, function, and timestamp details.
|
|
11
|
+
module ErrorContext
|
|
12
|
+
class << self
|
|
13
|
+
# Get the error code of the last operation.
|
|
14
|
+
#
|
|
15
|
+
# Returns the error code from the last FFI call. Returns 0 (SUCCESS) if no error
|
|
16
|
+
# occurred or if introspection fails.
|
|
17
|
+
#
|
|
18
|
+
# @return [Integer] Error code constant (ERROR_CODE_* values), or 0 on success
|
|
19
|
+
#
|
|
20
|
+
# @example Check last error
|
|
21
|
+
# code = Kreuzberg::ErrorContext.last_error_code
|
|
22
|
+
# case code
|
|
23
|
+
# when Kreuzberg::ERROR_CODE_IO
|
|
24
|
+
# puts "I/O error occurred"
|
|
25
|
+
# when Kreuzberg::ERROR_CODE_PARSING
|
|
26
|
+
# puts "Parsing error occurred"
|
|
27
|
+
# else
|
|
28
|
+
# puts "Success or unknown error"
|
|
29
|
+
# end
|
|
30
|
+
def last_error_code
|
|
31
|
+
Kreuzberg._last_error_code_native
|
|
32
|
+
rescue StandardError
|
|
33
|
+
0
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# Get panic context information from the last error.
|
|
37
|
+
#
|
|
38
|
+
# Returns a {Errors::PanicContext} object containing detailed information about
|
|
39
|
+
# the last panic that occurred in the Rust core. Includes file path, line number,
|
|
40
|
+
# function name, error message, and timestamp.
|
|
41
|
+
#
|
|
42
|
+
# @return [Errors::PanicContext, nil] Panic context if a panic occurred, nil otherwise
|
|
43
|
+
#
|
|
44
|
+
# @example Get panic details
|
|
45
|
+
# panic = Kreuzberg::ErrorContext.last_panic_context
|
|
46
|
+
# if panic
|
|
47
|
+
# puts "Panic at #{panic.file}:#{panic.line} in #{panic.function}"
|
|
48
|
+
# puts "Message: #{panic.message}"
|
|
49
|
+
# puts "Time: #{panic.timestamp_secs}"
|
|
50
|
+
# end
|
|
51
|
+
def last_panic_context
|
|
52
|
+
json_str = Kreuzberg._last_panic_context_json_native
|
|
53
|
+
return nil unless json_str
|
|
54
|
+
|
|
55
|
+
Errors::PanicContext.from_json(json_str)
|
|
56
|
+
rescue StandardError
|
|
57
|
+
nil
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
# Get panic context as raw JSON string.
|
|
61
|
+
#
|
|
62
|
+
# Returns the panic context information as a JSON string for raw access or
|
|
63
|
+
# custom parsing. Returns nil if no panic has occurred.
|
|
64
|
+
#
|
|
65
|
+
# @return [String, nil] JSON-serialized panic context, or nil if no panic
|
|
66
|
+
#
|
|
67
|
+
# @example Get raw JSON panic context
|
|
68
|
+
# json = Kreuzberg::ErrorContext.last_panic_context_json
|
|
69
|
+
# if json
|
|
70
|
+
# panic_data = JSON.parse(json)
|
|
71
|
+
# puts panic_data
|
|
72
|
+
# end
|
|
73
|
+
def last_panic_context_json
|
|
74
|
+
Kreuzberg._last_panic_context_json_native
|
|
75
|
+
rescue StandardError
|
|
76
|
+
nil
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
data/lib/kreuzberg/errors.rb
CHANGED
|
@@ -1,118 +1,118 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'json'
|
|
4
|
-
|
|
5
|
-
module Kreuzberg
|
|
6
|
-
# Error code constants matching kreuzberg-ffi error codes
|
|
7
|
-
ERROR_CODE_SUCCESS = 0
|
|
8
|
-
ERROR_CODE_GENERIC = 1
|
|
9
|
-
ERROR_CODE_PANIC = 2
|
|
10
|
-
ERROR_CODE_INVALID_ARGUMENT = 3
|
|
11
|
-
ERROR_CODE_IO = 4
|
|
12
|
-
ERROR_CODE_PARSING = 5
|
|
13
|
-
ERROR_CODE_OCR = 6
|
|
14
|
-
ERROR_CODE_MISSING_DEPENDENCY = 7
|
|
15
|
-
|
|
16
|
-
module Errors
|
|
17
|
-
# Panic context information from FFI error introspection
|
|
18
|
-
class PanicContext
|
|
19
|
-
attr_reader :file, :line, :function, :message, :timestamp_secs
|
|
20
|
-
|
|
21
|
-
def initialize(file:, line:, function:, message:, timestamp_secs:)
|
|
22
|
-
@file = file
|
|
23
|
-
@line = line
|
|
24
|
-
@function = function
|
|
25
|
-
@message = message
|
|
26
|
-
@timestamp_secs = timestamp_secs
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
def to_s
|
|
30
|
-
"#{file}:#{line}:#{function}: #{message}"
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
def to_h
|
|
34
|
-
{
|
|
35
|
-
file:,
|
|
36
|
-
line:,
|
|
37
|
-
function:,
|
|
38
|
-
message:,
|
|
39
|
-
timestamp_secs:
|
|
40
|
-
}
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
def self.from_json(json_string)
|
|
44
|
-
return nil if json_string.nil? || json_string.empty?
|
|
45
|
-
|
|
46
|
-
data = JSON.parse(json_string, symbolize_names: true)
|
|
47
|
-
sliced = data.slice(:file, :line, :function, :message, :timestamp_secs)
|
|
48
|
-
new(**with_defaults(sliced))
|
|
49
|
-
rescue JSON::ParserError
|
|
50
|
-
nil
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
def self.with_defaults(sliced)
|
|
54
|
-
{
|
|
55
|
-
file: sliced[:file] || '',
|
|
56
|
-
line: sliced[:line] || 0,
|
|
57
|
-
function: sliced[:function] || '',
|
|
58
|
-
message: sliced[:message] || '',
|
|
59
|
-
timestamp_secs: sliced[:timestamp_secs] || 0
|
|
60
|
-
}
|
|
61
|
-
end
|
|
62
|
-
private_class_method :with_defaults
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
# Base error class for all Kreuzberg errors
|
|
66
|
-
class Error < StandardError
|
|
67
|
-
attr_reader :panic_context, :error_code
|
|
68
|
-
|
|
69
|
-
def initialize(message, panic_context: nil, error_code: nil)
|
|
70
|
-
super(message)
|
|
71
|
-
@panic_context = panic_context
|
|
72
|
-
@error_code = error_code
|
|
73
|
-
end
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
# Raised when validation fails
|
|
77
|
-
class ValidationError < Error; end
|
|
78
|
-
|
|
79
|
-
# Raised when document parsing fails
|
|
80
|
-
class ParsingError < Error
|
|
81
|
-
attr_reader :context
|
|
82
|
-
|
|
83
|
-
def initialize(message, context: nil, panic_context: nil, error_code: nil)
|
|
84
|
-
super(message, panic_context:, error_code:)
|
|
85
|
-
@context = context
|
|
86
|
-
end
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
# Raised when OCR processing fails
|
|
90
|
-
class OCRError < Error
|
|
91
|
-
attr_reader :context
|
|
92
|
-
|
|
93
|
-
def initialize(message, context: nil, panic_context: nil, error_code: nil)
|
|
94
|
-
super(message, panic_context:, error_code:)
|
|
95
|
-
@context = context
|
|
96
|
-
end
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
# Raised when a required dependency is missing
|
|
100
|
-
class MissingDependencyError < Error
|
|
101
|
-
attr_reader :dependency
|
|
102
|
-
|
|
103
|
-
def initialize(message, dependency: nil, panic_context: nil, error_code: nil)
|
|
104
|
-
super(message, panic_context:, error_code:)
|
|
105
|
-
@dependency = dependency
|
|
106
|
-
end
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
# Raised when an I/O operation fails
|
|
110
|
-
class IOError < Error; end
|
|
111
|
-
|
|
112
|
-
# Raised when plugin operations fail
|
|
113
|
-
class PluginError < Error; end
|
|
114
|
-
|
|
115
|
-
# Raised when an unsupported file format or MIME type is encountered
|
|
116
|
-
class UnsupportedFormatError < Error; end
|
|
117
|
-
end
|
|
118
|
-
end
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Kreuzberg
|
|
6
|
+
# Error code constants matching kreuzberg-ffi error codes
|
|
7
|
+
ERROR_CODE_SUCCESS = 0
|
|
8
|
+
ERROR_CODE_GENERIC = 1
|
|
9
|
+
ERROR_CODE_PANIC = 2
|
|
10
|
+
ERROR_CODE_INVALID_ARGUMENT = 3
|
|
11
|
+
ERROR_CODE_IO = 4
|
|
12
|
+
ERROR_CODE_PARSING = 5
|
|
13
|
+
ERROR_CODE_OCR = 6
|
|
14
|
+
ERROR_CODE_MISSING_DEPENDENCY = 7
|
|
15
|
+
|
|
16
|
+
module Errors
|
|
17
|
+
# Panic context information from FFI error introspection
|
|
18
|
+
class PanicContext
|
|
19
|
+
attr_reader :file, :line, :function, :message, :timestamp_secs
|
|
20
|
+
|
|
21
|
+
def initialize(file:, line:, function:, message:, timestamp_secs:)
|
|
22
|
+
@file = file
|
|
23
|
+
@line = line
|
|
24
|
+
@function = function
|
|
25
|
+
@message = message
|
|
26
|
+
@timestamp_secs = timestamp_secs
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
def to_s
|
|
30
|
+
"#{file}:#{line}:#{function}: #{message}"
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
def to_h
|
|
34
|
+
{
|
|
35
|
+
file:,
|
|
36
|
+
line:,
|
|
37
|
+
function:,
|
|
38
|
+
message:,
|
|
39
|
+
timestamp_secs:
|
|
40
|
+
}
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def self.from_json(json_string)
|
|
44
|
+
return nil if json_string.nil? || json_string.empty?
|
|
45
|
+
|
|
46
|
+
data = JSON.parse(json_string, symbolize_names: true)
|
|
47
|
+
sliced = data.slice(:file, :line, :function, :message, :timestamp_secs)
|
|
48
|
+
new(**with_defaults(sliced))
|
|
49
|
+
rescue JSON::ParserError
|
|
50
|
+
nil
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def self.with_defaults(sliced)
|
|
54
|
+
{
|
|
55
|
+
file: sliced[:file] || '',
|
|
56
|
+
line: sliced[:line] || 0,
|
|
57
|
+
function: sliced[:function] || '',
|
|
58
|
+
message: sliced[:message] || '',
|
|
59
|
+
timestamp_secs: sliced[:timestamp_secs] || 0
|
|
60
|
+
}
|
|
61
|
+
end
|
|
62
|
+
private_class_method :with_defaults
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
# Base error class for all Kreuzberg errors
|
|
66
|
+
class Error < StandardError
|
|
67
|
+
attr_reader :panic_context, :error_code
|
|
68
|
+
|
|
69
|
+
def initialize(message, panic_context: nil, error_code: nil)
|
|
70
|
+
super(message)
|
|
71
|
+
@panic_context = panic_context
|
|
72
|
+
@error_code = error_code
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
# Raised when validation fails
|
|
77
|
+
class ValidationError < Error; end
|
|
78
|
+
|
|
79
|
+
# Raised when document parsing fails
|
|
80
|
+
class ParsingError < Error
|
|
81
|
+
attr_reader :context
|
|
82
|
+
|
|
83
|
+
def initialize(message, context: nil, panic_context: nil, error_code: nil)
|
|
84
|
+
super(message, panic_context:, error_code:)
|
|
85
|
+
@context = context
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
# Raised when OCR processing fails
|
|
90
|
+
class OCRError < Error
|
|
91
|
+
attr_reader :context
|
|
92
|
+
|
|
93
|
+
def initialize(message, context: nil, panic_context: nil, error_code: nil)
|
|
94
|
+
super(message, panic_context:, error_code:)
|
|
95
|
+
@context = context
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
# Raised when a required dependency is missing
|
|
100
|
+
class MissingDependencyError < Error
|
|
101
|
+
attr_reader :dependency
|
|
102
|
+
|
|
103
|
+
def initialize(message, dependency: nil, panic_context: nil, error_code: nil)
|
|
104
|
+
super(message, panic_context:, error_code:)
|
|
105
|
+
@dependency = dependency
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
# Raised when an I/O operation fails
|
|
110
|
+
class IOError < Error; end
|
|
111
|
+
|
|
112
|
+
# Raised when plugin operations fail
|
|
113
|
+
class PluginError < Error; end
|
|
114
|
+
|
|
115
|
+
# Raised when an unsupported file format or MIME type is encountered
|
|
116
|
+
class UnsupportedFormatError < Error; end
|
|
117
|
+
end
|
|
118
|
+
end
|