kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,37 +1,37 @@
|
|
|
1
|
-
use std::fmt;
|
|
2
|
-
|
|
3
|
-
/// OCR-specific errors (pure Rust, no PyO3)
|
|
4
|
-
#[derive(Debug, Clone)]
|
|
5
|
-
pub enum OcrError {
|
|
6
|
-
TesseractInitializationFailed(String),
|
|
7
|
-
UnsupportedVersion(String),
|
|
8
|
-
InvalidConfiguration(String),
|
|
9
|
-
InvalidLanguageCode(String),
|
|
10
|
-
ImageProcessingFailed(String),
|
|
11
|
-
ProcessingFailed(String),
|
|
12
|
-
CacheError(String),
|
|
13
|
-
IOError(String),
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
impl fmt::Display for OcrError {
|
|
17
|
-
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
18
|
-
match self {
|
|
19
|
-
Self::TesseractInitializationFailed(msg) => {
|
|
20
|
-
write!(f, "Tesseract initialization failed: {}", msg)
|
|
21
|
-
}
|
|
22
|
-
Self::UnsupportedVersion(msg) => {
|
|
23
|
-
write!(f, "Unsupported Tesseract version: {}", msg)
|
|
24
|
-
}
|
|
25
|
-
Self::InvalidConfiguration(msg) => write!(f, "Invalid configuration: {}", msg),
|
|
26
|
-
Self::InvalidLanguageCode(msg) => write!(f, "Invalid language code: {}", msg),
|
|
27
|
-
Self::ImageProcessingFailed(msg) => write!(f, "Image processing failed: {}", msg),
|
|
28
|
-
Self::ProcessingFailed(msg) => write!(f, "OCR processing failed: {}", msg),
|
|
29
|
-
Self::CacheError(msg) => write!(f, "Cache error: {}", msg),
|
|
30
|
-
Self::IOError(msg) => write!(f, "I/O error: {}", msg),
|
|
31
|
-
}
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
impl std::error::Error for OcrError {}
|
|
36
|
-
|
|
37
|
-
// NOTE: No From<std::io::Error> impl - IO errors must bubble up unchanged per error handling policy
|
|
1
|
+
use std::fmt;
|
|
2
|
+
|
|
3
|
+
/// OCR-specific errors (pure Rust, no PyO3)
|
|
4
|
+
#[derive(Debug, Clone)]
|
|
5
|
+
pub enum OcrError {
|
|
6
|
+
TesseractInitializationFailed(String),
|
|
7
|
+
UnsupportedVersion(String),
|
|
8
|
+
InvalidConfiguration(String),
|
|
9
|
+
InvalidLanguageCode(String),
|
|
10
|
+
ImageProcessingFailed(String),
|
|
11
|
+
ProcessingFailed(String),
|
|
12
|
+
CacheError(String),
|
|
13
|
+
IOError(String),
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
impl fmt::Display for OcrError {
|
|
17
|
+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
18
|
+
match self {
|
|
19
|
+
Self::TesseractInitializationFailed(msg) => {
|
|
20
|
+
write!(f, "Tesseract initialization failed: {}", msg)
|
|
21
|
+
}
|
|
22
|
+
Self::UnsupportedVersion(msg) => {
|
|
23
|
+
write!(f, "Unsupported Tesseract version: {}", msg)
|
|
24
|
+
}
|
|
25
|
+
Self::InvalidConfiguration(msg) => write!(f, "Invalid configuration: {}", msg),
|
|
26
|
+
Self::InvalidLanguageCode(msg) => write!(f, "Invalid language code: {}", msg),
|
|
27
|
+
Self::ImageProcessingFailed(msg) => write!(f, "Image processing failed: {}", msg),
|
|
28
|
+
Self::ProcessingFailed(msg) => write!(f, "OCR processing failed: {}", msg),
|
|
29
|
+
Self::CacheError(msg) => write!(f, "Cache error: {}", msg),
|
|
30
|
+
Self::IOError(msg) => write!(f, "I/O error: {}", msg),
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
impl std::error::Error for OcrError {}
|
|
36
|
+
|
|
37
|
+
// NOTE: No From<std::io::Error> impl - IO errors must bubble up unchanged per error handling policy
|
|
@@ -1,216 +1,216 @@
|
|
|
1
|
-
use super::error::OcrError;
|
|
2
|
-
use html_to_markdown_rs::{ConversionOptions, convert};
|
|
3
|
-
|
|
4
|
-
pub fn convert_hocr_to_markdown(hocr_html: &str, options: Option<ConversionOptions>) -> Result<String, OcrError> {
|
|
5
|
-
let use_default = options.is_none();
|
|
6
|
-
let mut opts = options.unwrap_or_default();
|
|
7
|
-
|
|
8
|
-
if use_default {
|
|
9
|
-
opts.hocr_spatial_tables = false;
|
|
10
|
-
opts.extract_metadata = false;
|
|
11
|
-
}
|
|
12
|
-
|
|
13
|
-
convert(hocr_html, Some(opts)).map_err(|e| OcrError::ProcessingFailed(format!("hOCR conversion failed: {}", e)))
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
#[cfg(test)]
|
|
17
|
-
mod tests {
|
|
18
|
-
use super::*;
|
|
19
|
-
|
|
20
|
-
#[test]
|
|
21
|
-
fn test_simple_hocr_conversion() {
|
|
22
|
-
let hocr = r#"<div class="ocr_page">
|
|
23
|
-
<p class="ocr_par">
|
|
24
|
-
<span class="ocrx_word">Hello</span>
|
|
25
|
-
<span class="ocrx_word">World</span>
|
|
26
|
-
</p>
|
|
27
|
-
</div>"#;
|
|
28
|
-
|
|
29
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
30
|
-
assert!(markdown.contains("Hello"));
|
|
31
|
-
assert!(markdown.contains("World"));
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
#[test]
|
|
35
|
-
fn test_hocr_with_formatting() {
|
|
36
|
-
let hocr = r#"<div class="ocr_page">
|
|
37
|
-
<p class="ocr_par">
|
|
38
|
-
<strong class="ocrx_word">Bold</strong>
|
|
39
|
-
<em class="ocrx_word">Italic</em>
|
|
40
|
-
</p>
|
|
41
|
-
</div>"#;
|
|
42
|
-
|
|
43
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
44
|
-
assert!(!markdown.is_empty());
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
#[test]
|
|
48
|
-
fn test_empty_hocr() {
|
|
49
|
-
let hocr = "";
|
|
50
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
51
|
-
assert!(markdown.is_empty() || markdown.trim().is_empty());
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
#[test]
|
|
55
|
-
fn test_hocr_with_headings() {
|
|
56
|
-
let hocr = r#"<div class="ocr_page">
|
|
57
|
-
<h1>Title</h1>
|
|
58
|
-
<p class="ocr_par">
|
|
59
|
-
<span class="ocrx_word">Content</span>
|
|
60
|
-
</p>
|
|
61
|
-
</div>"#;
|
|
62
|
-
|
|
63
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
64
|
-
assert!(!markdown.is_empty());
|
|
65
|
-
assert!(markdown.contains("Content"));
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
#[test]
|
|
69
|
-
fn test_hocr_with_multiple_paragraphs() {
|
|
70
|
-
let hocr = r#"<div class="ocr_page">
|
|
71
|
-
<p class="ocr_par">
|
|
72
|
-
<span class="ocrx_word">First</span>
|
|
73
|
-
<span class="ocrx_word">paragraph</span>
|
|
74
|
-
</p>
|
|
75
|
-
<p class="ocr_par">
|
|
76
|
-
<span class="ocrx_word">Second</span>
|
|
77
|
-
<span class="ocrx_word">paragraph</span>
|
|
78
|
-
</p>
|
|
79
|
-
</div>"#;
|
|
80
|
-
|
|
81
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
82
|
-
assert!(markdown.contains("First"));
|
|
83
|
-
assert!(markdown.contains("Second"));
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
#[test]
|
|
87
|
-
fn test_hocr_with_line_breaks() {
|
|
88
|
-
let hocr = r#"<div class="ocr_page">
|
|
89
|
-
<p class="ocr_par">
|
|
90
|
-
<span class="ocrx_line">
|
|
91
|
-
<span class="ocrx_word">Line</span>
|
|
92
|
-
<span class="ocrx_word">one</span>
|
|
93
|
-
</span>
|
|
94
|
-
<span class="ocrx_line">
|
|
95
|
-
<span class="ocrx_word">Line</span>
|
|
96
|
-
<span class="ocrx_word">two</span>
|
|
97
|
-
</span>
|
|
98
|
-
</p>
|
|
99
|
-
</div>"#;
|
|
100
|
-
|
|
101
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
102
|
-
assert!(!markdown.is_empty());
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
#[test]
|
|
106
|
-
fn test_hocr_whitespace_handling() {
|
|
107
|
-
let hocr = r#"<div class="ocr_page">
|
|
108
|
-
<p class="ocr_par">
|
|
109
|
-
<span class="ocrx_word"> Padded </span>
|
|
110
|
-
<span class="ocrx_word"> Text </span>
|
|
111
|
-
</p>
|
|
112
|
-
</div>"#;
|
|
113
|
-
|
|
114
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
115
|
-
assert!(!markdown.is_empty());
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
#[test]
|
|
119
|
-
fn test_hocr_special_characters() {
|
|
120
|
-
let hocr = r#"<div class="ocr_page">
|
|
121
|
-
<p class="ocr_par">
|
|
122
|
-
<span class="ocrx_word"><special></span>
|
|
123
|
-
<span class="ocrx_word">&chars&</span>
|
|
124
|
-
</p>
|
|
125
|
-
</div>"#;
|
|
126
|
-
|
|
127
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
128
|
-
assert!(!markdown.is_empty());
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
#[test]
|
|
132
|
-
fn test_hocr_nested_structure() {
|
|
133
|
-
let hocr = r#"<div class="ocr_page">
|
|
134
|
-
<div class="ocr_carea">
|
|
135
|
-
<p class="ocr_par">
|
|
136
|
-
<span class="ocr_line">
|
|
137
|
-
<span class="ocrx_word">Nested</span>
|
|
138
|
-
</span>
|
|
139
|
-
</p>
|
|
140
|
-
</div>
|
|
141
|
-
</div>"#;
|
|
142
|
-
|
|
143
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
144
|
-
assert!(markdown.contains("Nested"));
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
#[test]
|
|
148
|
-
fn test_hocr_malformed_html() {
|
|
149
|
-
let hocr = r#"<div class="ocr_page">
|
|
150
|
-
<p class="ocr_par">
|
|
151
|
-
<span class="ocrx_word">Unclosed
|
|
152
|
-
</div>"#;
|
|
153
|
-
|
|
154
|
-
let result = convert_hocr_to_markdown(hocr, None);
|
|
155
|
-
assert!(result.is_ok());
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
#[test]
|
|
159
|
-
fn test_hocr_no_ocr_classes() {
|
|
160
|
-
let hocr = r#"<div>
|
|
161
|
-
<p>
|
|
162
|
-
<span>Regular HTML</span>
|
|
163
|
-
</p>
|
|
164
|
-
</div>"#;
|
|
165
|
-
|
|
166
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
167
|
-
assert!(!markdown.is_empty());
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
#[test]
|
|
171
|
-
fn test_hocr_mixed_content() {
|
|
172
|
-
let hocr = r#"<div class="ocr_page">
|
|
173
|
-
<h1>Heading</h1>
|
|
174
|
-
<p class="ocr_par">
|
|
175
|
-
<span class="ocrx_word">Paragraph</span>
|
|
176
|
-
</p>
|
|
177
|
-
<ul>
|
|
178
|
-
<li>List item</li>
|
|
179
|
-
</ul>
|
|
180
|
-
</div>"#;
|
|
181
|
-
|
|
182
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
183
|
-
assert!(markdown.contains("Heading") || markdown.contains("heading") || !markdown.is_empty());
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
#[test]
|
|
187
|
-
fn test_hocr_unicode_content() {
|
|
188
|
-
let hocr = r#"<div class="ocr_page">
|
|
189
|
-
<p class="ocr_par">
|
|
190
|
-
<span class="ocrx_word">Ñoño</span>
|
|
191
|
-
<span class="ocrx_word">日本語</span>
|
|
192
|
-
<span class="ocrx_word">العربية</span>
|
|
193
|
-
</p>
|
|
194
|
-
</div>"#;
|
|
195
|
-
|
|
196
|
-
let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
|
|
197
|
-
assert!(markdown.contains("Ñoño") || !markdown.is_empty());
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
#[test]
|
|
201
|
-
fn test_hocr_large_document() {
|
|
202
|
-
let mut hocr = String::from(r#"<div class="ocr_page">"#);
|
|
203
|
-
for i in 0..100 {
|
|
204
|
-
hocr.push_str(&format!(
|
|
205
|
-
r#"<p class="ocr_par"><span class="ocrx_word">Word{}</span></p>"#,
|
|
206
|
-
i
|
|
207
|
-
));
|
|
208
|
-
}
|
|
209
|
-
hocr.push_str("</div>");
|
|
210
|
-
|
|
211
|
-
let result = convert_hocr_to_markdown(&hocr, None);
|
|
212
|
-
assert!(result.is_ok());
|
|
213
|
-
let markdown = result.unwrap();
|
|
214
|
-
assert!(!markdown.is_empty());
|
|
215
|
-
}
|
|
216
|
-
}
|
|
1
|
+
use super::error::OcrError;
|
|
2
|
+
use html_to_markdown_rs::{ConversionOptions, convert};
|
|
3
|
+
|
|
4
|
+
pub fn convert_hocr_to_markdown(hocr_html: &str, options: Option<ConversionOptions>) -> Result<String, OcrError> {
|
|
5
|
+
let use_default = options.is_none();
|
|
6
|
+
let mut opts = options.unwrap_or_default();
|
|
7
|
+
|
|
8
|
+
if use_default {
|
|
9
|
+
opts.hocr_spatial_tables = false;
|
|
10
|
+
opts.extract_metadata = false;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
convert(hocr_html, Some(opts)).map_err(|e| OcrError::ProcessingFailed(format!("hOCR conversion failed: {}", e)))
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Converts `hocr` with the default options, panicking with the underlying
    /// error if the conversion fails.
    fn to_markdown(hocr: &str) -> String {
        convert_hocr_to_markdown(hocr, None).expect("hOCR conversion with default options should succeed")
    }

    /// Words inside a single OCR paragraph appear in the output.
    #[test]
    fn test_simple_hocr_conversion() {
        let hocr = r#"<div class="ocr_page">
            <p class="ocr_par">
                <span class="ocrx_word">Hello</span>
                <span class="ocrx_word">World</span>
            </p>
        </div>"#;

        let markdown = to_markdown(hocr);
        assert!(markdown.contains("Hello"));
        assert!(markdown.contains("World"));
    }

    /// Inline formatting elements carrying OCR classes produce some output.
    #[test]
    fn test_hocr_with_formatting() {
        let hocr = r#"<div class="ocr_page">
            <p class="ocr_par">
                <strong class="ocrx_word">Bold</strong>
                <em class="ocrx_word">Italic</em>
            </p>
        </div>"#;

        let markdown = to_markdown(hocr);
        assert!(!markdown.is_empty());
    }

    /// Empty input converts to output with no visible content.
    #[test]
    fn test_empty_hocr() {
        let markdown = to_markdown("");
        // `trim().is_empty()` already covers the fully empty string.
        assert!(markdown.trim().is_empty());
    }

    /// A plain heading next to an OCR paragraph does not hide the paragraph.
    #[test]
    fn test_hocr_with_headings() {
        let hocr = r#"<div class="ocr_page">
            <h1>Title</h1>
            <p class="ocr_par">
                <span class="ocrx_word">Content</span>
            </p>
        </div>"#;

        let markdown = to_markdown(hocr);
        assert!(!markdown.is_empty());
        assert!(markdown.contains("Content"));
    }

    /// Text from every OCR paragraph is present in the output.
    #[test]
    fn test_hocr_with_multiple_paragraphs() {
        let hocr = r#"<div class="ocr_page">
            <p class="ocr_par">
                <span class="ocrx_word">First</span>
                <span class="ocrx_word">paragraph</span>
            </p>
            <p class="ocr_par">
                <span class="ocrx_word">Second</span>
                <span class="ocrx_word">paragraph</span>
            </p>
        </div>"#;

        let markdown = to_markdown(hocr);
        assert!(markdown.contains("First"));
        assert!(markdown.contains("Second"));
    }

    /// Multiple `ocrx_line` spans inside one paragraph produce some output.
    #[test]
    fn test_hocr_with_line_breaks() {
        let hocr = r#"<div class="ocr_page">
            <p class="ocr_par">
                <span class="ocrx_line">
                    <span class="ocrx_word">Line</span>
                    <span class="ocrx_word">one</span>
                </span>
                <span class="ocrx_line">
                    <span class="ocrx_word">Line</span>
                    <span class="ocrx_word">two</span>
                </span>
            </p>
        </div>"#;

        let markdown = to_markdown(hocr);
        assert!(!markdown.is_empty());
    }

    /// Words padded with surrounding whitespace still produce output.
    #[test]
    fn test_hocr_whitespace_handling() {
        let hocr = r#"<div class="ocr_page">
            <p class="ocr_par">
                <span class="ocrx_word">  Padded  </span>
                <span class="ocrx_word">  Text  </span>
            </p>
        </div>"#;

        let markdown = to_markdown(hocr);
        assert!(!markdown.is_empty());
    }

    /// HTML-significant characters inside words produce some output.
    #[test]
    fn test_hocr_special_characters() {
        // Entities are used so that `<`, `>` and `&` reach the converter as
        // text; written raw, `<special>` would be parsed as an element.
        let hocr = r#"<div class="ocr_page">
            <p class="ocr_par">
                <span class="ocrx_word">&lt;special&gt;</span>
                <span class="ocrx_word">&amp;chars&amp;</span>
            </p>
        </div>"#;

        let markdown = to_markdown(hocr);
        assert!(!markdown.is_empty());
    }

    /// A word nested under page, area, paragraph and line elements is found.
    #[test]
    fn test_hocr_nested_structure() {
        let hocr = r#"<div class="ocr_page">
            <div class="ocr_carea">
                <p class="ocr_par">
                    <span class="ocr_line">
                        <span class="ocrx_word">Nested</span>
                    </span>
                </p>
            </div>
        </div>"#;

        let markdown = to_markdown(hocr);
        assert!(markdown.contains("Nested"));
    }

    /// Unclosed tags are tolerated: the conversion returns `Ok`.
    #[test]
    fn test_hocr_malformed_html() {
        let hocr = r#"<div class="ocr_page">
            <p class="ocr_par">
                <span class="ocrx_word">Unclosed
        </div>"#;

        let result = convert_hocr_to_markdown(hocr, None);
        assert!(result.is_ok());
    }

    /// HTML without any hOCR classes still produces output.
    #[test]
    fn test_hocr_no_ocr_classes() {
        let hocr = r#"<div>
            <p>
                <span>Regular HTML</span>
            </p>
        </div>"#;

        let markdown = to_markdown(hocr);
        assert!(!markdown.is_empty());
    }

    /// hOCR markup mixed with a plain heading and list keeps the OCR text.
    #[test]
    fn test_hocr_mixed_content() {
        let hocr = r#"<div class="ocr_page">
            <h1>Heading</h1>
            <p class="ocr_par">
                <span class="ocrx_word">Paragraph</span>
            </p>
            <ul>
                <li>List item</li>
            </ul>
        </div>"#;

        let markdown = to_markdown(hocr);
        // Only the OCR word is pinned. Whether the plain `<h1>` / `<ul>` text
        // is carried over is left unasserted, so the check cannot be met by
        // any arbitrary non-empty output.
        assert!(!markdown.is_empty());
        assert!(markdown.contains("Paragraph"));
    }

    /// Non-ASCII words (Latin with diacritics, CJK, Arabic) pass through.
    #[test]
    fn test_hocr_unicode_content() {
        let hocr = r#"<div class="ocr_page">
            <p class="ocr_par">
                <span class="ocrx_word">Ñoño</span>
                <span class="ocrx_word">日本語</span>
                <span class="ocrx_word">العربية</span>
            </p>
        </div>"#;

        let markdown = to_markdown(hocr);
        // Each word is asserted on its own; an `|| !is_empty()` fallback
        // would make these content checks unable to fail.
        for word in ["Ñoño", "日本語", "العربية"] {
            assert!(markdown.contains(word), "missing {word:?} in {markdown:?}");
        }
    }

    /// A page of 100 single-word paragraphs converts and keeps both ends.
    #[test]
    fn test_hocr_large_document() {
        let paragraphs: String = (0..100)
            .map(|i| format!(r#"<p class="ocr_par"><span class="ocrx_word">Word{i}</span></p>"#))
            .collect();
        let hocr = format!(r#"<div class="ocr_page">{paragraphs}</div>"#);

        let markdown = to_markdown(&hocr);
        assert!(!markdown.is_empty());
        // Checking the first and last word guards against truncated output.
        assert!(markdown.contains("Word0"));
        assert!(markdown.contains("Word99"));
    }
}
|
|
@@ -1,58 +1,58 @@
|
|
|
1
|
-
//! OCR (Optical Character Recognition) subsystem.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides OCR functionality using Tesseract as the backend.
|
|
4
|
-
//! It includes caching, table reconstruction, hOCR parsing, and batch processing.
|
|
5
|
-
//!
|
|
6
|
-
//! # Features
|
|
7
|
-
//!
|
|
8
|
-
//! - **Tesseract integration**: Native Tesseract backend via `kreuzberg-tesseract`
|
|
9
|
-
//! - **Result caching**: Persistent cache for OCR results using file hashing
|
|
10
|
-
//! - **Table reconstruction**: Extract and reconstruct tables from hOCR/TSV output
|
|
11
|
-
//! - **hOCR to Markdown**: Convert hOCR format to clean Markdown
|
|
12
|
-
//! - **Batch processing**: Process multiple images efficiently
|
|
13
|
-
//! - **Language support**: Validate and configure Tesseract languages
|
|
14
|
-
//! - **PSM modes**: Support for all Tesseract Page Segmentation Modes
|
|
15
|
-
//!
|
|
16
|
-
//! # Example
|
|
17
|
-
//!
|
|
18
|
-
//! ```rust,no_run
|
|
19
|
-
//! use kreuzberg::ocr::{OcrProcessor, TesseractConfig};
|
|
20
|
-
//!
|
|
21
|
-
//! # fn example() -> Result<(), kreuzberg::ocr::error::OcrError> {
|
|
22
|
-
//! let processor = OcrProcessor::new(None)?;
|
|
23
|
-
//! let config = TesseractConfig::default();
|
|
24
|
-
//!
|
|
25
|
-
//! let image_bytes = std::fs::read("scanned.png").expect("failed to read image");
|
|
26
|
-
//! let result = processor.process_image(&image_bytes, &config)?;
|
|
27
|
-
//!
|
|
28
|
-
//! println!("Extracted text: {}", result.content);
|
|
29
|
-
//! # Ok(())
|
|
30
|
-
//! # }
|
|
31
|
-
//! ```
|
|
32
|
-
//!
|
|
33
|
-
//! # Optional Feature
|
|
34
|
-
//!
|
|
35
|
-
//! This module requires the `ocr` feature to be enabled:
|
|
36
|
-
//! ```toml
|
|
37
|
-
//! [dependencies]
|
|
38
|
-
//! kreuzberg = { version = "4.0", features = ["ocr"] }
|
|
39
|
-
//! ```
|
|
40
|
-
pub mod cache;
|
|
41
|
-
pub mod error;
|
|
42
|
-
pub mod hocr;
|
|
43
|
-
pub mod processor;
|
|
44
|
-
pub mod table;
|
|
45
|
-
pub mod tesseract_backend;
|
|
46
|
-
pub mod types;
|
|
47
|
-
pub mod utils;
|
|
48
|
-
pub mod validation;
|
|
49
|
-
|
|
50
|
-
pub use cache::{OcrCache, OcrCacheStats};
|
|
51
|
-
pub use error::OcrError;
|
|
52
|
-
pub use hocr::convert_hocr_to_markdown;
|
|
53
|
-
pub use processor::OcrProcessor;
|
|
54
|
-
pub use table::{HocrWord, extract_words_from_tsv, reconstruct_table, table_to_markdown};
|
|
55
|
-
pub use tesseract_backend::TesseractBackend;
|
|
56
|
-
pub use types::{BatchItemResult, ExtractionResult, PSMMode, Table, TesseractConfig};
|
|
57
|
-
pub use utils::compute_hash;
|
|
58
|
-
pub use validation::{validate_language_code, validate_tesseract_version};
|
|
1
|
+
//! OCR (Optical Character Recognition) subsystem.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides OCR functionality using Tesseract as the backend.
|
|
4
|
+
//! It includes caching, table reconstruction, hOCR parsing, and batch processing.
|
|
5
|
+
//!
|
|
6
|
+
//! # Features
|
|
7
|
+
//!
|
|
8
|
+
//! - **Tesseract integration**: Native Tesseract backend via `kreuzberg-tesseract`
|
|
9
|
+
//! - **Result caching**: Persistent cache for OCR results using file hashing
|
|
10
|
+
//! - **Table reconstruction**: Extract and reconstruct tables from hOCR/TSV output
|
|
11
|
+
//! - **hOCR to Markdown**: Convert hOCR format to clean Markdown
|
|
12
|
+
//! - **Batch processing**: Process multiple images efficiently
|
|
13
|
+
//! - **Language support**: Validate and configure Tesseract languages
|
|
14
|
+
//! - **PSM modes**: Support for all Tesseract Page Segmentation Modes
|
|
15
|
+
//!
|
|
16
|
+
//! # Example
|
|
17
|
+
//!
|
|
18
|
+
//! ```rust,no_run
|
|
19
|
+
//! use kreuzberg::ocr::{OcrProcessor, TesseractConfig};
|
|
20
|
+
//!
|
|
21
|
+
//! # fn example() -> Result<(), kreuzberg::ocr::error::OcrError> {
|
|
22
|
+
//! let processor = OcrProcessor::new(None)?;
|
|
23
|
+
//! let config = TesseractConfig::default();
|
|
24
|
+
//!
|
|
25
|
+
//! let image_bytes = std::fs::read("scanned.png").expect("failed to read image");
|
|
26
|
+
//! let result = processor.process_image(&image_bytes, &config)?;
|
|
27
|
+
//!
|
|
28
|
+
//! println!("Extracted text: {}", result.content);
|
|
29
|
+
//! # Ok(())
|
|
30
|
+
//! # }
|
|
31
|
+
//! ```
|
|
32
|
+
//!
|
|
33
|
+
//! # Optional Feature
|
|
34
|
+
//!
|
|
35
|
+
//! This module requires the `ocr` feature to be enabled:
|
|
36
|
+
//! ```toml
|
|
37
|
+
//! [dependencies]
|
|
38
|
+
//! kreuzberg = { version = "4.0", features = ["ocr"] }
|
|
39
|
+
//! ```
|
|
40
|
+
pub mod cache;
|
|
41
|
+
pub mod error;
|
|
42
|
+
pub mod hocr;
|
|
43
|
+
pub mod processor;
|
|
44
|
+
pub mod table;
|
|
45
|
+
pub mod tesseract_backend;
|
|
46
|
+
pub mod types;
|
|
47
|
+
pub mod utils;
|
|
48
|
+
pub mod validation;
|
|
49
|
+
|
|
50
|
+
pub use cache::{OcrCache, OcrCacheStats};
|
|
51
|
+
pub use error::OcrError;
|
|
52
|
+
pub use hocr::convert_hocr_to_markdown;
|
|
53
|
+
pub use processor::OcrProcessor;
|
|
54
|
+
pub use table::{HocrWord, extract_words_from_tsv, reconstruct_table, table_to_markdown};
|
|
55
|
+
pub use tesseract_backend::TesseractBackend;
|
|
56
|
+
pub use types::{BatchItemResult, ExtractionResult, PSMMode, Table, TesseractConfig};
|
|
57
|
+
pub use utils::compute_hash;
|
|
58
|
+
pub use validation::{validate_language_code, validate_tesseract_version};
|