kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +2 -105
- data/README.md +454 -454
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +7 -80
|
@@ -1,71 +1,71 @@
|
|
|
1
|
-
# Tesseract WASM Patches
|
|
2
|
-
|
|
3
|
-
This directory contains patches needed to compile Tesseract for WebAssembly (WASM) targets using Emscripten.
|
|
4
|
-
|
|
5
|
-
These patches are vendored from the [tesseract-wasm](https://github.com/naptha/tesseract.js) project and have been proven to work with WASM compilation.
|
|
6
|
-
|
|
7
|
-
## Patches
|
|
8
|
-
|
|
9
|
-
### tesseract.diff
|
|
10
|
-
|
|
11
|
-
A comprehensive patch that makes Tesseract compatible with WASM compilation. The patch includes the following changes:
|
|
12
|
-
|
|
13
|
-
#### 1. CMakeLists.txt Modifications
|
|
14
|
-
|
|
15
|
-
- **New CMake option**: `BUILD_TESSERACT_BINARY` (default: ON)
|
|
16
|
-
- Allows disabling the Tesseract CLI binary build, which is not needed for WASM
|
|
17
|
-
- Wraps all executable and installation targets for the tesseract binary
|
|
18
|
-
|
|
19
|
-
- **Disabled components for WASM**:
|
|
20
|
-
- Removes OpenCL support (`src/opencl/*.cpp`) - not applicable to WASM
|
|
21
|
-
- Removes viewer support (`src/viewer/*.cpp`) - UI components not needed for WASM
|
|
22
|
-
- Removes C API bindings (`src/api/capi.cpp`) - only hocrrenderer is kept
|
|
23
|
-
- Removes PDF and rendering support files:
|
|
24
|
-
- `src/api/renderer.cpp`
|
|
25
|
-
- `src/api/altorenderer.cpp`
|
|
26
|
-
- `src/api/lstmboxrenderer.cpp`
|
|
27
|
-
- `src/api/pdfrenderer.cpp`
|
|
28
|
-
- `src/api/wordstrboxrenderer.cpp`
|
|
29
|
-
|
|
30
|
-
#### 2. SIMD Detection Fixes (src/arch/simddetect.cpp)
|
|
31
|
-
|
|
32
|
-
- Guards CPUID detection with `#if !defined(__EMSCRIPTEN__)`
|
|
33
|
-
- Prevents attempts to use CPU feature detection that doesn't exist in Emscripten
|
|
34
|
-
- The HAS_CPUID macro is only defined for non-Emscripten builds
|
|
35
|
-
- This allows the code to gracefully handle WASM's SIMD limitations
|
|
36
|
-
|
|
37
|
-
#### 3. Pointer Type Fixes (src/ccmain/pageiterator.cpp, src/ccmain/pagesegmain.cpp, src/ccmain/tesseractclass.cpp)
|
|
38
|
-
|
|
39
|
-
**Changed from stack allocation to heap allocation** in `tesseractclass.h`:
|
|
40
|
-
- `pixa_debug_` changed from `DebugPixa` to `std::unique_ptr<DebugPixa>`
|
|
41
|
-
- This prevents large allocations on the stack, which is limited in WASM
|
|
42
|
-
|
|
43
|
-
**Updated all references** throughout the codebase:
|
|
44
|
-
- `.get()` calls added where raw pointers are needed
|
|
45
|
-
- Arrow operator `->` replaces dot operator `.` for member access
|
|
46
|
-
- Null checks added before dereferencing to prevent crashes
|
|
47
|
-
|
|
48
|
-
**Affected functions**:
|
|
49
|
-
- `PageIterator::Orientation()` - added null vector check
|
|
50
|
-
- `Tesseract::AutoPageSeg()` - updated pointer passing
|
|
51
|
-
- `Tesseract::SetupPageSegAndDetectOrientation()` - multiple pointer updates
|
|
52
|
-
- `Tesseract::Clear()` - added null check before WritePDF
|
|
53
|
-
- `Tesseract::PrepareForPageseg()` - updated Split() calls
|
|
54
|
-
- `Tesseract::PrepareForTessOCR()` - updated Split() calls
|
|
55
|
-
|
|
56
|
-
#### 4. Additional Fixes
|
|
57
|
-
|
|
58
|
-
- **Orientation detection**: Changed comparison from `> 0.0F` to `>= 0.0F` in `pageiterator.cpp` to handle null vectors gracefully when orientation info is not available
|
|
59
|
-
|
|
60
|
-
## How to Apply
|
|
61
|
-
|
|
62
|
-
These patches are applied during the WASM build process. They modify the Tesseract source code to:
|
|
63
|
-
|
|
64
|
-
1. Disable WASM-incompatible features (OpenCL, viewers, renderers)
|
|
65
|
-
2. Prevent CPUID detection in Emscripten environment
|
|
66
|
-
3. Use heap allocation instead of stack allocation for large objects
|
|
67
|
-
4. Handle missing pointer initialization gracefully
|
|
68
|
-
|
|
69
|
-
## Source
|
|
70
|
-
|
|
71
|
-
These patches are based on the proven WASM compilation approach used by the tesseract.js project, which successfully compiles Tesseract to WebAssembly and deploys it in production environments.
|
|
1
|
+
# Tesseract WASM Patches
|
|
2
|
+
|
|
3
|
+
This directory contains patches needed to compile Tesseract for WebAssembly (WASM) targets using Emscripten.
|
|
4
|
+
|
|
5
|
+
These patches are vendored from the [tesseract-wasm](https://github.com/naptha/tesseract.js) project and have been proven to work with WASM compilation.
|
|
6
|
+
|
|
7
|
+
## Patches
|
|
8
|
+
|
|
9
|
+
### tesseract.diff
|
|
10
|
+
|
|
11
|
+
A comprehensive patch that makes Tesseract compatible with WASM compilation. The patch includes the following changes:
|
|
12
|
+
|
|
13
|
+
#### 1. CMakeLists.txt Modifications
|
|
14
|
+
|
|
15
|
+
- **New CMake option**: `BUILD_TESSERACT_BINARY` (default: ON)
|
|
16
|
+
- Allows disabling the Tesseract CLI binary build, which is not needed for WASM
|
|
17
|
+
- Wraps all executable and installation targets for the tesseract binary
|
|
18
|
+
|
|
19
|
+
- **Disabled components for WASM**:
|
|
20
|
+
- Removes OpenCL support (`src/opencl/*.cpp`) - not applicable to WASM
|
|
21
|
+
- Removes viewer support (`src/viewer/*.cpp`) - UI components not needed for WASM
|
|
22
|
+
- Removes C API bindings (`src/api/capi.cpp`) - only hocrrenderer is kept
|
|
23
|
+
- Removes PDF and rendering support files:
|
|
24
|
+
- `src/api/renderer.cpp`
|
|
25
|
+
- `src/api/altorenderer.cpp`
|
|
26
|
+
- `src/api/lstmboxrenderer.cpp`
|
|
27
|
+
- `src/api/pdfrenderer.cpp`
|
|
28
|
+
- `src/api/wordstrboxrenderer.cpp`
|
|
29
|
+
|
|
30
|
+
#### 2. SIMD Detection Fixes (src/arch/simddetect.cpp)
|
|
31
|
+
|
|
32
|
+
- Guards CPUID detection with `#if !defined(__EMSCRIPTEN__)`
|
|
33
|
+
- Prevents attempts to use CPU feature detection that doesn't exist in Emscripten
|
|
34
|
+
- The HAS_CPUID macro is only defined for non-Emscripten builds
|
|
35
|
+
- This allows the code to gracefully handle WASM's SIMD limitations
|
|
36
|
+
|
|
37
|
+
#### 3. Pointer Type Fixes (src/ccmain/pageiterator.cpp, src/ccmain/pagesegmain.cpp, src/ccmain/tesseractclass.cpp)
|
|
38
|
+
|
|
39
|
+
**Changed from stack allocation to heap allocation** in `tesseractclass.h`:
|
|
40
|
+
- `pixa_debug_` changed from `DebugPixa` to `std::unique_ptr<DebugPixa>`
|
|
41
|
+
- This prevents large allocations on the stack, which is limited in WASM
|
|
42
|
+
|
|
43
|
+
**Updated all references** throughout the codebase:
|
|
44
|
+
- `.get()` calls added where raw pointers are needed
|
|
45
|
+
- Arrow operator `->` replaces dot operator `.` for member access
|
|
46
|
+
- Null checks added before dereferencing to prevent crashes
|
|
47
|
+
|
|
48
|
+
**Affected functions**:
|
|
49
|
+
- `PageIterator::Orientation()` - added null vector check
|
|
50
|
+
- `Tesseract::AutoPageSeg()` - updated pointer passing
|
|
51
|
+
- `Tesseract::SetupPageSegAndDetectOrientation()` - multiple pointer updates
|
|
52
|
+
- `Tesseract::Clear()` - added null check before WritePDF
|
|
53
|
+
- `Tesseract::PrepareForPageseg()` - updated Split() calls
|
|
54
|
+
- `Tesseract::PrepareForTessOCR()` - updated Split() calls
|
|
55
|
+
|
|
56
|
+
#### 4. Additional Fixes
|
|
57
|
+
|
|
58
|
+
- **Orientation detection**: Changed comparison from `> 0.0F` to `>= 0.0F` in `pageiterator.cpp` to handle null vectors gracefully when orientation info is not available
|
|
59
|
+
|
|
60
|
+
## How to Apply
|
|
61
|
+
|
|
62
|
+
These patches are applied during the WASM build process. They modify the Tesseract source code to:
|
|
63
|
+
|
|
64
|
+
1. Disable WASM-incompatible features (OpenCL, viewers, renderers)
|
|
65
|
+
2. Prevent CPUID detection in Emscripten environment
|
|
66
|
+
3. Use heap allocation instead of stack allocation for large objects
|
|
67
|
+
4. Handle missing pointer initialization gracefully
|
|
68
|
+
|
|
69
|
+
## Source
|
|
70
|
+
|
|
71
|
+
These patches are based on the proven WASM compilation approach used by the tesseract.js project, which successfully compiles Tesseract to WebAssembly and deploys it in production environments.
|
|
@@ -1,199 +1,199 @@
|
|
|
1
|
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
|
2
|
-
index 8c6845cb..fdcfc4a8 100644
|
|
3
|
-
--- a/CMakeLists.txt
|
|
4
|
-
+++ b/CMakeLists.txt
|
|
5
|
-
@@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
|
|
6
|
-
option(FAST_FLOAT "Enable float for LSTM" ON)
|
|
7
|
-
option(ENABLE_OPENCL "Enable unsupported experimental OpenCL support" OFF)
|
|
8
|
-
option(BUILD_TRAINING_TOOLS "Build training tools" ON)
|
|
9
|
-
+option(BUILD_TESSERACT_BINARY "Build Tesseract binary" ON)
|
|
10
|
-
option(BUILD_TESTS "Build tests" OFF)
|
|
11
|
-
option(USE_SYSTEM_ICU "Use system ICU" OFF)
|
|
12
|
-
option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
|
|
13
|
-
@@ -565,9 +566,7 @@ file(
|
|
14
|
-
src/cutil/*.cpp
|
|
15
|
-
src/dict/*.cpp
|
|
16
|
-
src/lstm/*.cpp
|
|
17
|
-
- src/opencl/*.cpp
|
|
18
|
-
src/textord/*.cpp
|
|
19
|
-
- src/viewer/*.cpp
|
|
20
|
-
src/wordrec/*.cpp)
|
|
21
|
-
|
|
22
|
-
if(DISABLED_LEGACY_ENGINE)
|
|
23
|
-
@@ -714,13 +713,7 @@ file(
|
|
24
|
-
set(TESSERACT_SRC
|
|
25
|
-
${TESSERACT_SRC}
|
|
26
|
-
src/api/baseapi.cpp
|
|
27
|
-
- src/api/capi.cpp
|
|
28
|
-
- src/api/renderer.cpp
|
|
29
|
-
- src/api/altorenderer.cpp
|
|
30
|
-
- src/api/hocrrenderer.cpp
|
|
31
|
-
- src/api/lstmboxrenderer.cpp
|
|
32
|
-
- src/api/pdfrenderer.cpp
|
|
33
|
-
- src/api/wordstrboxrenderer.cpp)
|
|
34
|
-
+ src/api/hocrrenderer.cpp)
|
|
35
|
-
|
|
36
|
-
set(TESSERACT_CONFIGS
|
|
37
|
-
tessdata/configs/alto
|
|
38
|
-
@@ -858,14 +851,16 @@ endif()
|
|
39
|
-
# EXECUTABLE tesseract
|
|
40
|
-
# ##############################################################################
|
|
41
|
-
|
|
42
|
-
-add_executable(tesseract src/tesseract.cpp)
|
|
43
|
-
-target_link_libraries(tesseract libtesseract)
|
|
44
|
-
-if(HAVE_TIFFIO_H AND WIN32)
|
|
45
|
-
- target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
|
46
|
-
-endif()
|
|
47
|
-
+if(BUILD_TESSERACT_BINARY)
|
|
48
|
-
+ add_executable(tesseract src/tesseract.cpp)
|
|
49
|
-
+ target_link_libraries(tesseract libtesseract)
|
|
50
|
-
+ if(HAVE_TIFFIO_H AND WIN32)
|
|
51
|
-
+ target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
|
52
|
-
+ endif()
|
|
53
|
-
|
|
54
|
-
-if(OPENMP_BUILD AND UNIX)
|
|
55
|
-
- target_link_libraries(tesseract pthread)
|
|
56
|
-
+ if(OPENMP_BUILD AND UNIX)
|
|
57
|
-
+ target_link_libraries(tesseract pthread)
|
|
58
|
-
+ endif()
|
|
59
|
-
endif()
|
|
60
|
-
|
|
61
|
-
# ##############################################################################
|
|
62
|
-
@@ -899,7 +894,11 @@ write_basic_package_version_file(
|
|
63
|
-
|
|
64
|
-
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
|
|
65
|
-
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
|
66
|
-
-install(TARGETS tesseract DESTINATION bin)
|
|
67
|
-
+
|
|
68
|
-
+if(BUILD_TESSERACT_BINARY)
|
|
69
|
-
+ install(TARGETS tesseract DESTINATION bin)
|
|
70
|
-
+endif()
|
|
71
|
-
+
|
|
72
|
-
install(
|
|
73
|
-
TARGETS libtesseract
|
|
74
|
-
EXPORT TesseractTargets
|
|
75
|
-
diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp
|
|
76
|
-
index 1afe5a5d..cb8c6d4c 100644
|
|
77
|
-
--- a/src/arch/simddetect.cpp
|
|
78
|
-
+++ b/src/arch/simddetect.cpp
|
|
79
|
-
@@ -40,10 +40,12 @@
|
|
80
|
-
|
|
81
|
-
#endif
|
|
82
|
-
|
|
83
|
-
+#if !defined(__EMSCRIPTEN__)
|
|
84
|
-
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
|
|
85
|
-
// See https://en.wikipedia.org/wiki/CPUID.
|
|
86
|
-
# define HAS_CPUID
|
|
87
|
-
#endif
|
|
88
|
-
+#endif
|
|
89
|
-
|
|
90
|
-
#if defined(HAS_CPUID)
|
|
91
|
-
# if defined(__GNUC__)
|
|
92
|
-
diff --git a/src/ccmain/pageiterator.cpp b/src/ccmain/pageiterator.cpp
|
|
93
|
-
index 64ff7f66..c0f80e5f 100644
|
|
94
|
-
--- a/src/ccmain/pageiterator.cpp
|
|
95
|
-
+++ b/src/ccmain/pageiterator.cpp
|
|
96
|
-
@@ -582,7 +582,9 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
|
|
97
|
-
up_in_image.rotate(block->re_rotation());
|
|
98
|
-
|
|
99
|
-
if (up_in_image.x() == 0.0F) {
|
|
100
|
-
- if (up_in_image.y() > 0.0F) {
|
|
101
|
-
+ // tesseract-wasm note: `up_in_image` will be a null vector if orientation
|
|
102
|
-
+ // info is not available. In that case, assume page up.
|
|
103
|
-
+ if (up_in_image.y() >= 0.0F) {
|
|
104
|
-
*orientation = ORIENTATION_PAGE_UP;
|
|
105
|
-
} else {
|
|
106
|
-
*orientation = ORIENTATION_PAGE_DOWN;
|
|
107
|
-
diff --git a/src/ccmain/pagesegmain.cpp b/src/ccmain/pagesegmain.cpp
|
|
108
|
-
index 0af44607..718e73ef 100644
|
|
109
|
-
--- a/src/ccmain/pagesegmain.cpp
|
|
110
|
-
+++ b/src/ccmain/pagesegmain.cpp
|
|
111
|
-
@@ -222,7 +222,7 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOC
|
|
112
|
-
}
|
|
113
|
-
#endif // ndef DISABLED_LEGACY_ENGINE
|
|
114
|
-
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
|
|
115
|
-
- photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
|
|
116
|
-
+ photomask_pix, pix_thresholds_, pix_grey_, pixa_debug_.get(),
|
|
117
|
-
&found_blocks, diacritic_blobs, to_blocks);
|
|
118
|
-
if (result >= 0) {
|
|
119
|
-
finder->GetDeskewVectors(&deskew_, &reskew_);
|
|
120
|
-
@@ -279,17 +279,17 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
|
121
|
-
ICOORD bleft(0, 0);
|
|
122
|
-
|
|
123
|
-
ASSERT_HOST(pix_binary_ != nullptr);
|
|
124
|
-
- if (tessedit_dump_pageseg_images) {
|
|
125
|
-
- pixa_debug_.AddPix(pix_binary_, "PageSegInput");
|
|
126
|
-
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
|
127
|
-
+ pixa_debug_->AddPix(pix_binary_, "PageSegInput");
|
|
128
|
-
}
|
|
129
|
-
// Leptonica is used to find the rule/separator lines in the input.
|
|
130
|
-
LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
|
|
131
|
-
&vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
|
|
132
|
-
- if (tessedit_dump_pageseg_images) {
|
|
133
|
-
- pixa_debug_.AddPix(pix_binary_, "NoLines");
|
|
134
|
-
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
|
135
|
-
+ pixa_debug_->AddPix(pix_binary_, "NoLines");
|
|
136
|
-
}
|
|
137
|
-
// Leptonica is used to find a mask of the photo regions in the input.
|
|
138
|
-
- *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
|
|
139
|
-
+ *photo_mask_pix = ImageFind::FindImages(pix_binary_, pixa_debug_.get());
|
|
140
|
-
if (tessedit_dump_pageseg_images) {
|
|
141
|
-
Image pix_no_image_ = nullptr;
|
|
142
|
-
if (*photo_mask_pix != nullptr) {
|
|
143
|
-
@@ -297,7 +297,7 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
|
144
|
-
} else {
|
|
145
|
-
pix_no_image_ = pix_binary_.clone();
|
|
146
|
-
}
|
|
147
|
-
- pixa_debug_.AddPix(pix_no_image_, "NoImages");
|
|
148
|
-
+ pixa_debug_->AddPix(pix_no_image_, "NoImages");
|
|
149
|
-
pix_no_image_.destroy();
|
|
150
|
-
}
|
|
151
|
-
if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
|
|
152
|
-
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
|
|
153
|
-
index fd58ac87..517f925e 100644
|
|
154
|
-
--- a/src/ccmain/tesseractclass.cpp
|
|
155
|
-
+++ b/src/ccmain/tesseractclass.cpp
|
|
156
|
-
@@ -487,8 +487,10 @@ Dict &Tesseract::getDict() {
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
void Tesseract::Clear() {
|
|
160
|
-
- std::string debug_name = imagebasename + "_debug.pdf";
|
|
161
|
-
- pixa_debug_.WritePDF(debug_name.c_str());
|
|
162
|
-
+ if (pixa_debug_) {
|
|
163
|
-
+ std::string debug_name = imagebasename + "_debug.pdf";
|
|
164
|
-
+ pixa_debug_->WritePDF(debug_name.c_str());
|
|
165
|
-
+ }
|
|
166
|
-
pix_binary_.destroy();
|
|
167
|
-
pix_grey_.destroy();
|
|
168
|
-
pix_thresholds_.destroy();
|
|
169
|
-
@@ -572,7 +574,7 @@ void Tesseract::PrepareForPageseg() {
|
|
170
|
-
// the newly split image.
|
|
171
|
-
splitter_.set_orig_pix(pix_binary());
|
|
172
|
-
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
|
|
173
|
-
- if (splitter_.Split(true, &pixa_debug_)) {
|
|
174
|
-
+ if (splitter_.Split(true, pixa_debug_.get())) {
|
|
175
|
-
ASSERT_HOST(splitter_.splitted_image());
|
|
176
|
-
pix_binary_.destroy();
|
|
177
|
-
pix_binary_ = splitter_.splitted_image().clone();
|
|
178
|
-
@@ -599,7 +601,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, O
|
|
179
|
-
splitter_.set_segmentation_block_list(block_list);
|
|
180
|
-
splitter_.set_ocr_split_strategy(max_ocr_strategy);
|
|
181
|
-
// Run the splitter for OCR
|
|
182
|
-
- bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
|
|
183
|
-
+ bool split_for_ocr = splitter_.Split(false, pixa_debug_.get());
|
|
184
|
-
// Restore pix_binary to the binarized original pix for future reference.
|
|
185
|
-
ASSERT_HOST(splitter_.orig_pix());
|
|
186
|
-
pix_binary_.destroy();
|
|
187
|
-
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
|
|
188
|
-
index 732bb9e6..030aa5bc 100644
|
|
189
|
-
--- a/src/ccmain/tesseractclass.h
|
|
190
|
-
+++ b/src/ccmain/tesseractclass.h
|
|
191
|
-
@@ -986,7 +986,7 @@ private:
|
|
192
|
-
// Thresholds that were used to generate the thresholded image from grey.
|
|
193
|
-
Image pix_thresholds_;
|
|
194
|
-
// Debug images. If non-empty, will be written on destruction.
|
|
195
|
-
- DebugPixa pixa_debug_;
|
|
196
|
-
+ std::unique_ptr<DebugPixa> pixa_debug_;
|
|
197
|
-
// Input image resolution after any scaling. The resolution is not well
|
|
198
|
-
// transmitted by operations on Pix, so we keep an independent record here.
|
|
199
|
-
int source_resolution_;
|
|
1
|
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
|
|
2
|
+
index 8c6845cb..fdcfc4a8 100644
|
|
3
|
+
--- a/CMakeLists.txt
|
|
4
|
+
+++ b/CMakeLists.txt
|
|
5
|
+
@@ -90,6 +90,7 @@ option(ENABLE_LTO "Enable link-time optimization" OFF)
|
|
6
|
+
option(FAST_FLOAT "Enable float for LSTM" ON)
|
|
7
|
+
option(ENABLE_OPENCL "Enable unsupported experimental OpenCL support" OFF)
|
|
8
|
+
option(BUILD_TRAINING_TOOLS "Build training tools" ON)
|
|
9
|
+
+option(BUILD_TESSERACT_BINARY "Build Tesseract binary" ON)
|
|
10
|
+
option(BUILD_TESTS "Build tests" OFF)
|
|
11
|
+
option(USE_SYSTEM_ICU "Use system ICU" OFF)
|
|
12
|
+
option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF)
|
|
13
|
+
@@ -565,9 +566,7 @@ file(
|
|
14
|
+
src/cutil/*.cpp
|
|
15
|
+
src/dict/*.cpp
|
|
16
|
+
src/lstm/*.cpp
|
|
17
|
+
- src/opencl/*.cpp
|
|
18
|
+
src/textord/*.cpp
|
|
19
|
+
- src/viewer/*.cpp
|
|
20
|
+
src/wordrec/*.cpp)
|
|
21
|
+
|
|
22
|
+
if(DISABLED_LEGACY_ENGINE)
|
|
23
|
+
@@ -714,13 +713,7 @@ file(
|
|
24
|
+
set(TESSERACT_SRC
|
|
25
|
+
${TESSERACT_SRC}
|
|
26
|
+
src/api/baseapi.cpp
|
|
27
|
+
- src/api/capi.cpp
|
|
28
|
+
- src/api/renderer.cpp
|
|
29
|
+
- src/api/altorenderer.cpp
|
|
30
|
+
- src/api/hocrrenderer.cpp
|
|
31
|
+
- src/api/lstmboxrenderer.cpp
|
|
32
|
+
- src/api/pdfrenderer.cpp
|
|
33
|
+
- src/api/wordstrboxrenderer.cpp)
|
|
34
|
+
+ src/api/hocrrenderer.cpp)
|
|
35
|
+
|
|
36
|
+
set(TESSERACT_CONFIGS
|
|
37
|
+
tessdata/configs/alto
|
|
38
|
+
@@ -858,14 +851,16 @@ endif()
|
|
39
|
+
# EXECUTABLE tesseract
|
|
40
|
+
# ##############################################################################
|
|
41
|
+
|
|
42
|
+
-add_executable(tesseract src/tesseract.cpp)
|
|
43
|
+
-target_link_libraries(tesseract libtesseract)
|
|
44
|
+
-if(HAVE_TIFFIO_H AND WIN32)
|
|
45
|
+
- target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
|
46
|
+
-endif()
|
|
47
|
+
+if(BUILD_TESSERACT_BINARY)
|
|
48
|
+
+ add_executable(tesseract src/tesseract.cpp)
|
|
49
|
+
+ target_link_libraries(tesseract libtesseract)
|
|
50
|
+
+ if(HAVE_TIFFIO_H AND WIN32)
|
|
51
|
+
+ target_link_libraries(tesseract ${TIFF_LIBRARIES})
|
|
52
|
+
+ endif()
|
|
53
|
+
|
|
54
|
+
-if(OPENMP_BUILD AND UNIX)
|
|
55
|
+
- target_link_libraries(tesseract pthread)
|
|
56
|
+
+ if(OPENMP_BUILD AND UNIX)
|
|
57
|
+
+ target_link_libraries(tesseract pthread)
|
|
58
|
+
+ endif()
|
|
59
|
+
endif()
|
|
60
|
+
|
|
61
|
+
# ##############################################################################
|
|
62
|
+
@@ -899,7 +894,11 @@ write_basic_package_version_file(
|
|
63
|
+
|
|
64
|
+
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc
|
|
65
|
+
DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
|
66
|
+
-install(TARGETS tesseract DESTINATION bin)
|
|
67
|
+
+
|
|
68
|
+
+if(BUILD_TESSERACT_BINARY)
|
|
69
|
+
+ install(TARGETS tesseract DESTINATION bin)
|
|
70
|
+
+endif()
|
|
71
|
+
+
|
|
72
|
+
install(
|
|
73
|
+
TARGETS libtesseract
|
|
74
|
+
EXPORT TesseractTargets
|
|
75
|
+
diff --git a/src/arch/simddetect.cpp b/src/arch/simddetect.cpp
|
|
76
|
+
index 1afe5a5d..cb8c6d4c 100644
|
|
77
|
+
--- a/src/arch/simddetect.cpp
|
|
78
|
+
+++ b/src/arch/simddetect.cpp
|
|
79
|
+
@@ -40,10 +40,12 @@
|
|
80
|
+
|
|
81
|
+
#endif
|
|
82
|
+
|
|
83
|
+
+#if !defined(__EMSCRIPTEN__)
|
|
84
|
+
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
|
|
85
|
+
// See https://en.wikipedia.org/wiki/CPUID.
|
|
86
|
+
# define HAS_CPUID
|
|
87
|
+
#endif
|
|
88
|
+
+#endif
|
|
89
|
+
|
|
90
|
+
#if defined(HAS_CPUID)
|
|
91
|
+
# if defined(__GNUC__)
|
|
92
|
+
diff --git a/src/ccmain/pageiterator.cpp b/src/ccmain/pageiterator.cpp
|
|
93
|
+
index 64ff7f66..c0f80e5f 100644
|
|
94
|
+
--- a/src/ccmain/pageiterator.cpp
|
|
95
|
+
+++ b/src/ccmain/pageiterator.cpp
|
|
96
|
+
@@ -582,7 +582,9 @@ void PageIterator::Orientation(tesseract::Orientation *orientation,
|
|
97
|
+
up_in_image.rotate(block->re_rotation());
|
|
98
|
+
|
|
99
|
+
if (up_in_image.x() == 0.0F) {
|
|
100
|
+
- if (up_in_image.y() > 0.0F) {
|
|
101
|
+
+ // tesseract-wasm note: `up_in_image` will be a null vector if orientation
|
|
102
|
+
+ // info is not available. In that case, assume page up.
|
|
103
|
+
+ if (up_in_image.y() >= 0.0F) {
|
|
104
|
+
*orientation = ORIENTATION_PAGE_UP;
|
|
105
|
+
} else {
|
|
106
|
+
*orientation = ORIENTATION_PAGE_DOWN;
|
|
107
|
+
diff --git a/src/ccmain/pagesegmain.cpp b/src/ccmain/pagesegmain.cpp
|
|
108
|
+
index 0af44607..718e73ef 100644
|
|
109
|
+
--- a/src/ccmain/pagesegmain.cpp
|
|
110
|
+
+++ b/src/ccmain/pagesegmain.cpp
|
|
111
|
+
@@ -222,7 +222,7 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOC
|
|
112
|
+
}
|
|
113
|
+
#endif // ndef DISABLED_LEGACY_ENGINE
|
|
114
|
+
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
|
|
115
|
+
- photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
|
|
116
|
+
+ photomask_pix, pix_thresholds_, pix_grey_, pixa_debug_.get(),
|
|
117
|
+
&found_blocks, diacritic_blobs, to_blocks);
|
|
118
|
+
if (result >= 0) {
|
|
119
|
+
finder->GetDeskewVectors(&deskew_, &reskew_);
|
|
120
|
+
@@ -279,17 +279,17 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
|
121
|
+
ICOORD bleft(0, 0);
|
|
122
|
+
|
|
123
|
+
ASSERT_HOST(pix_binary_ != nullptr);
|
|
124
|
+
- if (tessedit_dump_pageseg_images) {
|
|
125
|
+
- pixa_debug_.AddPix(pix_binary_, "PageSegInput");
|
|
126
|
+
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
|
127
|
+
+ pixa_debug_->AddPix(pix_binary_, "PageSegInput");
|
|
128
|
+
}
|
|
129
|
+
// Leptonica is used to find the rule/separator lines in the input.
|
|
130
|
+
LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
|
|
131
|
+
&vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
|
|
132
|
+
- if (tessedit_dump_pageseg_images) {
|
|
133
|
+
- pixa_debug_.AddPix(pix_binary_, "NoLines");
|
|
134
|
+
+ if (tessedit_dump_pageseg_images && pixa_debug_) {
|
|
135
|
+
+ pixa_debug_->AddPix(pix_binary_, "NoLines");
|
|
136
|
+
}
|
|
137
|
+
// Leptonica is used to find a mask of the photo regions in the input.
|
|
138
|
+
- *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
|
|
139
|
+
+ *photo_mask_pix = ImageFind::FindImages(pix_binary_, pixa_debug_.get());
|
|
140
|
+
if (tessedit_dump_pageseg_images) {
|
|
141
|
+
Image pix_no_image_ = nullptr;
|
|
142
|
+
if (*photo_mask_pix != nullptr) {
|
|
143
|
+
@@ -297,7 +297,7 @@ ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mo
|
|
144
|
+
} else {
|
|
145
|
+
pix_no_image_ = pix_binary_.clone();
|
|
146
|
+
}
|
|
147
|
+
- pixa_debug_.AddPix(pix_no_image_, "NoImages");
|
|
148
|
+
+ pixa_debug_->AddPix(pix_no_image_, "NoImages");
|
|
149
|
+
pix_no_image_.destroy();
|
|
150
|
+
}
|
|
151
|
+
if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
|
|
152
|
+
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
|
|
153
|
+
index fd58ac87..517f925e 100644
|
|
154
|
+
--- a/src/ccmain/tesseractclass.cpp
|
|
155
|
+
+++ b/src/ccmain/tesseractclass.cpp
|
|
156
|
+
@@ -487,8 +487,10 @@ Dict &Tesseract::getDict() {
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
void Tesseract::Clear() {
|
|
160
|
+
- std::string debug_name = imagebasename + "_debug.pdf";
|
|
161
|
+
- pixa_debug_.WritePDF(debug_name.c_str());
|
|
162
|
+
+ if (pixa_debug_) {
|
|
163
|
+
+ std::string debug_name = imagebasename + "_debug.pdf";
|
|
164
|
+
+ pixa_debug_->WritePDF(debug_name.c_str());
|
|
165
|
+
+ }
|
|
166
|
+
pix_binary_.destroy();
|
|
167
|
+
pix_grey_.destroy();
|
|
168
|
+
pix_thresholds_.destroy();
|
|
169
|
+
@@ -572,7 +574,7 @@ void Tesseract::PrepareForPageseg() {
|
|
170
|
+
// the newly split image.
|
|
171
|
+
splitter_.set_orig_pix(pix_binary());
|
|
172
|
+
splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
|
|
173
|
+
- if (splitter_.Split(true, &pixa_debug_)) {
|
|
174
|
+
+ if (splitter_.Split(true, pixa_debug_.get())) {
|
|
175
|
+
ASSERT_HOST(splitter_.splitted_image());
|
|
176
|
+
pix_binary_.destroy();
|
|
177
|
+
pix_binary_ = splitter_.splitted_image().clone();
|
|
178
|
+
@@ -599,7 +601,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, O
|
|
179
|
+
splitter_.set_segmentation_block_list(block_list);
|
|
180
|
+
splitter_.set_ocr_split_strategy(max_ocr_strategy);
|
|
181
|
+
// Run the splitter for OCR
|
|
182
|
+
- bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
|
|
183
|
+
+ bool split_for_ocr = splitter_.Split(false, pixa_debug_.get());
|
|
184
|
+
// Restore pix_binary to the binarized original pix for future reference.
|
|
185
|
+
ASSERT_HOST(splitter_.orig_pix());
|
|
186
|
+
pix_binary_.destroy();
|
|
187
|
+
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
|
|
188
|
+
index 732bb9e6..030aa5bc 100644
|
|
189
|
+
--- a/src/ccmain/tesseractclass.h
|
|
190
|
+
+++ b/src/ccmain/tesseractclass.h
|
|
191
|
+
@@ -986,7 +986,7 @@ private:
|
|
192
|
+
// Thresholds that were used to generate the thresholded image from grey.
|
|
193
|
+
Image pix_thresholds_;
|
|
194
|
+
// Debug images. If non-empty, will be written on destruction.
|
|
195
|
+
- DebugPixa pixa_debug_;
|
|
196
|
+
+ std::unique_ptr<DebugPixa> pixa_debug_;
|
|
197
|
+
// Input image resolution after any scaling. The resolution is not well
|
|
198
|
+
// transmitted by operations on Pix, so we keep an independent record here.
|
|
199
|
+
int source_resolution_;
|