kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,219 +1,219 @@
|
|
|
1
|
-
//! Language detection post-processor.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides a PostProcessor plugin that detects languages in
|
|
4
|
-
//! extraction results and stores them in the result.
|
|
5
|
-
|
|
6
|
-
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
7
|
-
use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
|
|
8
|
-
use async_trait::async_trait;
|
|
9
|
-
|
|
10
|
-
/// Post-processor that detects languages in document content.
|
|
11
|
-
///
|
|
12
|
-
/// This processor:
|
|
13
|
-
/// - Runs in the Early processing stage
|
|
14
|
-
/// - Only processes when `config.language_detection` is configured
|
|
15
|
-
/// - Stores detected languages in `result.detected_languages`
|
|
16
|
-
/// - Uses the whatlang library for detection
|
|
17
|
-
///
|
|
18
|
-
/// # Example
|
|
19
|
-
///
|
|
20
|
-
/// ```rust,no_run
|
|
21
|
-
/// use kreuzberg::plugins::{Plugin, PostProcessor};
|
|
22
|
-
/// use kreuzberg::language_detection::processor::LanguageDetector;
|
|
23
|
-
///
|
|
24
|
-
/// let processor = LanguageDetector;
|
|
25
|
-
/// assert_eq!(processor.name(), "language-detection");
|
|
26
|
-
/// ```
|
|
27
|
-
#[derive(Debug, Clone, Copy)]
|
|
28
|
-
pub struct LanguageDetector;
|
|
29
|
-
|
|
30
|
-
impl Plugin for LanguageDetector {
|
|
31
|
-
fn name(&self) -> &str {
|
|
32
|
-
"language-detection"
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
fn version(&self) -> String {
|
|
36
|
-
env!("CARGO_PKG_VERSION").to_string()
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
fn initialize(&self) -> Result<()> {
|
|
40
|
-
Ok(())
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
fn shutdown(&self) -> Result<()> {
|
|
44
|
-
Ok(())
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
49
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
50
|
-
impl PostProcessor for LanguageDetector {
|
|
51
|
-
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
52
|
-
let lang_config = match &config.language_detection {
|
|
53
|
-
Some(cfg) => cfg,
|
|
54
|
-
None => return Ok(()),
|
|
55
|
-
};
|
|
56
|
-
|
|
57
|
-
match super::detect_languages(&result.content, lang_config)
|
|
58
|
-
.map_err(|e| KreuzbergError::Other(format!("Language detection failed: {}", e)))?
|
|
59
|
-
{
|
|
60
|
-
Some(languages) => {
|
|
61
|
-
result.detected_languages = Some(languages);
|
|
62
|
-
}
|
|
63
|
-
None => {
|
|
64
|
-
result.detected_languages = None;
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
Ok(())
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
fn processing_stage(&self) -> ProcessingStage {
|
|
72
|
-
ProcessingStage::Early
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
76
|
-
config.language_detection.is_some()
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
|
|
80
|
-
let text_length = result.content.len();
|
|
81
|
-
// Language detection is relatively fast: ~1ms per 1KB
|
|
82
|
-
(text_length / 1024).max(1) as u64
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
#[cfg(test)]
|
|
87
|
-
mod tests {
|
|
88
|
-
use super::*;
|
|
89
|
-
use crate::core::config::LanguageDetectionConfig;
|
|
90
|
-
use crate::types::Metadata;
|
|
91
|
-
|
|
92
|
-
#[tokio::test]
|
|
93
|
-
async fn test_language_detector_processor() {
|
|
94
|
-
let processor = LanguageDetector;
|
|
95
|
-
let config = ExtractionConfig {
|
|
96
|
-
language_detection: Some(LanguageDetectionConfig {
|
|
97
|
-
enabled: true,
|
|
98
|
-
min_confidence: 0.8,
|
|
99
|
-
detect_multiple: false,
|
|
100
|
-
}),
|
|
101
|
-
..Default::default()
|
|
102
|
-
};
|
|
103
|
-
|
|
104
|
-
let mut result = ExtractionResult {
|
|
105
|
-
content: "Hello world! This is a test of the language detection system.".to_string(),
|
|
106
|
-
mime_type: "text/plain".to_string(),
|
|
107
|
-
metadata: Metadata::default(),
|
|
108
|
-
tables: vec![],
|
|
109
|
-
detected_languages: None,
|
|
110
|
-
chunks: None,
|
|
111
|
-
images: None,
|
|
112
|
-
pages: None,
|
|
113
|
-
};
|
|
114
|
-
|
|
115
|
-
processor.process(&mut result, &config).await.unwrap();
|
|
116
|
-
|
|
117
|
-
assert!(result.detected_languages.is_some());
|
|
118
|
-
let langs = result.detected_languages.unwrap();
|
|
119
|
-
assert!(!langs.is_empty());
|
|
120
|
-
assert_eq!(langs[0], "eng");
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
#[tokio::test]
|
|
124
|
-
async fn test_language_detector_no_config() {
|
|
125
|
-
let processor = LanguageDetector;
|
|
126
|
-
let config = ExtractionConfig::default();
|
|
127
|
-
|
|
128
|
-
let mut result = ExtractionResult {
|
|
129
|
-
content: "Hello world!".to_string(),
|
|
130
|
-
mime_type: "text/plain".to_string(),
|
|
131
|
-
metadata: Metadata::default(),
|
|
132
|
-
tables: vec![],
|
|
133
|
-
detected_languages: None,
|
|
134
|
-
chunks: None,
|
|
135
|
-
images: None,
|
|
136
|
-
pages: None,
|
|
137
|
-
};
|
|
138
|
-
|
|
139
|
-
processor.process(&mut result, &config).await.unwrap();
|
|
140
|
-
|
|
141
|
-
assert!(result.detected_languages.is_none());
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
#[test]
|
|
145
|
-
fn test_language_detector_plugin_interface() {
|
|
146
|
-
let processor = LanguageDetector;
|
|
147
|
-
assert_eq!(processor.name(), "language-detection");
|
|
148
|
-
assert!(!processor.version().is_empty());
|
|
149
|
-
assert!(processor.initialize().is_ok());
|
|
150
|
-
assert!(processor.shutdown().is_ok());
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
#[test]
|
|
154
|
-
fn test_language_detector_stage() {
|
|
155
|
-
let processor = LanguageDetector;
|
|
156
|
-
assert_eq!(processor.processing_stage(), ProcessingStage::Early);
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
#[test]
|
|
160
|
-
fn test_language_detector_should_process() {
|
|
161
|
-
let processor = LanguageDetector;
|
|
162
|
-
|
|
163
|
-
let result = ExtractionResult {
|
|
164
|
-
content: "Sample text".to_string(),
|
|
165
|
-
mime_type: "text/plain".to_string(),
|
|
166
|
-
metadata: Metadata::default(),
|
|
167
|
-
tables: vec![],
|
|
168
|
-
detected_languages: None,
|
|
169
|
-
chunks: None,
|
|
170
|
-
images: None,
|
|
171
|
-
pages: None,
|
|
172
|
-
};
|
|
173
|
-
|
|
174
|
-
let config_with_lang = ExtractionConfig {
|
|
175
|
-
language_detection: Some(LanguageDetectionConfig {
|
|
176
|
-
enabled: true,
|
|
177
|
-
min_confidence: 0.8,
|
|
178
|
-
detect_multiple: false,
|
|
179
|
-
}),
|
|
180
|
-
..Default::default()
|
|
181
|
-
};
|
|
182
|
-
assert!(processor.should_process(&result, &config_with_lang));
|
|
183
|
-
|
|
184
|
-
let config_without_lang = ExtractionConfig::default();
|
|
185
|
-
assert!(!processor.should_process(&result, &config_without_lang));
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
#[test]
|
|
189
|
-
fn test_language_detector_estimated_duration() {
|
|
190
|
-
let processor = LanguageDetector;
|
|
191
|
-
|
|
192
|
-
let short_result = ExtractionResult {
|
|
193
|
-
content: "Short".to_string(),
|
|
194
|
-
mime_type: "text/plain".to_string(),
|
|
195
|
-
metadata: Metadata::default(),
|
|
196
|
-
tables: vec![],
|
|
197
|
-
detected_languages: None,
|
|
198
|
-
chunks: None,
|
|
199
|
-
images: None,
|
|
200
|
-
pages: None,
|
|
201
|
-
};
|
|
202
|
-
|
|
203
|
-
let long_result = ExtractionResult {
|
|
204
|
-
content: "a".repeat(10000),
|
|
205
|
-
mime_type: "text/plain".to_string(),
|
|
206
|
-
metadata: Metadata::default(),
|
|
207
|
-
tables: vec![],
|
|
208
|
-
detected_languages: None,
|
|
209
|
-
chunks: None,
|
|
210
|
-
images: None,
|
|
211
|
-
pages: None,
|
|
212
|
-
};
|
|
213
|
-
|
|
214
|
-
let short_duration = processor.estimated_duration_ms(&short_result);
|
|
215
|
-
let long_duration = processor.estimated_duration_ms(&long_result);
|
|
216
|
-
|
|
217
|
-
assert!(long_duration > short_duration);
|
|
218
|
-
}
|
|
219
|
-
}
|
|
1
|
+
//! Language detection post-processor.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a PostProcessor plugin that detects languages in
|
|
4
|
+
//! extraction results and stores them in the result.
|
|
5
|
+
|
|
6
|
+
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
7
|
+
use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
|
|
8
|
+
use async_trait::async_trait;
|
|
9
|
+
|
|
10
|
+
/// Post-processor that detects languages in document content.
|
|
11
|
+
///
|
|
12
|
+
/// This processor:
|
|
13
|
+
/// - Runs in the Early processing stage
|
|
14
|
+
/// - Only processes when `config.language_detection` is configured
|
|
15
|
+
/// - Stores detected languages in `result.detected_languages`
|
|
16
|
+
/// - Uses the whatlang library for detection
|
|
17
|
+
///
|
|
18
|
+
/// # Example
|
|
19
|
+
///
|
|
20
|
+
/// ```rust,no_run
|
|
21
|
+
/// use kreuzberg::plugins::{Plugin, PostProcessor};
|
|
22
|
+
/// use kreuzberg::language_detection::processor::LanguageDetector;
|
|
23
|
+
///
|
|
24
|
+
/// let processor = LanguageDetector;
|
|
25
|
+
/// assert_eq!(processor.name(), "language-detection");
|
|
26
|
+
/// ```
|
|
27
|
+
#[derive(Debug, Clone, Copy)]
|
|
28
|
+
pub struct LanguageDetector;
|
|
29
|
+
|
|
30
|
+
impl Plugin for LanguageDetector {
|
|
31
|
+
fn name(&self) -> &str {
|
|
32
|
+
"language-detection"
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
fn version(&self) -> String {
|
|
36
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
fn initialize(&self) -> Result<()> {
|
|
40
|
+
Ok(())
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
fn shutdown(&self) -> Result<()> {
|
|
44
|
+
Ok(())
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
49
|
+
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
50
|
+
impl PostProcessor for LanguageDetector {
|
|
51
|
+
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
52
|
+
let lang_config = match &config.language_detection {
|
|
53
|
+
Some(cfg) => cfg,
|
|
54
|
+
None => return Ok(()),
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
match super::detect_languages(&result.content, lang_config)
|
|
58
|
+
.map_err(|e| KreuzbergError::Other(format!("Language detection failed: {}", e)))?
|
|
59
|
+
{
|
|
60
|
+
Some(languages) => {
|
|
61
|
+
result.detected_languages = Some(languages);
|
|
62
|
+
}
|
|
63
|
+
None => {
|
|
64
|
+
result.detected_languages = None;
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
Ok(())
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
72
|
+
ProcessingStage::Early
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
76
|
+
config.language_detection.is_some()
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
|
|
80
|
+
let text_length = result.content.len();
|
|
81
|
+
// Language detection is relatively fast: ~1ms per 1KB
|
|
82
|
+
(text_length / 1024).max(1) as u64
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::LanguageDetectionConfig;
    use crate::types::Metadata;

    /// Builds a plain-text extraction result holding `content`, with every
    /// optional field unset and no detected languages.
    fn plain_text_result(content: impl Into<String>) -> ExtractionResult {
        ExtractionResult {
            content: content.into(),
            mime_type: "text/plain".to_string(),
            metadata: Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
        }
    }

    /// Builds an extraction config with single-language detection enabled at a
    /// 0.8 confidence threshold; every other setting keeps its default.
    fn detection_config() -> ExtractionConfig {
        ExtractionConfig {
            language_detection: Some(LanguageDetectionConfig {
                enabled: true,
                min_confidence: 0.8,
                detect_multiple: false,
            }),
            ..Default::default()
        }
    }

    #[tokio::test]
    async fn test_language_detector_processor() {
        let detector = LanguageDetector;
        let config = detection_config();
        let mut result = plain_text_result("Hello world! This is a test of the language detection system.");

        detector.process(&mut result, &config).await.unwrap();

        assert!(result.detected_languages.is_some());
        let detected = result.detected_languages.unwrap();
        assert!(!detected.is_empty());
        assert_eq!(detected[0], "eng");
    }

    #[tokio::test]
    async fn test_language_detector_no_config() {
        let detector = LanguageDetector;
        let config = ExtractionConfig::default();
        let mut result = plain_text_result("Hello world!");

        detector.process(&mut result, &config).await.unwrap();

        // Without a detection config the processor must leave the field unset.
        assert!(result.detected_languages.is_none());
    }

    #[test]
    fn test_language_detector_plugin_interface() {
        let detector = LanguageDetector;

        assert_eq!(detector.name(), "language-detection");
        assert!(!detector.version().is_empty());
        assert!(detector.initialize().is_ok());
        assert!(detector.shutdown().is_ok());
    }

    #[test]
    fn test_language_detector_stage() {
        let detector = LanguageDetector;

        assert_eq!(detector.processing_stage(), ProcessingStage::Early);
    }

    #[test]
    fn test_language_detector_should_process() {
        let detector = LanguageDetector;
        let result = plain_text_result("Sample text");

        let config_with_lang = detection_config();
        assert!(detector.should_process(&result, &config_with_lang));

        let config_without_lang = ExtractionConfig::default();
        assert!(!detector.should_process(&result, &config_without_lang));
    }

    #[test]
    fn test_language_detector_estimated_duration() {
        let detector = LanguageDetector;
        let short_result = plain_text_result("Short");
        let long_result = plain_text_result("a".repeat(10000));

        let short_duration = detector.estimated_duration_ms(&short_result);
        let long_duration = detector.estimated_duration_ms(&long_result);

        assert!(long_duration > short_duration);
    }
}
|
data/vendor/kreuzberg/src/lib.rs
CHANGED
|
@@ -1,113 +1,113 @@
|
|
|
1
|
-
//! Kreuzberg - High-Performance Document Intelligence Library
|
|
2
|
-
//!
|
|
3
|
-
//! Kreuzberg is a Rust-first document extraction library with language-agnostic plugin support.
|
|
4
|
-
//! It provides fast, accurate extraction from PDFs, images, Office documents, emails, and more.
|
|
5
|
-
//!
|
|
6
|
-
//! # Quick Start
|
|
7
|
-
//!
|
|
8
|
-
//! ```rust,no_run
|
|
9
|
-
//! use kreuzberg::{extract_file_sync, ExtractionConfig};
|
|
10
|
-
//!
|
|
11
|
-
//! # fn main() -> kreuzberg::Result<()> {
|
|
12
|
-
//! // Extract content from a file
|
|
13
|
-
//! let config = ExtractionConfig::default();
|
|
14
|
-
//! let result = extract_file_sync("document.pdf", None, &config)?;
|
|
15
|
-
//! println!("Extracted: {}", result.content);
|
|
16
|
-
//! # Ok(())
|
|
17
|
-
//! # }
|
|
18
|
-
//! ```
|
|
19
|
-
//!
|
|
20
|
-
//! # Architecture
|
|
21
|
-
//!
|
|
22
|
-
//! - **Core Module** (`core`): Main extraction orchestration, MIME detection, config loading
|
|
23
|
-
//! - **Plugin System**: Language-agnostic plugin architecture
|
|
24
|
-
//! - **Extractors**: Format-specific extraction (PDF, images, Office docs, email, etc.)
|
|
25
|
-
//! - **OCR**: Multiple OCR backend support (Tesseract, EasyOCR, PaddleOCR)
|
|
26
|
-
//!
|
|
27
|
-
//! # Features
|
|
28
|
-
//!
|
|
29
|
-
//! - Fast parallel processing with async/await
|
|
30
|
-
//! - Priority-based extractor selection
|
|
31
|
-
//! - Comprehensive MIME type detection (118+ file extensions)
|
|
32
|
-
//! - Configurable caching and quality processing
|
|
33
|
-
//! - Cross-language plugin support (Python, Node.js planned)
|
|
34
|
-
|
|
35
|
-
#![deny(unsafe_code)]
|
|
36
|
-
|
|
37
|
-
pub mod cache;
|
|
38
|
-
pub mod core;
|
|
39
|
-
pub mod error;
|
|
40
|
-
pub mod extraction;
|
|
41
|
-
pub mod extractors;
|
|
42
|
-
pub mod panic_context;
|
|
43
|
-
pub mod plugins;
|
|
44
|
-
pub mod text;
|
|
45
|
-
pub mod types;
|
|
46
|
-
|
|
47
|
-
#[cfg(feature = "quality")]
|
|
48
|
-
pub mod utils;
|
|
49
|
-
|
|
50
|
-
#[cfg(feature = "api")]
|
|
51
|
-
pub mod api;
|
|
52
|
-
|
|
53
|
-
#[cfg(feature = "mcp")]
|
|
54
|
-
pub mod mcp;
|
|
55
|
-
|
|
56
|
-
#[cfg(feature = "chunking")]
|
|
57
|
-
pub mod chunking;
|
|
58
|
-
|
|
59
|
-
#[cfg(feature = "embeddings")]
|
|
60
|
-
pub mod embeddings;
|
|
61
|
-
|
|
62
|
-
#[cfg(feature = "ocr")]
|
|
63
|
-
pub mod image;
|
|
64
|
-
|
|
65
|
-
#[cfg(feature = "language-detection")]
|
|
66
|
-
pub mod language_detection;
|
|
67
|
-
|
|
68
|
-
#[cfg(feature = "stopwords")]
|
|
69
|
-
pub mod stopwords;
|
|
70
|
-
|
|
71
|
-
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
72
|
-
pub mod keywords;
|
|
73
|
-
|
|
74
|
-
#[cfg(feature = "ocr")]
|
|
75
|
-
pub mod ocr;
|
|
76
|
-
|
|
77
|
-
#[cfg(feature = "pdf")]
|
|
78
|
-
pub mod pdf;
|
|
79
|
-
|
|
80
|
-
pub use error::{KreuzbergError, Result};
|
|
81
|
-
pub use types::*;
|
|
82
|
-
|
|
83
|
-
#[cfg(feature = "tokio-runtime")]
|
|
84
|
-
pub use core::extractor::{batch_extract_bytes, batch_extract_file};
|
|
85
|
-
pub use core::extractor::{extract_bytes, extract_file};
|
|
86
|
-
|
|
87
|
-
// Available in WASM (bytes-based)
|
|
88
|
-
pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
|
|
89
|
-
|
|
90
|
-
// Only available with filesystem access
|
|
91
|
-
#[cfg(feature = "tokio-runtime")]
|
|
92
|
-
pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
|
|
93
|
-
|
|
94
|
-
pub use core::config::{
|
|
95
|
-
ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
|
|
96
|
-
LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
|
|
97
|
-
};
|
|
98
|
-
|
|
99
|
-
#[cfg(feature = "pdf")]
|
|
100
|
-
pub use core::config::PdfConfig;
|
|
101
|
-
|
|
102
|
-
pub use core::mime::{
|
|
103
|
-
DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,
|
|
104
|
-
PLAIN_TEXT_MIME_TYPE, POWER_POINT_MIME_TYPE, XML_MIME_TYPE, detect_mime_type, detect_mime_type_from_bytes,
|
|
105
|
-
detect_or_validate, get_extensions_for_mime, validate_mime_type,
|
|
106
|
-
};
|
|
107
|
-
|
|
108
|
-
pub use plugins::registry::{
|
|
109
|
-
get_document_extractor_registry, get_ocr_backend_registry, get_post_processor_registry, get_validator_registry,
|
|
110
|
-
};
|
|
111
|
-
|
|
112
|
-
#[cfg(feature = "embeddings")]
|
|
113
|
-
pub use embeddings::{EMBEDDING_PRESETS, EmbeddingPreset, get_preset, list_presets};
|
|
1
|
+
//! Kreuzberg - High-Performance Document Intelligence Library
|
|
2
|
+
//!
|
|
3
|
+
//! Kreuzberg is a Rust-first document extraction library with language-agnostic plugin support.
|
|
4
|
+
//! It provides fast, accurate extraction from PDFs, images, Office documents, emails, and more.
|
|
5
|
+
//!
|
|
6
|
+
//! # Quick Start
|
|
7
|
+
//!
|
|
8
|
+
//! ```rust,no_run
|
|
9
|
+
//! use kreuzberg::{extract_file_sync, ExtractionConfig};
|
|
10
|
+
//!
|
|
11
|
+
//! # fn main() -> kreuzberg::Result<()> {
|
|
12
|
+
//! // Extract content from a file
|
|
13
|
+
//! let config = ExtractionConfig::default();
|
|
14
|
+
//! let result = extract_file_sync("document.pdf", None, &config)?;
|
|
15
|
+
//! println!("Extracted: {}", result.content);
|
|
16
|
+
//! # Ok(())
|
|
17
|
+
//! # }
|
|
18
|
+
//! ```
|
|
19
|
+
//!
|
|
20
|
+
//! # Architecture
|
|
21
|
+
//!
|
|
22
|
+
//! - **Core Module** (`core`): Main extraction orchestration, MIME detection, config loading
|
|
23
|
+
//! - **Plugin System**: Language-agnostic plugin architecture
|
|
24
|
+
//! - **Extractors**: Format-specific extraction (PDF, images, Office docs, email, etc.)
|
|
25
|
+
//! - **OCR**: Multiple OCR backend support (Tesseract, EasyOCR, PaddleOCR)
|
|
26
|
+
//!
|
|
27
|
+
//! # Features
|
|
28
|
+
//!
|
|
29
|
+
//! - Fast parallel processing with async/await
|
|
30
|
+
//! - Priority-based extractor selection
|
|
31
|
+
//! - Comprehensive MIME type detection (118+ file extensions)
|
|
32
|
+
//! - Configurable caching and quality processing
|
|
33
|
+
//! - Cross-language plugin support (Python, Node.js planned)
|
|
34
|
+
|
|
35
|
+
#![deny(unsafe_code)]
|
|
36
|
+
|
|
37
|
+
pub mod cache;
|
|
38
|
+
pub mod core;
|
|
39
|
+
pub mod error;
|
|
40
|
+
pub mod extraction;
|
|
41
|
+
pub mod extractors;
|
|
42
|
+
pub mod panic_context;
|
|
43
|
+
pub mod plugins;
|
|
44
|
+
pub mod text;
|
|
45
|
+
pub mod types;
|
|
46
|
+
|
|
47
|
+
#[cfg(feature = "quality")]
|
|
48
|
+
pub mod utils;
|
|
49
|
+
|
|
50
|
+
#[cfg(feature = "api")]
|
|
51
|
+
pub mod api;
|
|
52
|
+
|
|
53
|
+
#[cfg(feature = "mcp")]
|
|
54
|
+
pub mod mcp;
|
|
55
|
+
|
|
56
|
+
#[cfg(feature = "chunking")]
|
|
57
|
+
pub mod chunking;
|
|
58
|
+
|
|
59
|
+
#[cfg(feature = "embeddings")]
|
|
60
|
+
pub mod embeddings;
|
|
61
|
+
|
|
62
|
+
#[cfg(feature = "ocr")]
|
|
63
|
+
pub mod image;
|
|
64
|
+
|
|
65
|
+
#[cfg(feature = "language-detection")]
|
|
66
|
+
pub mod language_detection;
|
|
67
|
+
|
|
68
|
+
#[cfg(feature = "stopwords")]
|
|
69
|
+
pub mod stopwords;
|
|
70
|
+
|
|
71
|
+
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
72
|
+
pub mod keywords;
|
|
73
|
+
|
|
74
|
+
#[cfg(feature = "ocr")]
|
|
75
|
+
pub mod ocr;
|
|
76
|
+
|
|
77
|
+
#[cfg(feature = "pdf")]
|
|
78
|
+
pub mod pdf;
|
|
79
|
+
|
|
80
|
+
pub use error::{KreuzbergError, Result};
|
|
81
|
+
pub use types::*;
|
|
82
|
+
|
|
83
|
+
#[cfg(feature = "tokio-runtime")]
|
|
84
|
+
pub use core::extractor::{batch_extract_bytes, batch_extract_file};
|
|
85
|
+
pub use core::extractor::{extract_bytes, extract_file};
|
|
86
|
+
|
|
87
|
+
// Available in WASM (bytes-based)
|
|
88
|
+
pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
|
|
89
|
+
|
|
90
|
+
// Only available with filesystem access
|
|
91
|
+
#[cfg(feature = "tokio-runtime")]
|
|
92
|
+
pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
|
|
93
|
+
|
|
94
|
+
pub use core::config::{
|
|
95
|
+
ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
|
|
96
|
+
LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
|
|
97
|
+
};
|
|
98
|
+
|
|
99
|
+
#[cfg(feature = "pdf")]
|
|
100
|
+
pub use core::config::PdfConfig;
|
|
101
|
+
|
|
102
|
+
pub use core::mime::{
|
|
103
|
+
DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,
|
|
104
|
+
PLAIN_TEXT_MIME_TYPE, POWER_POINT_MIME_TYPE, XML_MIME_TYPE, detect_mime_type, detect_mime_type_from_bytes,
|
|
105
|
+
detect_or_validate, get_extensions_for_mime, validate_mime_type,
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
pub use plugins::registry::{
|
|
109
|
+
get_document_extractor_registry, get_ocr_backend_registry, get_post_processor_registry, get_validator_registry,
|
|
110
|
+
};
|
|
111
|
+
|
|
112
|
+
#[cfg(feature = "embeddings")]
|
|
113
|
+
pub use embeddings::{EMBEDDING_PRESETS, EmbeddingPreset, get_preset, list_presets};
|