kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -1,230 +1,230 @@
|
|
|
1
|
-
# Kreuzberg
|
|
2
|
-
|
|
3
|
-
[crates.io](https://crates.io/crates/kreuzberg)
|
|
4
|
-
[PyPI](https://pypi.org/project/kreuzberg/)
|
|
5
|
-
[npm (@kreuzberg/node)](https://www.npmjs.com/package/@kreuzberg/node)
|
|
6
|
-
[npm (@kreuzberg/wasm)](https://www.npmjs.com/package/@kreuzberg/wasm)
|
|
7
|
-
[RubyGems](https://rubygems.org/gems/kreuzberg)
|
|
8
|
-
[Maven Central](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
|
|
9
|
-
[Go package](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
|
|
10
|
-
[NuGet](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
|
|
11
|
-
|
|
12
|
-
[MIT License](https://opensource.org/licenses/MIT)
|
|
13
|
-
[Documentation](https://kreuzberg.dev/)
|
|
14
|
-
[Discord](https://discord.gg/pXxagNK2zN)
|
|
15
|
-
|
|
16
|
-
High-performance document intelligence library for Rust. Extract text, metadata, and structured information from PDFs, Office documents, images, and more — 56 formats in total.
|
|
17
|
-
|
|
18
|
-
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
|
-
|
|
20
|
-
> **🚀 Version 4.0.0 Release Candidate**
|
|
21
|
-
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
|
-
>
|
|
23
|
-
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
24
|
-
|
|
25
|
-
## Installation
|
|
26
|
-
|
|
27
|
-
```toml
|
|
28
|
-
[dependencies]
|
|
29
|
-
kreuzberg = "4.0"
|
|
30
|
-
tokio = { version = "1", features = ["rt", "macros"] }
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
## System Requirements
|
|
34
|
-
|
|
35
|
-
### ONNX Runtime (for embeddings)
|
|
36
|
-
|
|
37
|
-
If using embeddings functionality, ONNX Runtime must be installed:
|
|
38
|
-
|
|
39
|
-
```bash
|
|
40
|
-
# macOS
|
|
41
|
-
brew install onnxruntime
|
|
42
|
-
|
|
43
|
-
# Ubuntu/Debian
|
|
44
|
-
sudo apt install libonnxruntime libonnxruntime-dev
|
|
45
|
-
|
|
46
|
-
# Windows (MSVC)
|
|
47
|
-
scoop install onnxruntime
|
|
48
|
-
# OR download from https://github.com/microsoft/onnxruntime/releases
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
Without ONNX Runtime, embeddings will raise `MissingDependencyError` with installation instructions.
|
|
52
|
-
|
|
53
|
-
## Quick Start
|
|
54
|
-
|
|
55
|
-
```rust
|
|
56
|
-
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
|
57
|
-
|
|
58
|
-
fn main() -> kreuzberg::Result<()> {
|
|
59
|
-
let config = ExtractionConfig::default();
|
|
60
|
-
let result = extract_file_sync("document.pdf", None, &config)?;
|
|
61
|
-
println!("{}", result.content);
|
|
62
|
-
Ok(())
|
|
63
|
-
}
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
### Async Extraction
|
|
67
|
-
|
|
68
|
-
```rust
|
|
69
|
-
use kreuzberg::{extract_file, ExtractionConfig};
|
|
70
|
-
|
|
71
|
-
#[tokio::main]
|
|
72
|
-
async fn main() -> kreuzberg::Result<()> {
|
|
73
|
-
let config = ExtractionConfig::default();
|
|
74
|
-
let result = extract_file("document.pdf", None, &config).await?;
|
|
75
|
-
println!("{}", result.content);
|
|
76
|
-
Ok(())
|
|
77
|
-
}
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
### Batch Processing
|
|
81
|
-
|
|
82
|
-
```rust
|
|
83
|
-
use kreuzberg::{batch_extract_file, ExtractionConfig};
|
|
84
|
-
|
|
85
|
-
#[tokio::main]
|
|
86
|
-
async fn main() -> kreuzberg::Result<()> {
|
|
87
|
-
let config = ExtractionConfig::default();
|
|
88
|
-
let files = vec!["doc1.pdf", "doc2.pdf", "doc3.pdf"];
|
|
89
|
-
let results = batch_extract_file(&files, None, &config).await?;
|
|
90
|
-
|
|
91
|
-
for result in results {
|
|
92
|
-
println!("{}", result.content);
|
|
93
|
-
}
|
|
94
|
-
Ok(())
|
|
95
|
-
}
|
|
96
|
-
```
|
|
97
|
-
|
|
98
|
-
## OCR with Table Extraction
|
|
99
|
-
|
|
100
|
-
```rust
|
|
101
|
-
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig, TesseractConfig};
|
|
102
|
-
|
|
103
|
-
fn main() -> kreuzberg::Result<()> {
|
|
104
|
-
let config = ExtractionConfig {
|
|
105
|
-
ocr: Some(OcrConfig {
|
|
106
|
-
backend: "tesseract".to_string(),
|
|
107
|
-
language: "eng".to_string(),
|
|
108
|
-
tesseract_config: Some(TesseractConfig {
|
|
109
|
-
enable_table_detection: true,
|
|
110
|
-
..Default::default()
|
|
111
|
-
}),
|
|
112
|
-
}),
|
|
113
|
-
..Default::default()
|
|
114
|
-
};
|
|
115
|
-
|
|
116
|
-
let result = extract_file_sync("invoice.pdf", None, &config)?;
|
|
117
|
-
|
|
118
|
-
for table in &result.tables {
|
|
119
|
-
println!("{}", table.markdown);
|
|
120
|
-
}
|
|
121
|
-
Ok(())
|
|
122
|
-
}
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
## Password-Protected PDFs
|
|
126
|
-
|
|
127
|
-
```rust
|
|
128
|
-
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig};
|
|
129
|
-
|
|
130
|
-
fn main() -> kreuzberg::Result<()> {
|
|
131
|
-
let config = ExtractionConfig {
|
|
132
|
-
pdf_options: Some(PdfConfig {
|
|
133
|
-
passwords: Some(vec!["password1".to_string(), "password2".to_string()]),
|
|
134
|
-
..Default::default()
|
|
135
|
-
}),
|
|
136
|
-
..Default::default()
|
|
137
|
-
};
|
|
138
|
-
|
|
139
|
-
let result = extract_file_sync("protected.pdf", None, &config)?;
|
|
140
|
-
Ok(())
|
|
141
|
-
}
|
|
142
|
-
```
|
|
143
|
-
|
|
144
|
-
## Extract from Bytes
|
|
145
|
-
|
|
146
|
-
```rust
|
|
147
|
-
use kreuzberg::{extract_bytes_sync, ExtractionConfig};
|
|
148
|
-
use std::fs;
|
|
149
|
-
|
|
150
|
-
fn main() -> kreuzberg::Result<()> {
|
|
151
|
-
let data = fs::read("document.pdf")?;
|
|
152
|
-
let config = ExtractionConfig::default();
|
|
153
|
-
let result = extract_bytes_sync(&data, "application/pdf", &config)?;
|
|
154
|
-
println!("{}", result.content);
|
|
155
|
-
Ok(())
|
|
156
|
-
}
|
|
157
|
-
```
|
|
158
|
-
|
|
159
|
-
## Features
|
|
160
|
-
|
|
161
|
-
The crate uses feature flags for optional functionality:
|
|
162
|
-
|
|
163
|
-
```toml
|
|
164
|
-
[dependencies]
|
|
165
|
-
kreuzberg = { version = "4.0", features = ["pdf", "excel", "ocr"] }
|
|
166
|
-
```
|
|
167
|
-
|
|
168
|
-
### Available Features
|
|
169
|
-
|
|
170
|
-
| Feature | Description | Binary Size |
|
|
171
|
-
|---------|-------------|-------------|
|
|
172
|
-
| `pdf` | PDF extraction via pdfium | +25MB |
|
|
173
|
-
| `excel` | Excel/spreadsheet parsing | +3MB |
|
|
174
|
-
| `office` | DOCX, PPTX extraction | +1MB |
|
|
175
|
-
| `email` | EML, MSG extraction | +500KB |
|
|
176
|
-
| `html` | HTML to markdown | +1MB |
|
|
177
|
-
| `xml` | XML streaming parser | +500KB |
|
|
178
|
-
| `archives` | ZIP, TAR, 7Z extraction | +2MB |
|
|
179
|
-
| `ocr` | OCR with Tesseract | +5MB |
|
|
180
|
-
| `language-detection` | Language detection | +100KB |
|
|
181
|
-
| `chunking` | Text chunking | +200KB |
|
|
182
|
-
| `quality` | Text quality processing | +500KB |
|
|
183
|
-
|
|
184
|
-
### Feature Bundles
|
|
185
|
-
|
|
186
|
-
```toml
|
|
187
|
-
kreuzberg = { version = "4.0", features = ["full"] }
|
|
188
|
-
kreuzberg = { version = "4.0", features = ["server"] }
|
|
189
|
-
kreuzberg = { version = "4.0", features = ["cli"] }
|
|
190
|
-
```
|
|
191
|
-
|
|
192
|
-
## PDF Support and Linking Options
|
|
193
|
-
|
|
194
|
-
Kreuzberg supports three PDFium linking strategies. **The default is `bundled-pdfium`** (best developer experience).
|
|
195
|
-
|
|
196
|
-
| Strategy | Feature | Use Case | Binary Size | Runtime Deps |
|
|
197
|
-
|----------|---------|----------|-------------|--------------|
|
|
198
|
-
| **Bundled (default)** | `bundled-pdfium` | Development, production | +8-15MB | None |
|
|
199
|
-
| **Static** | `static-pdfium` | Docker, musl, standalone binaries | +200MB | None |
|
|
200
|
-
| **System** | `system-pdfium` | Package managers, distros | +2MB | libpdfium.so |
|
|
201
|
-
|
|
202
|
-
### Quick Start
|
|
203
|
-
|
|
204
|
-
```toml
|
|
205
|
-
# Default - bundled PDFium (recommended)
|
|
206
|
-
[dependencies]
|
|
207
|
-
kreuzberg = "4.0"
|
|
208
|
-
|
|
209
|
-
# Static linking (Docker, musl)
|
|
210
|
-
[dependencies]
|
|
211
|
-
kreuzberg = { version = "4.0", features = ["static-pdfium"] }
|
|
212
|
-
|
|
213
|
-
# System PDFium (package managers)
|
|
214
|
-
[dependencies]
|
|
215
|
-
kreuzberg = { version = "4.0", features = ["system-pdfium"] }
|
|
216
|
-
```
|
|
217
|
-
|
|
218
|
-
For detailed information, see the [PDFium Linking Guide](../../docs/guides/pdfium-linking.md).
|
|
219
|
-
|
|
220
|
-
**Note:** Language bindings (Python, TypeScript, Ruby, Java, Go) automatically bundle PDFium. No configuration needed.
|
|
221
|
-
|
|
222
|
-
## Documentation
|
|
223
|
-
|
|
224
|
-
**[API Documentation](https://docs.rs/kreuzberg)** – Complete API reference with examples
|
|
225
|
-
|
|
226
|
-
**[https://kreuzberg.dev](https://kreuzberg.dev)** – User guide and tutorials
|
|
227
|
-
|
|
228
|
-
## License
|
|
229
|
-
|
|
230
|
-
MIT License - see [LICENSE](../../LICENSE) for details.
|
|
1
|
+
# Kreuzberg
|
|
2
|
+
|
|
3
|
+
[crates.io](https://crates.io/crates/kreuzberg)
|
|
4
|
+
[PyPI](https://pypi.org/project/kreuzberg/)
|
|
5
|
+
[npm (@kreuzberg/node)](https://www.npmjs.com/package/@kreuzberg/node)
|
|
6
|
+
[npm (@kreuzberg/wasm)](https://www.npmjs.com/package/@kreuzberg/wasm)
|
|
7
|
+
[RubyGems](https://rubygems.org/gems/kreuzberg)
|
|
8
|
+
[Maven Central](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
|
|
9
|
+
[Go package](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
|
|
10
|
+
[NuGet](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
|
|
11
|
+
|
|
12
|
+
[MIT License](https://opensource.org/licenses/MIT)
|
|
13
|
+
[Documentation](https://kreuzberg.dev/)
|
|
14
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
|
15
|
+
|
|
16
|
+
High-performance document intelligence library for Rust. Extract text, metadata, and structured information from PDFs, Office documents, images, and more — 56 formats in total.
|
|
17
|
+
|
|
18
|
+
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
|
+
|
|
20
|
+
> **🚀 Version 4.0.0 Release Candidate**
|
|
21
|
+
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
|
+
>
|
|
23
|
+
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```toml
|
|
28
|
+
[dependencies]
|
|
29
|
+
kreuzberg = "4.0"
|
|
30
|
+
tokio = { version = "1", features = ["rt", "macros"] }
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## System Requirements
|
|
34
|
+
|
|
35
|
+
### ONNX Runtime (for embeddings)
|
|
36
|
+
|
|
37
|
+
If using embeddings functionality, ONNX Runtime must be installed:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# macOS
|
|
41
|
+
brew install onnxruntime
|
|
42
|
+
|
|
43
|
+
# Ubuntu/Debian
|
|
44
|
+
sudo apt install libonnxruntime libonnxruntime-dev
|
|
45
|
+
|
|
46
|
+
# Windows (MSVC)
|
|
47
|
+
scoop install onnxruntime
|
|
48
|
+
# OR download from https://github.com/microsoft/onnxruntime/releases
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Without ONNX Runtime, embeddings will raise `MissingDependencyError` with installation instructions.
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
```rust
|
|
56
|
+
use kreuzberg::{extract_file_sync, ExtractionConfig};
|
|
57
|
+
|
|
58
|
+
fn main() -> kreuzberg::Result<()> {
|
|
59
|
+
let config = ExtractionConfig::default();
|
|
60
|
+
let result = extract_file_sync("document.pdf", None, &config)?;
|
|
61
|
+
println!("{}", result.content);
|
|
62
|
+
Ok(())
|
|
63
|
+
}
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Async Extraction
|
|
67
|
+
|
|
68
|
+
```rust
|
|
69
|
+
use kreuzberg::{extract_file, ExtractionConfig};
|
|
70
|
+
|
|
71
|
+
#[tokio::main]
|
|
72
|
+
async fn main() -> kreuzberg::Result<()> {
|
|
73
|
+
let config = ExtractionConfig::default();
|
|
74
|
+
let result = extract_file("document.pdf", None, &config).await?;
|
|
75
|
+
println!("{}", result.content);
|
|
76
|
+
Ok(())
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Batch Processing
|
|
81
|
+
|
|
82
|
+
```rust
|
|
83
|
+
use kreuzberg::{batch_extract_file, ExtractionConfig};
|
|
84
|
+
|
|
85
|
+
#[tokio::main]
|
|
86
|
+
async fn main() -> kreuzberg::Result<()> {
|
|
87
|
+
let config = ExtractionConfig::default();
|
|
88
|
+
let files = vec!["doc1.pdf", "doc2.pdf", "doc3.pdf"];
|
|
89
|
+
let results = batch_extract_file(&files, None, &config).await?;
|
|
90
|
+
|
|
91
|
+
for result in results {
|
|
92
|
+
println!("{}", result.content);
|
|
93
|
+
}
|
|
94
|
+
Ok(())
|
|
95
|
+
}
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## OCR with Table Extraction
|
|
99
|
+
|
|
100
|
+
```rust
|
|
101
|
+
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig, TesseractConfig};
|
|
102
|
+
|
|
103
|
+
fn main() -> kreuzberg::Result<()> {
|
|
104
|
+
let config = ExtractionConfig {
|
|
105
|
+
ocr: Some(OcrConfig {
|
|
106
|
+
backend: "tesseract".to_string(),
|
|
107
|
+
language: "eng".to_string(),
|
|
108
|
+
tesseract_config: Some(TesseractConfig {
|
|
109
|
+
enable_table_detection: true,
|
|
110
|
+
..Default::default()
|
|
111
|
+
}),
|
|
112
|
+
}),
|
|
113
|
+
..Default::default()
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
let result = extract_file_sync("invoice.pdf", None, &config)?;
|
|
117
|
+
|
|
118
|
+
for table in &result.tables {
|
|
119
|
+
println!("{}", table.markdown);
|
|
120
|
+
}
|
|
121
|
+
Ok(())
|
|
122
|
+
}
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Password-Protected PDFs
|
|
126
|
+
|
|
127
|
+
```rust
|
|
128
|
+
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig};
|
|
129
|
+
|
|
130
|
+
fn main() -> kreuzberg::Result<()> {
|
|
131
|
+
let config = ExtractionConfig {
|
|
132
|
+
pdf_options: Some(PdfConfig {
|
|
133
|
+
passwords: Some(vec!["password1".to_string(), "password2".to_string()]),
|
|
134
|
+
..Default::default()
|
|
135
|
+
}),
|
|
136
|
+
..Default::default()
|
|
137
|
+
};
|
|
138
|
+
|
|
139
|
+
let result = extract_file_sync("protected.pdf", None, &config)?;
|
|
140
|
+
Ok(())
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Extract from Bytes
|
|
145
|
+
|
|
146
|
+
```rust
|
|
147
|
+
use kreuzberg::{extract_bytes_sync, ExtractionConfig};
|
|
148
|
+
use std::fs;
|
|
149
|
+
|
|
150
|
+
fn main() -> kreuzberg::Result<()> {
|
|
151
|
+
let data = fs::read("document.pdf")?;
|
|
152
|
+
let config = ExtractionConfig::default();
|
|
153
|
+
let result = extract_bytes_sync(&data, "application/pdf", &config)?;
|
|
154
|
+
println!("{}", result.content);
|
|
155
|
+
Ok(())
|
|
156
|
+
}
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Features
|
|
160
|
+
|
|
161
|
+
The crate uses feature flags for optional functionality:
|
|
162
|
+
|
|
163
|
+
```toml
|
|
164
|
+
[dependencies]
|
|
165
|
+
kreuzberg = { version = "4.0", features = ["pdf", "excel", "ocr"] }
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Available Features
|
|
169
|
+
|
|
170
|
+
| Feature | Description | Binary Size |
|
|
171
|
+
|---------|-------------|-------------|
|
|
172
|
+
| `pdf` | PDF extraction via pdfium | +25MB |
|
|
173
|
+
| `excel` | Excel/spreadsheet parsing | +3MB |
|
|
174
|
+
| `office` | DOCX, PPTX extraction | +1MB |
|
|
175
|
+
| `email` | EML, MSG extraction | +500KB |
|
|
176
|
+
| `html` | HTML to markdown | +1MB |
|
|
177
|
+
| `xml` | XML streaming parser | +500KB |
|
|
178
|
+
| `archives` | ZIP, TAR, 7Z extraction | +2MB |
|
|
179
|
+
| `ocr` | OCR with Tesseract | +5MB |
|
|
180
|
+
| `language-detection` | Language detection | +100KB |
|
|
181
|
+
| `chunking` | Text chunking | +200KB |
|
|
182
|
+
| `quality` | Text quality processing | +500KB |
|
|
183
|
+
|
|
184
|
+
### Feature Bundles
|
|
185
|
+
|
|
186
|
+
```toml
|
|
187
|
+
kreuzberg = { version = "4.0", features = ["full"] }
|
|
188
|
+
kreuzberg = { version = "4.0", features = ["server"] }
|
|
189
|
+
kreuzberg = { version = "4.0", features = ["cli"] }
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## PDF Support and Linking Options
|
|
193
|
+
|
|
194
|
+
Kreuzberg supports three PDFium linking strategies. **The default is `bundled-pdfium`** (best developer experience).
|
|
195
|
+
|
|
196
|
+
| Strategy | Feature | Use Case | Binary Size | Runtime Deps |
|
|
197
|
+
|----------|---------|----------|-------------|--------------|
|
|
198
|
+
| **Bundled (default)** | `bundled-pdfium` | Development, production | +8-15MB | None |
|
|
199
|
+
| **Static** | `static-pdfium` | Docker, musl, standalone binaries | +200MB | None |
|
|
200
|
+
| **System** | `system-pdfium` | Package managers, distros | +2MB | libpdfium.so |
|
|
201
|
+
|
|
202
|
+
### Quick Start
|
|
203
|
+
|
|
204
|
+
```toml
|
|
205
|
+
# Default - bundled PDFium (recommended)
|
|
206
|
+
[dependencies]
|
|
207
|
+
kreuzberg = "4.0"
|
|
208
|
+
|
|
209
|
+
# Static linking (Docker, musl)
|
|
210
|
+
[dependencies]
|
|
211
|
+
kreuzberg = { version = "4.0", features = ["static-pdfium"] }
|
|
212
|
+
|
|
213
|
+
# System PDFium (package managers)
|
|
214
|
+
[dependencies]
|
|
215
|
+
kreuzberg = { version = "4.0", features = ["system-pdfium"] }
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
For detailed information, see the [PDFium Linking Guide](../../docs/guides/pdfium-linking.md).
|
|
219
|
+
|
|
220
|
+
**Note:** Language bindings (Python, TypeScript, Ruby, Java, Go) automatically bundle PDFium. No configuration needed.
|
|
221
|
+
|
|
222
|
+
## Documentation
|
|
223
|
+
|
|
224
|
+
**[API Documentation](https://docs.rs/kreuzberg)** – Complete API reference with examples
|
|
225
|
+
|
|
226
|
+
**[https://kreuzberg.dev](https://kreuzberg.dev)** – User guide and tutorials
|
|
227
|
+
|
|
228
|
+
## License
|
|
229
|
+
|
|
230
|
+
MIT License - see [LICENSE](../../LICENSE) for details.
|
|
@@ -1,48 +1,48 @@
|
|
|
1
|
-
use criterion::{Criterion, criterion_group, criterion_main};
|
|
2
|
-
use std::hint::black_box;
|
|
3
|
-
|
|
4
|
-
fn bench_text_extraction(c: &mut Criterion) {
|
|
5
|
-
let runtime = tokio::runtime::Runtime::new().unwrap();
|
|
6
|
-
|
|
7
|
-
c.bench_function("extract_text_no_otel", |b| {
|
|
8
|
-
b.iter(|| {
|
|
9
|
-
runtime.block_on(async {
|
|
10
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
11
|
-
use kreuzberg::core::extractor::extract_bytes;
|
|
12
|
-
|
|
13
|
-
let test_content = black_box(b"Hello, World! This is a test document.");
|
|
14
|
-
let config = ExtractionConfig::default();
|
|
15
|
-
|
|
16
|
-
extract_bytes(test_content, "text/plain", &config).await
|
|
17
|
-
})
|
|
18
|
-
});
|
|
19
|
-
});
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
fn bench_cache_operations(c: &mut Criterion) {
|
|
23
|
-
use kreuzberg::cache::GenericCache;
|
|
24
|
-
use tempfile::tempdir;
|
|
25
|
-
|
|
26
|
-
let temp_dir = tempdir().unwrap();
|
|
27
|
-
let cache = GenericCache::new(
|
|
28
|
-
"bench".to_string(),
|
|
29
|
-
Some(temp_dir.path().to_str().unwrap().to_string()),
|
|
30
|
-
30.0,
|
|
31
|
-
500.0,
|
|
32
|
-
1000.0,
|
|
33
|
-
)
|
|
34
|
-
.unwrap();
|
|
35
|
-
|
|
36
|
-
c.bench_function("cache_set_get", |b| {
|
|
37
|
-
b.iter(|| {
|
|
38
|
-
let key = black_box("bench_key");
|
|
39
|
-
let data = black_box(b"benchmark data".to_vec());
|
|
40
|
-
|
|
41
|
-
cache.set(key, data.clone(), None).unwrap();
|
|
42
|
-
cache.get(key, None).unwrap()
|
|
43
|
-
});
|
|
44
|
-
});
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
criterion_group!(benches, bench_text_extraction, bench_cache_operations);
|
|
48
|
-
criterion_main!(benches);
|
|
1
|
+
use criterion::{Criterion, criterion_group, criterion_main};
|
|
2
|
+
use std::hint::black_box;
|
|
3
|
+
|
|
4
|
+
fn bench_text_extraction(c: &mut Criterion) {
|
|
5
|
+
let runtime = tokio::runtime::Runtime::new().unwrap();
|
|
6
|
+
|
|
7
|
+
c.bench_function("extract_text_no_otel", |b| {
|
|
8
|
+
b.iter(|| {
|
|
9
|
+
runtime.block_on(async {
|
|
10
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
11
|
+
use kreuzberg::core::extractor::extract_bytes;
|
|
12
|
+
|
|
13
|
+
let test_content = black_box(b"Hello, World! This is a test document.");
|
|
14
|
+
let config = ExtractionConfig::default();
|
|
15
|
+
|
|
16
|
+
extract_bytes(test_content, "text/plain", &config).await
|
|
17
|
+
})
|
|
18
|
+
});
|
|
19
|
+
});
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
fn bench_cache_operations(c: &mut Criterion) {
|
|
23
|
+
use kreuzberg::cache::GenericCache;
|
|
24
|
+
use tempfile::tempdir;
|
|
25
|
+
|
|
26
|
+
let temp_dir = tempdir().unwrap();
|
|
27
|
+
let cache = GenericCache::new(
|
|
28
|
+
"bench".to_string(),
|
|
29
|
+
Some(temp_dir.path().to_str().unwrap().to_string()),
|
|
30
|
+
30.0,
|
|
31
|
+
500.0,
|
|
32
|
+
1000.0,
|
|
33
|
+
)
|
|
34
|
+
.unwrap();
|
|
35
|
+
|
|
36
|
+
c.bench_function("cache_set_get", |b| {
|
|
37
|
+
b.iter(|| {
|
|
38
|
+
let key = black_box("bench_key");
|
|
39
|
+
let data = black_box(b"benchmark data".to_vec());
|
|
40
|
+
|
|
41
|
+
cache.set(key, data.clone(), None).unwrap();
|
|
42
|
+
cache.get(key, None).unwrap()
|
|
43
|
+
});
|
|
44
|
+
});
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
criterion_group!(benches, bench_text_extraction, bench_cache_operations);
|
|
48
|
+
criterion_main!(benches);
|