kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +104 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +52 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.so} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +887 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +87 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +634 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +81 -22
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -1,414 +1,414 @@
|
|
|
1
|
-
//! CSV and spreadsheet integration tests.
|
|
2
|
-
//!
|
|
3
|
-
//! Tests for CSV and TSV extraction.
|
|
4
|
-
//! Validates data extraction, custom delimiters, quoted fields, and edge cases.
|
|
5
|
-
|
|
6
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
-
use kreuzberg::core::extractor::extract_bytes;
|
|
8
|
-
|
|
9
|
-
mod helpers;
|
|
10
|
-
|
|
11
|
-
/// Test basic CSV extraction - simple comma-separated values.
|
|
12
|
-
#[tokio::test]
|
|
13
|
-
async fn test_csv_basic_extraction() {
|
|
14
|
-
let config = ExtractionConfig::default();
|
|
15
|
-
|
|
16
|
-
let csv_content = b"Name,Age,City\nAlice,30,NYC\nBob,25,LA";
|
|
17
|
-
|
|
18
|
-
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
19
|
-
Ok(result) => result,
|
|
20
|
-
Err(_) => {
|
|
21
|
-
println!("Skipping test: CSV extraction not available");
|
|
22
|
-
return;
|
|
23
|
-
}
|
|
24
|
-
};
|
|
25
|
-
|
|
26
|
-
assert_eq!(extraction.mime_type, "text/csv");
|
|
27
|
-
assert!(
|
|
28
|
-
extraction.chunks.is_none(),
|
|
29
|
-
"Chunks should be None without chunking config"
|
|
30
|
-
);
|
|
31
|
-
assert!(
|
|
32
|
-
extraction.detected_languages.is_none(),
|
|
33
|
-
"Language detection not enabled"
|
|
34
|
-
);
|
|
35
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
36
|
-
|
|
37
|
-
assert!(extraction.content.contains("Name"), "Should contain 'Name' header");
|
|
38
|
-
assert!(extraction.content.contains("Age"), "Should contain 'Age' header");
|
|
39
|
-
assert!(extraction.content.contains("City"), "Should contain 'City' header");
|
|
40
|
-
|
|
41
|
-
assert!(extraction.content.contains("Alice"), "Should contain Alice row");
|
|
42
|
-
assert!(extraction.content.contains("30"), "Should contain Alice's age");
|
|
43
|
-
assert!(extraction.content.contains("NYC"), "Should contain Alice's city");
|
|
44
|
-
|
|
45
|
-
assert!(extraction.content.contains("Bob"), "Should contain Bob row");
|
|
46
|
-
assert!(extraction.content.contains("25"), "Should contain Bob's age");
|
|
47
|
-
assert!(extraction.content.contains("LA"), "Should contain Bob's city");
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
/// Test CSV with headers - first row as headers.
|
|
51
|
-
#[tokio::test]
|
|
52
|
-
async fn test_csv_with_headers() {
|
|
53
|
-
let config = ExtractionConfig::default();
|
|
54
|
-
|
|
55
|
-
let csv_content = b"Product,Price,Quantity\nApple,1.50,100\nBanana,0.75,200\nOrange,2.00,150";
|
|
56
|
-
|
|
57
|
-
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
58
|
-
Ok(result) => result,
|
|
59
|
-
Err(_) => {
|
|
60
|
-
println!("Skipping test: CSV extraction not available");
|
|
61
|
-
return;
|
|
62
|
-
}
|
|
63
|
-
};
|
|
64
|
-
|
|
65
|
-
assert!(
|
|
66
|
-
extraction.chunks.is_none(),
|
|
67
|
-
"Chunks should be None without chunking config"
|
|
68
|
-
);
|
|
69
|
-
assert!(
|
|
70
|
-
extraction.detected_languages.is_none(),
|
|
71
|
-
"Language detection not enabled"
|
|
72
|
-
);
|
|
73
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
74
|
-
|
|
75
|
-
assert!(extraction.content.contains("Product"), "Should contain Product header");
|
|
76
|
-
assert!(extraction.content.contains("Price"), "Should contain Price header");
|
|
77
|
-
assert!(
|
|
78
|
-
extraction.content.contains("Quantity"),
|
|
79
|
-
"Should contain Quantity header"
|
|
80
|
-
);
|
|
81
|
-
|
|
82
|
-
assert!(
|
|
83
|
-
extraction.content.contains("Apple")
|
|
84
|
-
&& extraction.content.contains("1.50")
|
|
85
|
-
&& extraction.content.contains("100")
|
|
86
|
-
);
|
|
87
|
-
assert!(
|
|
88
|
-
extraction.content.contains("Banana")
|
|
89
|
-
&& extraction.content.contains("0.75")
|
|
90
|
-
&& extraction.content.contains("200")
|
|
91
|
-
);
|
|
92
|
-
assert!(
|
|
93
|
-
extraction.content.contains("Orange")
|
|
94
|
-
&& extraction.content.contains("2.00")
|
|
95
|
-
&& extraction.content.contains("150")
|
|
96
|
-
);
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
/// Test CSV with custom delimiter - tab and semicolon.
|
|
100
|
-
#[tokio::test]
|
|
101
|
-
async fn test_csv_custom_delimiter() {
|
|
102
|
-
let config = ExtractionConfig::default();
|
|
103
|
-
|
|
104
|
-
let csv_content = b"Name;Age;City\nAlice;30;NYC\nBob;25;LA";
|
|
105
|
-
|
|
106
|
-
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
107
|
-
Ok(result) => result,
|
|
108
|
-
Err(_) => {
|
|
109
|
-
println!("Skipping test: CSV extraction not available");
|
|
110
|
-
return;
|
|
111
|
-
}
|
|
112
|
-
};
|
|
113
|
-
|
|
114
|
-
assert!(
|
|
115
|
-
extraction.chunks.is_none(),
|
|
116
|
-
"Chunks should be None without chunking config"
|
|
117
|
-
);
|
|
118
|
-
assert!(
|
|
119
|
-
extraction.detected_languages.is_none(),
|
|
120
|
-
"Language detection not enabled"
|
|
121
|
-
);
|
|
122
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
123
|
-
|
|
124
|
-
assert!(!extraction.content.is_empty(), "Content should be extracted");
|
|
125
|
-
|
|
126
|
-
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
|
127
|
-
assert!(extraction.content.contains("30"), "Should contain age");
|
|
128
|
-
assert!(extraction.content.contains("NYC"), "Should contain city");
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
/// Test TSV (Tab-Separated Values) file.
|
|
132
|
-
#[tokio::test]
|
|
133
|
-
async fn test_tsv_file() {
|
|
134
|
-
let config = ExtractionConfig::default();
|
|
135
|
-
|
|
136
|
-
let tsv_content = b"Name\tAge\tCity\nAlice\t30\tNYC\nBob\t25\tLA";
|
|
137
|
-
|
|
138
|
-
let extraction = match extract_bytes(tsv_content, "text/tab-separated-values", &config).await {
|
|
139
|
-
Ok(result) => result,
|
|
140
|
-
Err(_) => {
|
|
141
|
-
println!("Skipping test: TSV extraction not available");
|
|
142
|
-
return;
|
|
143
|
-
}
|
|
144
|
-
};
|
|
145
|
-
|
|
146
|
-
assert_eq!(extraction.mime_type, "text/tab-separated-values");
|
|
147
|
-
assert!(
|
|
148
|
-
extraction.chunks.is_none(),
|
|
149
|
-
"Chunks should be None without chunking config"
|
|
150
|
-
);
|
|
151
|
-
assert!(
|
|
152
|
-
extraction.detected_languages.is_none(),
|
|
153
|
-
"Language detection not enabled"
|
|
154
|
-
);
|
|
155
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
156
|
-
|
|
157
|
-
assert!(extraction.content.contains("Name"), "Should contain Name header");
|
|
158
|
-
assert!(extraction.content.contains("Age"), "Should contain Age header");
|
|
159
|
-
assert!(extraction.content.contains("City"), "Should contain City header");
|
|
160
|
-
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
|
161
|
-
assert!(extraction.content.contains("Bob"), "Should contain Bob");
|
|
162
|
-
assert!(extraction.content.contains("30") && extraction.content.contains("NYC"));
|
|
163
|
-
assert!(extraction.content.contains("25") && extraction.content.contains("LA"));
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
/// Test CSV with quoted fields - fields containing commas.
|
|
167
|
-
#[tokio::test]
|
|
168
|
-
async fn test_csv_quoted_fields() {
|
|
169
|
-
let config = ExtractionConfig::default();
|
|
170
|
-
|
|
171
|
-
let csv_content =
|
|
172
|
-
b"Name,Description,Price\n\"Smith, John\",\"Product A, premium\",100\n\"Doe, Jane\",\"Product B, standard\",50";
|
|
173
|
-
|
|
174
|
-
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
175
|
-
Ok(result) => result,
|
|
176
|
-
Err(_) => {
|
|
177
|
-
println!("Skipping test: CSV extraction not available");
|
|
178
|
-
return;
|
|
179
|
-
}
|
|
180
|
-
};
|
|
181
|
-
|
|
182
|
-
assert!(
|
|
183
|
-
extraction.chunks.is_none(),
|
|
184
|
-
"Chunks should be None without chunking config"
|
|
185
|
-
);
|
|
186
|
-
assert!(
|
|
187
|
-
extraction.detected_languages.is_none(),
|
|
188
|
-
"Language detection not enabled"
|
|
189
|
-
);
|
|
190
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
191
|
-
|
|
192
|
-
assert!(extraction.content.contains("Smith"), "Should contain Smith");
|
|
193
|
-
assert!(extraction.content.contains("John"), "Should contain John");
|
|
194
|
-
assert!(extraction.content.contains("Doe"), "Should contain Doe");
|
|
195
|
-
assert!(extraction.content.contains("Jane"), "Should contain Jane");
|
|
196
|
-
|
|
197
|
-
assert!(extraction.content.contains("Product A") || extraction.content.contains("premium"));
|
|
198
|
-
assert!(extraction.content.contains("Product B") || extraction.content.contains("standard"));
|
|
199
|
-
|
|
200
|
-
assert!(extraction.content.contains("100") && extraction.content.contains("50"));
|
|
201
|
-
}
|
|
202
|
-
|
|
203
|
-
/// Test CSV with special characters - Unicode, newlines in fields.
|
|
204
|
-
#[tokio::test]
|
|
205
|
-
async fn test_csv_special_characters() {
|
|
206
|
-
let config = ExtractionConfig::default();
|
|
207
|
-
|
|
208
|
-
let csv_content = "Name,City,Emoji\nAlice,Tokyo 東京,🎉\nBob,París,✅\nCarlos,Москва,🌍".as_bytes();
|
|
209
|
-
|
|
210
|
-
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
211
|
-
Ok(result) => result,
|
|
212
|
-
Err(_) => {
|
|
213
|
-
println!("Skipping test: CSV extraction not available");
|
|
214
|
-
return;
|
|
215
|
-
}
|
|
216
|
-
};
|
|
217
|
-
|
|
218
|
-
assert!(
|
|
219
|
-
extraction.chunks.is_none(),
|
|
220
|
-
"Chunks should be None without chunking config"
|
|
221
|
-
);
|
|
222
|
-
assert!(
|
|
223
|
-
extraction.detected_languages.is_none(),
|
|
224
|
-
"Language detection not enabled"
|
|
225
|
-
);
|
|
226
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
227
|
-
|
|
228
|
-
assert!(!extraction.content.is_empty(), "Special characters should be handled");
|
|
229
|
-
|
|
230
|
-
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
|
231
|
-
assert!(extraction.content.contains("Bob"), "Should contain Bob");
|
|
232
|
-
assert!(extraction.content.contains("Carlos"), "Should contain Carlos");
|
|
233
|
-
|
|
234
|
-
assert!(extraction.content.contains("Tokyo") || extraction.content.contains("東京"));
|
|
235
|
-
assert!(extraction.content.contains("París") || extraction.content.contains("Paris"));
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
/// Test CSV with large file - 10,000+ rows (streaming).
|
|
239
|
-
#[tokio::test]
|
|
240
|
-
async fn test_csv_large_file() {
|
|
241
|
-
let config = ExtractionConfig::default();
|
|
242
|
-
|
|
243
|
-
let mut csv_content = "ID,Name,Value\n".to_string();
|
|
244
|
-
for i in 1..=10_000 {
|
|
245
|
-
csv_content.push_str(&format!("{},Item{},{}.00\n", i, i, i * 10));
|
|
246
|
-
}
|
|
247
|
-
|
|
248
|
-
let extraction = match extract_bytes(csv_content.as_bytes(), "text/csv", &config).await {
|
|
249
|
-
Ok(result) => result,
|
|
250
|
-
Err(_) => {
|
|
251
|
-
println!("Skipping test: CSV extraction not available");
|
|
252
|
-
return;
|
|
253
|
-
}
|
|
254
|
-
};
|
|
255
|
-
|
|
256
|
-
assert!(
|
|
257
|
-
extraction.chunks.is_none(),
|
|
258
|
-
"Chunks should be None without chunking config"
|
|
259
|
-
);
|
|
260
|
-
assert!(
|
|
261
|
-
extraction.detected_languages.is_none(),
|
|
262
|
-
"Language detection not enabled"
|
|
263
|
-
);
|
|
264
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
265
|
-
|
|
266
|
-
assert!(!extraction.content.is_empty(), "Large CSV should be processed");
|
|
267
|
-
|
|
268
|
-
assert!(
|
|
269
|
-
extraction.content.len() > 1000,
|
|
270
|
-
"Large CSV content should be substantial"
|
|
271
|
-
);
|
|
272
|
-
|
|
273
|
-
assert!(extraction.content.contains("Item1") || extraction.content.contains("10.00"));
|
|
274
|
-
|
|
275
|
-
assert!(extraction.content.contains("Item5000") || extraction.content.contains("50000.00"));
|
|
276
|
-
|
|
277
|
-
assert!(extraction.content.contains("Item10000") || extraction.content.contains("100000.00"));
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
/// Test malformed CSV - inconsistent columns.
|
|
281
|
-
#[tokio::test]
|
|
282
|
-
async fn test_csv_malformed() {
|
|
283
|
-
let config = ExtractionConfig::default();
|
|
284
|
-
|
|
285
|
-
let csv_content = b"Name,Age,City\nAlice,30\nBob,25,LA,Extra\nCarlos,35,SF";
|
|
286
|
-
|
|
287
|
-
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
288
|
-
|
|
289
|
-
assert!(
|
|
290
|
-
result.is_ok() || result.is_err(),
|
|
291
|
-
"Should handle malformed CSV gracefully"
|
|
292
|
-
);
|
|
293
|
-
|
|
294
|
-
if let Ok(extraction) = result {
|
|
295
|
-
assert!(!extraction.content.is_empty());
|
|
296
|
-
}
|
|
297
|
-
}
|
|
298
|
-
|
|
299
|
-
/// Test empty CSV file.
|
|
300
|
-
#[tokio::test]
|
|
301
|
-
async fn test_csv_empty() {
|
|
302
|
-
let config = ExtractionConfig::default();
|
|
303
|
-
|
|
304
|
-
let empty_csv = b"";
|
|
305
|
-
|
|
306
|
-
let result = extract_bytes(empty_csv, "text/csv", &config).await;
|
|
307
|
-
|
|
308
|
-
assert!(result.is_ok() || result.is_err(), "Should handle empty CSV gracefully");
|
|
309
|
-
}
|
|
310
|
-
|
|
311
|
-
/// Test CSV with only headers.
|
|
312
|
-
#[tokio::test]
|
|
313
|
-
async fn test_csv_headers_only() {
|
|
314
|
-
let config = ExtractionConfig::default();
|
|
315
|
-
|
|
316
|
-
let csv_content = b"Name,Age,City";
|
|
317
|
-
|
|
318
|
-
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
319
|
-
Ok(result) => result,
|
|
320
|
-
Err(_) => {
|
|
321
|
-
println!("Skipping test: CSV extraction not available");
|
|
322
|
-
return;
|
|
323
|
-
}
|
|
324
|
-
};
|
|
325
|
-
|
|
326
|
-
assert!(
|
|
327
|
-
extraction.chunks.is_none(),
|
|
328
|
-
"Chunks should be None without chunking config"
|
|
329
|
-
);
|
|
330
|
-
assert!(
|
|
331
|
-
extraction.detected_languages.is_none(),
|
|
332
|
-
"Language detection not enabled"
|
|
333
|
-
);
|
|
334
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
335
|
-
|
|
336
|
-
assert!(
|
|
337
|
-
extraction.content.contains("Name") || !extraction.content.is_empty(),
|
|
338
|
-
"Headers should be extracted"
|
|
339
|
-
);
|
|
340
|
-
}
|
|
341
|
-
|
|
342
|
-
/// Test CSV with blank lines.
|
|
343
|
-
#[tokio::test]
|
|
344
|
-
async fn test_csv_blank_lines() {
|
|
345
|
-
let config = ExtractionConfig::default();
|
|
346
|
-
|
|
347
|
-
let csv_content = b"Name,Age\nAlice,30\n\nBob,25\n\nCarlos,35";
|
|
348
|
-
|
|
349
|
-
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
350
|
-
Ok(result) => result,
|
|
351
|
-
Err(_) => {
|
|
352
|
-
println!("Skipping test: CSV extraction not available");
|
|
353
|
-
return;
|
|
354
|
-
}
|
|
355
|
-
};
|
|
356
|
-
|
|
357
|
-
assert!(
|
|
358
|
-
extraction.chunks.is_none(),
|
|
359
|
-
"Chunks should be None without chunking config"
|
|
360
|
-
);
|
|
361
|
-
assert!(
|
|
362
|
-
extraction.detected_languages.is_none(),
|
|
363
|
-
"Language detection not enabled"
|
|
364
|
-
);
|
|
365
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
366
|
-
|
|
367
|
-
assert!(extraction.content.contains("Alice") || extraction.content.contains("Bob"));
|
|
368
|
-
}
|
|
369
|
-
|
|
370
|
-
/// Test CSV with numeric data.
|
|
371
|
-
#[tokio::test]
|
|
372
|
-
async fn test_csv_numeric_data() {
|
|
373
|
-
let config = ExtractionConfig::default();
|
|
374
|
-
|
|
375
|
-
let csv_content = b"ID,Price,Quantity,Discount\n1,19.99,100,0.15\n2,29.99,50,0.20\n3,9.99,200,0.10";
|
|
376
|
-
|
|
377
|
-
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
378
|
-
Ok(result) => result,
|
|
379
|
-
Err(_) => {
|
|
380
|
-
println!("Skipping test: CSV extraction not available");
|
|
381
|
-
return;
|
|
382
|
-
}
|
|
383
|
-
};
|
|
384
|
-
|
|
385
|
-
assert!(
|
|
386
|
-
extraction.chunks.is_none(),
|
|
387
|
-
"Chunks should be None without chunking config"
|
|
388
|
-
);
|
|
389
|
-
assert!(
|
|
390
|
-
extraction.detected_languages.is_none(),
|
|
391
|
-
"Language detection not enabled"
|
|
392
|
-
);
|
|
393
|
-
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
394
|
-
|
|
395
|
-
assert!(extraction.content.contains("Price"), "Should contain Price header");
|
|
396
|
-
assert!(
|
|
397
|
-
extraction.content.contains("Quantity"),
|
|
398
|
-
"Should contain Quantity header"
|
|
399
|
-
);
|
|
400
|
-
assert!(
|
|
401
|
-
extraction.content.contains("Discount"),
|
|
402
|
-
"Should contain Discount header"
|
|
403
|
-
);
|
|
404
|
-
|
|
405
|
-
assert!(extraction.content.contains("19.99"), "Should contain first price");
|
|
406
|
-
assert!(extraction.content.contains("100"), "Should contain first quantity");
|
|
407
|
-
assert!(extraction.content.contains("0.15"), "Should contain first discount");
|
|
408
|
-
|
|
409
|
-
assert!(extraction.content.contains("29.99"), "Should contain second price");
|
|
410
|
-
assert!(extraction.content.contains("50"), "Should contain second quantity");
|
|
411
|
-
|
|
412
|
-
assert!(extraction.content.contains("9.99"), "Should contain third price");
|
|
413
|
-
assert!(extraction.content.contains("200"), "Should contain third quantity");
|
|
414
|
-
}
|
|
1
|
+
//! CSV and spreadsheet integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests for CSV and TSV extraction.
|
|
4
|
+
//! Validates data extraction, custom delimiters, quoted fields, and edge cases.
|
|
5
|
+
|
|
6
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
+
use kreuzberg::core::extractor::extract_bytes;
|
|
8
|
+
|
|
9
|
+
mod helpers;
|
|
10
|
+
|
|
11
|
+
/// Test basic CSV extraction - simple comma-separated values.
|
|
12
|
+
#[tokio::test]
|
|
13
|
+
async fn test_csv_basic_extraction() {
|
|
14
|
+
let config = ExtractionConfig::default();
|
|
15
|
+
|
|
16
|
+
let csv_content = b"Name,Age,City\nAlice,30,NYC\nBob,25,LA";
|
|
17
|
+
|
|
18
|
+
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
19
|
+
Ok(result) => result,
|
|
20
|
+
Err(_) => {
|
|
21
|
+
println!("Skipping test: CSV extraction not available");
|
|
22
|
+
return;
|
|
23
|
+
}
|
|
24
|
+
};
|
|
25
|
+
|
|
26
|
+
assert_eq!(extraction.mime_type, "text/csv");
|
|
27
|
+
assert!(
|
|
28
|
+
extraction.chunks.is_none(),
|
|
29
|
+
"Chunks should be None without chunking config"
|
|
30
|
+
);
|
|
31
|
+
assert!(
|
|
32
|
+
extraction.detected_languages.is_none(),
|
|
33
|
+
"Language detection not enabled"
|
|
34
|
+
);
|
|
35
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
36
|
+
|
|
37
|
+
assert!(extraction.content.contains("Name"), "Should contain 'Name' header");
|
|
38
|
+
assert!(extraction.content.contains("Age"), "Should contain 'Age' header");
|
|
39
|
+
assert!(extraction.content.contains("City"), "Should contain 'City' header");
|
|
40
|
+
|
|
41
|
+
assert!(extraction.content.contains("Alice"), "Should contain Alice row");
|
|
42
|
+
assert!(extraction.content.contains("30"), "Should contain Alice's age");
|
|
43
|
+
assert!(extraction.content.contains("NYC"), "Should contain Alice's city");
|
|
44
|
+
|
|
45
|
+
assert!(extraction.content.contains("Bob"), "Should contain Bob row");
|
|
46
|
+
assert!(extraction.content.contains("25"), "Should contain Bob's age");
|
|
47
|
+
assert!(extraction.content.contains("LA"), "Should contain Bob's city");
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/// Test CSV with headers - first row as headers.
|
|
51
|
+
#[tokio::test]
|
|
52
|
+
async fn test_csv_with_headers() {
|
|
53
|
+
let config = ExtractionConfig::default();
|
|
54
|
+
|
|
55
|
+
let csv_content = b"Product,Price,Quantity\nApple,1.50,100\nBanana,0.75,200\nOrange,2.00,150";
|
|
56
|
+
|
|
57
|
+
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
58
|
+
Ok(result) => result,
|
|
59
|
+
Err(_) => {
|
|
60
|
+
println!("Skipping test: CSV extraction not available");
|
|
61
|
+
return;
|
|
62
|
+
}
|
|
63
|
+
};
|
|
64
|
+
|
|
65
|
+
assert!(
|
|
66
|
+
extraction.chunks.is_none(),
|
|
67
|
+
"Chunks should be None without chunking config"
|
|
68
|
+
);
|
|
69
|
+
assert!(
|
|
70
|
+
extraction.detected_languages.is_none(),
|
|
71
|
+
"Language detection not enabled"
|
|
72
|
+
);
|
|
73
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
74
|
+
|
|
75
|
+
assert!(extraction.content.contains("Product"), "Should contain Product header");
|
|
76
|
+
assert!(extraction.content.contains("Price"), "Should contain Price header");
|
|
77
|
+
assert!(
|
|
78
|
+
extraction.content.contains("Quantity"),
|
|
79
|
+
"Should contain Quantity header"
|
|
80
|
+
);
|
|
81
|
+
|
|
82
|
+
assert!(
|
|
83
|
+
extraction.content.contains("Apple")
|
|
84
|
+
&& extraction.content.contains("1.50")
|
|
85
|
+
&& extraction.content.contains("100")
|
|
86
|
+
);
|
|
87
|
+
assert!(
|
|
88
|
+
extraction.content.contains("Banana")
|
|
89
|
+
&& extraction.content.contains("0.75")
|
|
90
|
+
&& extraction.content.contains("200")
|
|
91
|
+
);
|
|
92
|
+
assert!(
|
|
93
|
+
extraction.content.contains("Orange")
|
|
94
|
+
&& extraction.content.contains("2.00")
|
|
95
|
+
&& extraction.content.contains("150")
|
|
96
|
+
);
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/// Test CSV with custom delimiter - tab and semicolon.
|
|
100
|
+
#[tokio::test]
|
|
101
|
+
async fn test_csv_custom_delimiter() {
|
|
102
|
+
let config = ExtractionConfig::default();
|
|
103
|
+
|
|
104
|
+
let csv_content = b"Name;Age;City\nAlice;30;NYC\nBob;25;LA";
|
|
105
|
+
|
|
106
|
+
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
107
|
+
Ok(result) => result,
|
|
108
|
+
Err(_) => {
|
|
109
|
+
println!("Skipping test: CSV extraction not available");
|
|
110
|
+
return;
|
|
111
|
+
}
|
|
112
|
+
};
|
|
113
|
+
|
|
114
|
+
assert!(
|
|
115
|
+
extraction.chunks.is_none(),
|
|
116
|
+
"Chunks should be None without chunking config"
|
|
117
|
+
);
|
|
118
|
+
assert!(
|
|
119
|
+
extraction.detected_languages.is_none(),
|
|
120
|
+
"Language detection not enabled"
|
|
121
|
+
);
|
|
122
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
123
|
+
|
|
124
|
+
assert!(!extraction.content.is_empty(), "Content should be extracted");
|
|
125
|
+
|
|
126
|
+
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
|
127
|
+
assert!(extraction.content.contains("30"), "Should contain age");
|
|
128
|
+
assert!(extraction.content.contains("NYC"), "Should contain city");
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/// Test TSV (Tab-Separated Values) file.
|
|
132
|
+
#[tokio::test]
|
|
133
|
+
async fn test_tsv_file() {
|
|
134
|
+
let config = ExtractionConfig::default();
|
|
135
|
+
|
|
136
|
+
let tsv_content = b"Name\tAge\tCity\nAlice\t30\tNYC\nBob\t25\tLA";
|
|
137
|
+
|
|
138
|
+
let extraction = match extract_bytes(tsv_content, "text/tab-separated-values", &config).await {
|
|
139
|
+
Ok(result) => result,
|
|
140
|
+
Err(_) => {
|
|
141
|
+
println!("Skipping test: TSV extraction not available");
|
|
142
|
+
return;
|
|
143
|
+
}
|
|
144
|
+
};
|
|
145
|
+
|
|
146
|
+
assert_eq!(extraction.mime_type, "text/tab-separated-values");
|
|
147
|
+
assert!(
|
|
148
|
+
extraction.chunks.is_none(),
|
|
149
|
+
"Chunks should be None without chunking config"
|
|
150
|
+
);
|
|
151
|
+
assert!(
|
|
152
|
+
extraction.detected_languages.is_none(),
|
|
153
|
+
"Language detection not enabled"
|
|
154
|
+
);
|
|
155
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
156
|
+
|
|
157
|
+
assert!(extraction.content.contains("Name"), "Should contain Name header");
|
|
158
|
+
assert!(extraction.content.contains("Age"), "Should contain Age header");
|
|
159
|
+
assert!(extraction.content.contains("City"), "Should contain City header");
|
|
160
|
+
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
|
161
|
+
assert!(extraction.content.contains("Bob"), "Should contain Bob");
|
|
162
|
+
assert!(extraction.content.contains("30") && extraction.content.contains("NYC"));
|
|
163
|
+
assert!(extraction.content.contains("25") && extraction.content.contains("LA"));
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
/// Test CSV with quoted fields - fields containing commas.
|
|
167
|
+
#[tokio::test]
|
|
168
|
+
async fn test_csv_quoted_fields() {
|
|
169
|
+
let config = ExtractionConfig::default();
|
|
170
|
+
|
|
171
|
+
let csv_content =
|
|
172
|
+
b"Name,Description,Price\n\"Smith, John\",\"Product A, premium\",100\n\"Doe, Jane\",\"Product B, standard\",50";
|
|
173
|
+
|
|
174
|
+
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
175
|
+
Ok(result) => result,
|
|
176
|
+
Err(_) => {
|
|
177
|
+
println!("Skipping test: CSV extraction not available");
|
|
178
|
+
return;
|
|
179
|
+
}
|
|
180
|
+
};
|
|
181
|
+
|
|
182
|
+
assert!(
|
|
183
|
+
extraction.chunks.is_none(),
|
|
184
|
+
"Chunks should be None without chunking config"
|
|
185
|
+
);
|
|
186
|
+
assert!(
|
|
187
|
+
extraction.detected_languages.is_none(),
|
|
188
|
+
"Language detection not enabled"
|
|
189
|
+
);
|
|
190
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
191
|
+
|
|
192
|
+
assert!(extraction.content.contains("Smith"), "Should contain Smith");
|
|
193
|
+
assert!(extraction.content.contains("John"), "Should contain John");
|
|
194
|
+
assert!(extraction.content.contains("Doe"), "Should contain Doe");
|
|
195
|
+
assert!(extraction.content.contains("Jane"), "Should contain Jane");
|
|
196
|
+
|
|
197
|
+
assert!(extraction.content.contains("Product A") || extraction.content.contains("premium"));
|
|
198
|
+
assert!(extraction.content.contains("Product B") || extraction.content.contains("standard"));
|
|
199
|
+
|
|
200
|
+
assert!(extraction.content.contains("100") && extraction.content.contains("50"));
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
/// Test CSV with special characters - Unicode, newlines in fields.
|
|
204
|
+
#[tokio::test]
|
|
205
|
+
async fn test_csv_special_characters() {
|
|
206
|
+
let config = ExtractionConfig::default();
|
|
207
|
+
|
|
208
|
+
let csv_content = "Name,City,Emoji\nAlice,Tokyo 東京,🎉\nBob,París,✅\nCarlos,Москва,🌍".as_bytes();
|
|
209
|
+
|
|
210
|
+
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
211
|
+
Ok(result) => result,
|
|
212
|
+
Err(_) => {
|
|
213
|
+
println!("Skipping test: CSV extraction not available");
|
|
214
|
+
return;
|
|
215
|
+
}
|
|
216
|
+
};
|
|
217
|
+
|
|
218
|
+
assert!(
|
|
219
|
+
extraction.chunks.is_none(),
|
|
220
|
+
"Chunks should be None without chunking config"
|
|
221
|
+
);
|
|
222
|
+
assert!(
|
|
223
|
+
extraction.detected_languages.is_none(),
|
|
224
|
+
"Language detection not enabled"
|
|
225
|
+
);
|
|
226
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
227
|
+
|
|
228
|
+
assert!(!extraction.content.is_empty(), "Special characters should be handled");
|
|
229
|
+
|
|
230
|
+
assert!(extraction.content.contains("Alice"), "Should contain Alice");
|
|
231
|
+
assert!(extraction.content.contains("Bob"), "Should contain Bob");
|
|
232
|
+
assert!(extraction.content.contains("Carlos"), "Should contain Carlos");
|
|
233
|
+
|
|
234
|
+
assert!(extraction.content.contains("Tokyo") || extraction.content.contains("東京"));
|
|
235
|
+
assert!(extraction.content.contains("París") || extraction.content.contains("Paris"));
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
/// Test CSV with large file - 10,000+ rows (streaming).
|
|
239
|
+
#[tokio::test]
|
|
240
|
+
async fn test_csv_large_file() {
|
|
241
|
+
let config = ExtractionConfig::default();
|
|
242
|
+
|
|
243
|
+
let mut csv_content = "ID,Name,Value\n".to_string();
|
|
244
|
+
for i in 1..=10_000 {
|
|
245
|
+
csv_content.push_str(&format!("{},Item{},{}.00\n", i, i, i * 10));
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
let extraction = match extract_bytes(csv_content.as_bytes(), "text/csv", &config).await {
|
|
249
|
+
Ok(result) => result,
|
|
250
|
+
Err(_) => {
|
|
251
|
+
println!("Skipping test: CSV extraction not available");
|
|
252
|
+
return;
|
|
253
|
+
}
|
|
254
|
+
};
|
|
255
|
+
|
|
256
|
+
assert!(
|
|
257
|
+
extraction.chunks.is_none(),
|
|
258
|
+
"Chunks should be None without chunking config"
|
|
259
|
+
);
|
|
260
|
+
assert!(
|
|
261
|
+
extraction.detected_languages.is_none(),
|
|
262
|
+
"Language detection not enabled"
|
|
263
|
+
);
|
|
264
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
265
|
+
|
|
266
|
+
assert!(!extraction.content.is_empty(), "Large CSV should be processed");
|
|
267
|
+
|
|
268
|
+
assert!(
|
|
269
|
+
extraction.content.len() > 1000,
|
|
270
|
+
"Large CSV content should be substantial"
|
|
271
|
+
);
|
|
272
|
+
|
|
273
|
+
assert!(extraction.content.contains("Item1") || extraction.content.contains("10.00"));
|
|
274
|
+
|
|
275
|
+
assert!(extraction.content.contains("Item5000") || extraction.content.contains("50000.00"));
|
|
276
|
+
|
|
277
|
+
assert!(extraction.content.contains("Item10000") || extraction.content.contains("100000.00"));
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
/// Test malformed CSV - inconsistent columns.
|
|
281
|
+
#[tokio::test]
|
|
282
|
+
async fn test_csv_malformed() {
|
|
283
|
+
let config = ExtractionConfig::default();
|
|
284
|
+
|
|
285
|
+
let csv_content = b"Name,Age,City\nAlice,30\nBob,25,LA,Extra\nCarlos,35,SF";
|
|
286
|
+
|
|
287
|
+
let result = extract_bytes(csv_content, "text/csv", &config).await;
|
|
288
|
+
|
|
289
|
+
assert!(
|
|
290
|
+
result.is_ok() || result.is_err(),
|
|
291
|
+
"Should handle malformed CSV gracefully"
|
|
292
|
+
);
|
|
293
|
+
|
|
294
|
+
if let Ok(extraction) = result {
|
|
295
|
+
assert!(!extraction.content.is_empty());
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
/// Test empty CSV file.
|
|
300
|
+
#[tokio::test]
|
|
301
|
+
async fn test_csv_empty() {
|
|
302
|
+
let config = ExtractionConfig::default();
|
|
303
|
+
|
|
304
|
+
let empty_csv = b"";
|
|
305
|
+
|
|
306
|
+
let result = extract_bytes(empty_csv, "text/csv", &config).await;
|
|
307
|
+
|
|
308
|
+
assert!(result.is_ok() || result.is_err(), "Should handle empty CSV gracefully");
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
/// Test CSV with only headers.
|
|
312
|
+
#[tokio::test]
|
|
313
|
+
async fn test_csv_headers_only() {
|
|
314
|
+
let config = ExtractionConfig::default();
|
|
315
|
+
|
|
316
|
+
let csv_content = b"Name,Age,City";
|
|
317
|
+
|
|
318
|
+
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
319
|
+
Ok(result) => result,
|
|
320
|
+
Err(_) => {
|
|
321
|
+
println!("Skipping test: CSV extraction not available");
|
|
322
|
+
return;
|
|
323
|
+
}
|
|
324
|
+
};
|
|
325
|
+
|
|
326
|
+
assert!(
|
|
327
|
+
extraction.chunks.is_none(),
|
|
328
|
+
"Chunks should be None without chunking config"
|
|
329
|
+
);
|
|
330
|
+
assert!(
|
|
331
|
+
extraction.detected_languages.is_none(),
|
|
332
|
+
"Language detection not enabled"
|
|
333
|
+
);
|
|
334
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
335
|
+
|
|
336
|
+
assert!(
|
|
337
|
+
extraction.content.contains("Name") || !extraction.content.is_empty(),
|
|
338
|
+
"Headers should be extracted"
|
|
339
|
+
);
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
/// Test CSV with blank lines.
|
|
343
|
+
#[tokio::test]
|
|
344
|
+
async fn test_csv_blank_lines() {
|
|
345
|
+
let config = ExtractionConfig::default();
|
|
346
|
+
|
|
347
|
+
let csv_content = b"Name,Age\nAlice,30\n\nBob,25\n\nCarlos,35";
|
|
348
|
+
|
|
349
|
+
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
350
|
+
Ok(result) => result,
|
|
351
|
+
Err(_) => {
|
|
352
|
+
println!("Skipping test: CSV extraction not available");
|
|
353
|
+
return;
|
|
354
|
+
}
|
|
355
|
+
};
|
|
356
|
+
|
|
357
|
+
assert!(
|
|
358
|
+
extraction.chunks.is_none(),
|
|
359
|
+
"Chunks should be None without chunking config"
|
|
360
|
+
);
|
|
361
|
+
assert!(
|
|
362
|
+
extraction.detected_languages.is_none(),
|
|
363
|
+
"Language detection not enabled"
|
|
364
|
+
);
|
|
365
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
366
|
+
|
|
367
|
+
assert!(extraction.content.contains("Alice") || extraction.content.contains("Bob"));
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
/// Test CSV with numeric data.
|
|
371
|
+
#[tokio::test]
|
|
372
|
+
async fn test_csv_numeric_data() {
|
|
373
|
+
let config = ExtractionConfig::default();
|
|
374
|
+
|
|
375
|
+
let csv_content = b"ID,Price,Quantity,Discount\n1,19.99,100,0.15\n2,29.99,50,0.20\n3,9.99,200,0.10";
|
|
376
|
+
|
|
377
|
+
let extraction = match extract_bytes(csv_content, "text/csv", &config).await {
|
|
378
|
+
Ok(result) => result,
|
|
379
|
+
Err(_) => {
|
|
380
|
+
println!("Skipping test: CSV extraction not available");
|
|
381
|
+
return;
|
|
382
|
+
}
|
|
383
|
+
};
|
|
384
|
+
|
|
385
|
+
assert!(
|
|
386
|
+
extraction.chunks.is_none(),
|
|
387
|
+
"Chunks should be None without chunking config"
|
|
388
|
+
);
|
|
389
|
+
assert!(
|
|
390
|
+
extraction.detected_languages.is_none(),
|
|
391
|
+
"Language detection not enabled"
|
|
392
|
+
);
|
|
393
|
+
assert!(extraction.tables.is_empty(), "CSV should not have table structures");
|
|
394
|
+
|
|
395
|
+
assert!(extraction.content.contains("Price"), "Should contain Price header");
|
|
396
|
+
assert!(
|
|
397
|
+
extraction.content.contains("Quantity"),
|
|
398
|
+
"Should contain Quantity header"
|
|
399
|
+
);
|
|
400
|
+
assert!(
|
|
401
|
+
extraction.content.contains("Discount"),
|
|
402
|
+
"Should contain Discount header"
|
|
403
|
+
);
|
|
404
|
+
|
|
405
|
+
assert!(extraction.content.contains("19.99"), "Should contain first price");
|
|
406
|
+
assert!(extraction.content.contains("100"), "Should contain first quantity");
|
|
407
|
+
assert!(extraction.content.contains("0.15"), "Should contain first discount");
|
|
408
|
+
|
|
409
|
+
assert!(extraction.content.contains("29.99"), "Should contain second price");
|
|
410
|
+
assert!(extraction.content.contains("50"), "Should contain second quantity");
|
|
411
|
+
|
|
412
|
+
assert!(extraction.content.contains("9.99"), "Should contain third price");
|
|
413
|
+
assert!(extraction.content.contains("200"), "Should contain third quantity");
|
|
414
|
+
}
|