kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +2 -105
- data/README.md +454 -454
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +2 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +7 -80
|
@@ -1,421 +1,421 @@
|
|
|
1
|
-
#![cfg(feature = "office")]
|
|
2
|
-
//! Comprehensive test for BibTeX extractor parity with Pandoc
|
|
3
|
-
|
|
4
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
5
|
-
use kreuzberg::extractors::BibtexExtractor;
|
|
6
|
-
use kreuzberg::plugins::DocumentExtractor;
|
|
7
|
-
|
|
8
|
-
mod helpers;
|
|
9
|
-
use helpers::get_test_file_path;
|
|
10
|
-
|
|
11
|
-
#[tokio::test]
|
|
12
|
-
async fn test_all_entry_types() {
|
|
13
|
-
let extractor = BibtexExtractor::new();
|
|
14
|
-
|
|
15
|
-
let test_cases = vec![
|
|
16
|
-
(
|
|
17
|
-
"@article{test, author={John Doe}, title={Test}, journal={Journal}, year={2023}}",
|
|
18
|
-
"article",
|
|
19
|
-
),
|
|
20
|
-
(
|
|
21
|
-
"@book{test, author={John Doe}, title={Test}, publisher={Publisher}, year={2023}}",
|
|
22
|
-
"book",
|
|
23
|
-
),
|
|
24
|
-
(
|
|
25
|
-
"@inproceedings{test, author={John Doe}, title={Test}, booktitle={Conference}, year={2023}}",
|
|
26
|
-
"inproceedings",
|
|
27
|
-
),
|
|
28
|
-
(
|
|
29
|
-
"@phdthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
|
|
30
|
-
"phdthesis",
|
|
31
|
-
),
|
|
32
|
-
(
|
|
33
|
-
"@mastersthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
|
|
34
|
-
"mastersthesis",
|
|
35
|
-
),
|
|
36
|
-
(
|
|
37
|
-
"@techreport{test, author={John Doe}, title={Test}, institution={Institute}, year={2023}}",
|
|
38
|
-
"techreport",
|
|
39
|
-
),
|
|
40
|
-
("@manual{test, title={Test Manual}, year={2023}}", "manual"),
|
|
41
|
-
("@misc{test, author={John Doe}, title={Test}, year={2023}}", "misc"),
|
|
42
|
-
(
|
|
43
|
-
"@unpublished{test, author={John Doe}, title={Test}, note={Unpublished}, year={2023}}",
|
|
44
|
-
"unpublished",
|
|
45
|
-
),
|
|
46
|
-
(
|
|
47
|
-
"@incollection{test, author={John Doe}, title={Test}, booktitle={Book}, publisher={Pub}, year={2023}}",
|
|
48
|
-
"incollection",
|
|
49
|
-
),
|
|
50
|
-
(
|
|
51
|
-
"@inbook{test, author={John Doe}, title={Test}, chapter={5}, publisher={Pub}, year={2023}}",
|
|
52
|
-
"inbook",
|
|
53
|
-
),
|
|
54
|
-
(
|
|
55
|
-
"@proceedings{test, title={Conference Proceedings}, year={2023}}",
|
|
56
|
-
"proceedings",
|
|
57
|
-
),
|
|
58
|
-
("@booklet{test, title={Booklet}, year={2023}}", "booklet"),
|
|
59
|
-
];
|
|
60
|
-
|
|
61
|
-
for (bibtex_content, expected_type) in test_cases {
|
|
62
|
-
let config = ExtractionConfig::default();
|
|
63
|
-
let result = extractor
|
|
64
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
65
|
-
.await;
|
|
66
|
-
|
|
67
|
-
assert!(result.is_ok(), "Failed to parse {} entry", expected_type);
|
|
68
|
-
let result = result.unwrap();
|
|
69
|
-
|
|
70
|
-
if let Some(entry_types) = result.metadata.additional.get("entry_types") {
|
|
71
|
-
assert!(entry_types.as_object().is_some(), "Entry types should be an object");
|
|
72
|
-
println!("Entry type '{}' extracted successfully", expected_type);
|
|
73
|
-
}
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
#[tokio::test]
|
|
78
|
-
async fn test_all_common_fields() {
|
|
79
|
-
let extractor = BibtexExtractor::new();
|
|
80
|
-
|
|
81
|
-
let bibtex_content = r#"
|
|
82
|
-
@article{comprehensive,
|
|
83
|
-
author = {Smith, John and Doe, Jane},
|
|
84
|
-
title = {Comprehensive Test},
|
|
85
|
-
journal = {Test Journal},
|
|
86
|
-
year = {2023},
|
|
87
|
-
volume = {42},
|
|
88
|
-
number = {3},
|
|
89
|
-
pages = {123--145},
|
|
90
|
-
month = {June},
|
|
91
|
-
doi = {10.1234/test.001},
|
|
92
|
-
url = {https://example.com},
|
|
93
|
-
issn = {1234-5678},
|
|
94
|
-
isbn = {978-0-12-345678-9},
|
|
95
|
-
abstract = {This is an abstract},
|
|
96
|
-
keywords = {test, bibtex},
|
|
97
|
-
note = {Additional notes},
|
|
98
|
-
publisher = {Test Publisher},
|
|
99
|
-
address = {Test City},
|
|
100
|
-
edition = {2nd},
|
|
101
|
-
editor = {Editor Name},
|
|
102
|
-
series = {Test Series},
|
|
103
|
-
organization = {Test Org},
|
|
104
|
-
institution = {Test Institute},
|
|
105
|
-
school = {Test School},
|
|
106
|
-
howpublished = {Online},
|
|
107
|
-
type = {Research Article},
|
|
108
|
-
chapter = {5},
|
|
109
|
-
booktitle = {Book Title}
|
|
110
|
-
}
|
|
111
|
-
"#;
|
|
112
|
-
|
|
113
|
-
let config = ExtractionConfig::default();
|
|
114
|
-
let result = extractor
|
|
115
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
116
|
-
.await;
|
|
117
|
-
|
|
118
|
-
assert!(result.is_ok());
|
|
119
|
-
let result = result.unwrap();
|
|
120
|
-
|
|
121
|
-
let content = &result.content;
|
|
122
|
-
|
|
123
|
-
let expected_fields = vec![
|
|
124
|
-
"author",
|
|
125
|
-
"title",
|
|
126
|
-
"journal",
|
|
127
|
-
"year",
|
|
128
|
-
"volume",
|
|
129
|
-
"number",
|
|
130
|
-
"pages",
|
|
131
|
-
"month",
|
|
132
|
-
"doi",
|
|
133
|
-
"url",
|
|
134
|
-
"issn",
|
|
135
|
-
"isbn",
|
|
136
|
-
"abstract",
|
|
137
|
-
"keywords",
|
|
138
|
-
"note",
|
|
139
|
-
"publisher",
|
|
140
|
-
"address",
|
|
141
|
-
"edition",
|
|
142
|
-
"editor",
|
|
143
|
-
"series",
|
|
144
|
-
"organization",
|
|
145
|
-
"institution",
|
|
146
|
-
"school",
|
|
147
|
-
"howpublished",
|
|
148
|
-
"type",
|
|
149
|
-
"chapter",
|
|
150
|
-
"booktitle",
|
|
151
|
-
];
|
|
152
|
-
|
|
153
|
-
let num_fields = expected_fields.len();
|
|
154
|
-
for field in expected_fields {
|
|
155
|
-
assert!(content.contains(field), "Field '{}' should be present in output", field);
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
println!("All {} fields were extracted", num_fields);
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
#[tokio::test]
|
|
162
|
-
async fn test_author_parsing() {
|
|
163
|
-
let extractor = BibtexExtractor::new();
|
|
164
|
-
|
|
165
|
-
let test_cases = vec![
|
|
166
|
-
("author = {John Doe}", vec!["John Doe"]),
|
|
167
|
-
("author = {John Doe and Jane Smith}", vec!["John Doe", "Jane Smith"]),
|
|
168
|
-
("author = {Smith, John and Doe, Jane}", vec!["Smith, John", "Doe, Jane"]),
|
|
169
|
-
(
|
|
170
|
-
"author = {John Doe and Jane Smith and Bob Jones}",
|
|
171
|
-
vec!["John Doe", "Jane Smith", "Bob Jones"],
|
|
172
|
-
),
|
|
173
|
-
("author = {van der Berg, Hans}", vec!["van der Berg, Hans"]),
|
|
174
|
-
("author = {Smith, Jr., John}", vec!["Smith, Jr., John"]),
|
|
175
|
-
];
|
|
176
|
-
|
|
177
|
-
for (author_field, expected_authors) in test_cases {
|
|
178
|
-
let bibtex = format!("@article{{test, {}, title={{Test}}, year={{2023}}}}", author_field);
|
|
179
|
-
|
|
180
|
-
let config = ExtractionConfig::default();
|
|
181
|
-
let result = extractor
|
|
182
|
-
.extract_bytes(bibtex.as_bytes(), "application/x-bibtex", &config)
|
|
183
|
-
.await;
|
|
184
|
-
|
|
185
|
-
assert!(result.is_ok());
|
|
186
|
-
let result = result.unwrap();
|
|
187
|
-
|
|
188
|
-
if let Some(authors) = result.metadata.additional.get("authors") {
|
|
189
|
-
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
190
|
-
|
|
191
|
-
for expected_author in &expected_authors {
|
|
192
|
-
let found = authors_array
|
|
193
|
-
.iter()
|
|
194
|
-
.any(|a| a.as_str().map(|s| s.contains(expected_author)).unwrap_or(false));
|
|
195
|
-
assert!(
|
|
196
|
-
found,
|
|
197
|
-
"Expected author '{}' not found in {:?}",
|
|
198
|
-
expected_author, authors_array
|
|
199
|
-
);
|
|
200
|
-
}
|
|
201
|
-
}
|
|
202
|
-
}
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
#[tokio::test]
|
|
206
|
-
async fn test_special_characters() {
|
|
207
|
-
let extractor = BibtexExtractor::new();
|
|
208
|
-
|
|
209
|
-
let bibtex_content = r#"
|
|
210
|
-
@article{special,
|
|
211
|
-
author = {M{\"u}ller, Hans and Sch{\"o}n, Anna and Garc{\'\i}a, Jos{\'e}},
|
|
212
|
-
title = {Special Characters in {BibTeX}: {\"O}berblick},
|
|
213
|
-
journal = {Test Journal},
|
|
214
|
-
year = {2022}
|
|
215
|
-
}
|
|
216
|
-
"#;
|
|
217
|
-
|
|
218
|
-
let config = ExtractionConfig::default();
|
|
219
|
-
let result = extractor
|
|
220
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
221
|
-
.await;
|
|
222
|
-
|
|
223
|
-
assert!(result.is_ok());
|
|
224
|
-
let result = result.unwrap();
|
|
225
|
-
|
|
226
|
-
assert_eq!(
|
|
227
|
-
result.metadata.additional.get("entry_count"),
|
|
228
|
-
Some(&serde_json::json!(1))
|
|
229
|
-
);
|
|
230
|
-
|
|
231
|
-
if let Some(authors) = result.metadata.additional.get("authors") {
|
|
232
|
-
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
233
|
-
assert!(authors_array.len() >= 3, "Should have 3 authors");
|
|
234
|
-
}
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
#[tokio::test]
|
|
238
|
-
async fn test_year_range_extraction() {
|
|
239
|
-
let extractor = BibtexExtractor::new();
|
|
240
|
-
|
|
241
|
-
let bibtex_content = r#"
|
|
242
|
-
@article{old, author={A}, title={Old}, year={1990}}
|
|
243
|
-
@article{mid, author={B}, title={Mid}, year={2005}}
|
|
244
|
-
@article{new, author={C}, title={New}, year={2023}}
|
|
245
|
-
"#;
|
|
246
|
-
|
|
247
|
-
let config = ExtractionConfig::default();
|
|
248
|
-
let result = extractor
|
|
249
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
250
|
-
.await;
|
|
251
|
-
|
|
252
|
-
assert!(result.is_ok());
|
|
253
|
-
let result = result.unwrap();
|
|
254
|
-
|
|
255
|
-
if let Some(year_range) = result.metadata.additional.get("year_range") {
|
|
256
|
-
assert_eq!(year_range.get("min"), Some(&serde_json::json!(1990)));
|
|
257
|
-
assert_eq!(year_range.get("max"), Some(&serde_json::json!(2023)));
|
|
258
|
-
|
|
259
|
-
if let Some(years) = year_range.get("years") {
|
|
260
|
-
let years_array = years.as_array().expect("Years should be an array");
|
|
261
|
-
assert_eq!(years_array.len(), 3, "Should have 3 unique years");
|
|
262
|
-
}
|
|
263
|
-
} else {
|
|
264
|
-
panic!("Year range not extracted");
|
|
265
|
-
}
|
|
266
|
-
}
|
|
267
|
-
|
|
268
|
-
#[tokio::test]
|
|
269
|
-
async fn test_citation_keys_extraction() {
|
|
270
|
-
let extractor = BibtexExtractor::new();
|
|
271
|
-
|
|
272
|
-
let bibtex_content = r#"
|
|
273
|
-
@article{key1, author={A}, title={T1}, year={2023}}
|
|
274
|
-
@book{key2, author={B}, title={T2}, year={2023}}
|
|
275
|
-
@inproceedings{key3, author={C}, title={T3}, year={2023}}
|
|
276
|
-
"#;
|
|
277
|
-
|
|
278
|
-
let config = ExtractionConfig::default();
|
|
279
|
-
let result = extractor
|
|
280
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
281
|
-
.await;
|
|
282
|
-
|
|
283
|
-
assert!(result.is_ok());
|
|
284
|
-
let result = result.unwrap();
|
|
285
|
-
|
|
286
|
-
if let Some(citation_keys) = result.metadata.additional.get("citation_keys") {
|
|
287
|
-
let keys_array = citation_keys.as_array().expect("Citation keys should be an array");
|
|
288
|
-
assert_eq!(keys_array.len(), 3);
|
|
289
|
-
|
|
290
|
-
let expected_keys = vec!["key1", "key2", "key3"];
|
|
291
|
-
for expected_key in expected_keys {
|
|
292
|
-
let found = keys_array.iter().any(|k| k.as_str() == Some(expected_key));
|
|
293
|
-
assert!(found, "Citation key '{}' not found", expected_key);
|
|
294
|
-
}
|
|
295
|
-
} else {
|
|
296
|
-
panic!("Citation keys not extracted");
|
|
297
|
-
}
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
#[tokio::test]
|
|
301
|
-
async fn test_entry_type_distribution() {
|
|
302
|
-
let extractor = BibtexExtractor::new();
|
|
303
|
-
|
|
304
|
-
let bibtex_content = r#"
|
|
305
|
-
@article{a1, author={A}, title={T1}, year={2023}}
|
|
306
|
-
@article{a2, author={B}, title={T2}, year={2023}}
|
|
307
|
-
@book{b1, author={C}, title={T3}, year={2023}}
|
|
308
|
-
@inproceedings{c1, author={D}, title={T4}, year={2023}}
|
|
309
|
-
@inproceedings{c2, author={E}, title={T5}, year={2023}}
|
|
310
|
-
@inproceedings{c3, author={F}, title={T6}, year={2023}}
|
|
311
|
-
"#;
|
|
312
|
-
|
|
313
|
-
let config = ExtractionConfig::default();
|
|
314
|
-
let result = extractor
|
|
315
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
316
|
-
.await;
|
|
317
|
-
|
|
318
|
-
assert!(result.is_ok());
|
|
319
|
-
let result = result.unwrap();
|
|
320
|
-
|
|
321
|
-
if let Some(entry_types) = result.metadata.additional.get("entry_types") {
|
|
322
|
-
let types_obj = entry_types.as_object().expect("Entry types should be an object");
|
|
323
|
-
|
|
324
|
-
assert_eq!(types_obj.get("article"), Some(&serde_json::json!(2)));
|
|
325
|
-
assert_eq!(types_obj.get("book"), Some(&serde_json::json!(1)));
|
|
326
|
-
assert_eq!(types_obj.get("inproceedings"), Some(&serde_json::json!(3)));
|
|
327
|
-
} else {
|
|
328
|
-
panic!("Entry types not extracted");
|
|
329
|
-
}
|
|
330
|
-
}
|
|
331
|
-
|
|
332
|
-
#[tokio::test]
|
|
333
|
-
async fn test_unicode_support() {
|
|
334
|
-
let extractor = BibtexExtractor::new();
|
|
335
|
-
|
|
336
|
-
let bibtex_content = r#"
|
|
337
|
-
@article{unicode,
|
|
338
|
-
author = {Müller, Hans and Søren, Kierkegård},
|
|
339
|
-
title = {Unicode in BibTeX: A Global Perspective},
|
|
340
|
-
journal = {International Journal},
|
|
341
|
-
year = {2023}
|
|
342
|
-
}
|
|
343
|
-
"#;
|
|
344
|
-
|
|
345
|
-
let config = ExtractionConfig::default();
|
|
346
|
-
let result = extractor
|
|
347
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
348
|
-
.await;
|
|
349
|
-
|
|
350
|
-
assert!(result.is_ok());
|
|
351
|
-
let result = result.unwrap();
|
|
352
|
-
|
|
353
|
-
assert_eq!(
|
|
354
|
-
result.metadata.additional.get("entry_count"),
|
|
355
|
-
Some(&serde_json::json!(1))
|
|
356
|
-
);
|
|
357
|
-
}
|
|
358
|
-
|
|
359
|
-
#[tokio::test]
|
|
360
|
-
async fn test_empty_fields() {
|
|
361
|
-
let extractor = BibtexExtractor::new();
|
|
362
|
-
|
|
363
|
-
let bibtex_content = r#"
|
|
364
|
-
@article{empty,
|
|
365
|
-
author = {Smith, John},
|
|
366
|
-
title = {Test},
|
|
367
|
-
journal = {},
|
|
368
|
-
year = {2023},
|
|
369
|
-
volume = {}
|
|
370
|
-
}
|
|
371
|
-
"#;
|
|
372
|
-
|
|
373
|
-
let config = ExtractionConfig::default();
|
|
374
|
-
let result = extractor
|
|
375
|
-
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
376
|
-
.await;
|
|
377
|
-
|
|
378
|
-
assert!(result.is_ok());
|
|
379
|
-
let result = result.unwrap();
|
|
380
|
-
assert_eq!(
|
|
381
|
-
result.metadata.additional.get("entry_count"),
|
|
382
|
-
Some(&serde_json::json!(1))
|
|
383
|
-
);
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
#[tokio::test]
|
|
387
|
-
async fn test_comprehensive_file() {
|
|
388
|
-
let extractor = BibtexExtractor::new();
|
|
389
|
-
|
|
390
|
-
let fixture_path = get_test_file_path("bibtex/comprehensive.bib");
|
|
391
|
-
let bibtex_content = std::fs::read(&fixture_path)
|
|
392
|
-
.unwrap_or_else(|err| panic!("Failed to read test file at {}: {}", fixture_path.display(), err));
|
|
393
|
-
|
|
394
|
-
let config = ExtractionConfig::default();
|
|
395
|
-
let result = extractor
|
|
396
|
-
.extract_bytes(&bibtex_content, "application/x-bibtex", &config)
|
|
397
|
-
.await;
|
|
398
|
-
|
|
399
|
-
assert!(result.is_ok());
|
|
400
|
-
let result = result.unwrap();
|
|
401
|
-
|
|
402
|
-
assert_eq!(
|
|
403
|
-
result.metadata.additional.get("entry_count"),
|
|
404
|
-
Some(&serde_json::json!(20))
|
|
405
|
-
);
|
|
406
|
-
|
|
407
|
-
if let Some(entry_types) = result.metadata.additional.get("entry_types") {
|
|
408
|
-
let types_obj = entry_types.as_object().expect("Entry types should be an object");
|
|
409
|
-
assert!(types_obj.len() >= 10, "Should have at least 10 different entry types");
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
if let Some(authors) = result.metadata.additional.get("authors") {
|
|
413
|
-
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
414
|
-
assert!(authors_array.len() > 10, "Should have many unique authors");
|
|
415
|
-
}
|
|
416
|
-
|
|
417
|
-
if let Some(year_range) = result.metadata.additional.get("year_range") {
|
|
418
|
-
assert!(year_range.get("min").is_some());
|
|
419
|
-
assert!(year_range.get("max").is_some());
|
|
420
|
-
}
|
|
421
|
-
}
|
|
1
|
+
#![cfg(feature = "office")]
|
|
2
|
+
//! Comprehensive test for BibTeX extractor parity with Pandoc
|
|
3
|
+
|
|
4
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
5
|
+
use kreuzberg::extractors::BibtexExtractor;
|
|
6
|
+
use kreuzberg::plugins::DocumentExtractor;
|
|
7
|
+
|
|
8
|
+
mod helpers;
|
|
9
|
+
use helpers::get_test_file_path;
|
|
10
|
+
|
|
11
|
+
#[tokio::test]
|
|
12
|
+
async fn test_all_entry_types() {
|
|
13
|
+
let extractor = BibtexExtractor::new();
|
|
14
|
+
|
|
15
|
+
let test_cases = vec![
|
|
16
|
+
(
|
|
17
|
+
"@article{test, author={John Doe}, title={Test}, journal={Journal}, year={2023}}",
|
|
18
|
+
"article",
|
|
19
|
+
),
|
|
20
|
+
(
|
|
21
|
+
"@book{test, author={John Doe}, title={Test}, publisher={Publisher}, year={2023}}",
|
|
22
|
+
"book",
|
|
23
|
+
),
|
|
24
|
+
(
|
|
25
|
+
"@inproceedings{test, author={John Doe}, title={Test}, booktitle={Conference}, year={2023}}",
|
|
26
|
+
"inproceedings",
|
|
27
|
+
),
|
|
28
|
+
(
|
|
29
|
+
"@phdthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
|
|
30
|
+
"phdthesis",
|
|
31
|
+
),
|
|
32
|
+
(
|
|
33
|
+
"@mastersthesis{test, author={John Doe}, title={Test}, school={University}, year={2023}}",
|
|
34
|
+
"mastersthesis",
|
|
35
|
+
),
|
|
36
|
+
(
|
|
37
|
+
"@techreport{test, author={John Doe}, title={Test}, institution={Institute}, year={2023}}",
|
|
38
|
+
"techreport",
|
|
39
|
+
),
|
|
40
|
+
("@manual{test, title={Test Manual}, year={2023}}", "manual"),
|
|
41
|
+
("@misc{test, author={John Doe}, title={Test}, year={2023}}", "misc"),
|
|
42
|
+
(
|
|
43
|
+
"@unpublished{test, author={John Doe}, title={Test}, note={Unpublished}, year={2023}}",
|
|
44
|
+
"unpublished",
|
|
45
|
+
),
|
|
46
|
+
(
|
|
47
|
+
"@incollection{test, author={John Doe}, title={Test}, booktitle={Book}, publisher={Pub}, year={2023}}",
|
|
48
|
+
"incollection",
|
|
49
|
+
),
|
|
50
|
+
(
|
|
51
|
+
"@inbook{test, author={John Doe}, title={Test}, chapter={5}, publisher={Pub}, year={2023}}",
|
|
52
|
+
"inbook",
|
|
53
|
+
),
|
|
54
|
+
(
|
|
55
|
+
"@proceedings{test, title={Conference Proceedings}, year={2023}}",
|
|
56
|
+
"proceedings",
|
|
57
|
+
),
|
|
58
|
+
("@booklet{test, title={Booklet}, year={2023}}", "booklet"),
|
|
59
|
+
];
|
|
60
|
+
|
|
61
|
+
for (bibtex_content, expected_type) in test_cases {
|
|
62
|
+
let config = ExtractionConfig::default();
|
|
63
|
+
let result = extractor
|
|
64
|
+
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
65
|
+
.await;
|
|
66
|
+
|
|
67
|
+
assert!(result.is_ok(), "Failed to parse {} entry", expected_type);
|
|
68
|
+
let result = result.unwrap();
|
|
69
|
+
|
|
70
|
+
if let Some(entry_types) = result.metadata.additional.get("entry_types") {
|
|
71
|
+
assert!(entry_types.as_object().is_some(), "Entry types should be an object");
|
|
72
|
+
println!("Entry type '{}' extracted successfully", expected_type);
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
#[tokio::test]
|
|
78
|
+
async fn test_all_common_fields() {
|
|
79
|
+
let extractor = BibtexExtractor::new();
|
|
80
|
+
|
|
81
|
+
let bibtex_content = r#"
|
|
82
|
+
@article{comprehensive,
|
|
83
|
+
author = {Smith, John and Doe, Jane},
|
|
84
|
+
title = {Comprehensive Test},
|
|
85
|
+
journal = {Test Journal},
|
|
86
|
+
year = {2023},
|
|
87
|
+
volume = {42},
|
|
88
|
+
number = {3},
|
|
89
|
+
pages = {123--145},
|
|
90
|
+
month = {June},
|
|
91
|
+
doi = {10.1234/test.001},
|
|
92
|
+
url = {https://example.com},
|
|
93
|
+
issn = {1234-5678},
|
|
94
|
+
isbn = {978-0-12-345678-9},
|
|
95
|
+
abstract = {This is an abstract},
|
|
96
|
+
keywords = {test, bibtex},
|
|
97
|
+
note = {Additional notes},
|
|
98
|
+
publisher = {Test Publisher},
|
|
99
|
+
address = {Test City},
|
|
100
|
+
edition = {2nd},
|
|
101
|
+
editor = {Editor Name},
|
|
102
|
+
series = {Test Series},
|
|
103
|
+
organization = {Test Org},
|
|
104
|
+
institution = {Test Institute},
|
|
105
|
+
school = {Test School},
|
|
106
|
+
howpublished = {Online},
|
|
107
|
+
type = {Research Article},
|
|
108
|
+
chapter = {5},
|
|
109
|
+
booktitle = {Book Title}
|
|
110
|
+
}
|
|
111
|
+
"#;
|
|
112
|
+
|
|
113
|
+
let config = ExtractionConfig::default();
|
|
114
|
+
let result = extractor
|
|
115
|
+
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
116
|
+
.await;
|
|
117
|
+
|
|
118
|
+
assert!(result.is_ok());
|
|
119
|
+
let result = result.unwrap();
|
|
120
|
+
|
|
121
|
+
let content = &result.content;
|
|
122
|
+
|
|
123
|
+
let expected_fields = vec![
|
|
124
|
+
"author",
|
|
125
|
+
"title",
|
|
126
|
+
"journal",
|
|
127
|
+
"year",
|
|
128
|
+
"volume",
|
|
129
|
+
"number",
|
|
130
|
+
"pages",
|
|
131
|
+
"month",
|
|
132
|
+
"doi",
|
|
133
|
+
"url",
|
|
134
|
+
"issn",
|
|
135
|
+
"isbn",
|
|
136
|
+
"abstract",
|
|
137
|
+
"keywords",
|
|
138
|
+
"note",
|
|
139
|
+
"publisher",
|
|
140
|
+
"address",
|
|
141
|
+
"edition",
|
|
142
|
+
"editor",
|
|
143
|
+
"series",
|
|
144
|
+
"organization",
|
|
145
|
+
"institution",
|
|
146
|
+
"school",
|
|
147
|
+
"howpublished",
|
|
148
|
+
"type",
|
|
149
|
+
"chapter",
|
|
150
|
+
"booktitle",
|
|
151
|
+
];
|
|
152
|
+
|
|
153
|
+
let num_fields = expected_fields.len();
|
|
154
|
+
for field in expected_fields {
|
|
155
|
+
assert!(content.contains(field), "Field '{}' should be present in output", field);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
println!("All {} fields were extracted", num_fields);
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
#[tokio::test]
|
|
162
|
+
async fn test_author_parsing() {
|
|
163
|
+
let extractor = BibtexExtractor::new();
|
|
164
|
+
|
|
165
|
+
let test_cases = vec![
|
|
166
|
+
("author = {John Doe}", vec!["John Doe"]),
|
|
167
|
+
("author = {John Doe and Jane Smith}", vec!["John Doe", "Jane Smith"]),
|
|
168
|
+
("author = {Smith, John and Doe, Jane}", vec!["Smith, John", "Doe, Jane"]),
|
|
169
|
+
(
|
|
170
|
+
"author = {John Doe and Jane Smith and Bob Jones}",
|
|
171
|
+
vec!["John Doe", "Jane Smith", "Bob Jones"],
|
|
172
|
+
),
|
|
173
|
+
("author = {van der Berg, Hans}", vec!["van der Berg, Hans"]),
|
|
174
|
+
("author = {Smith, Jr., John}", vec!["Smith, Jr., John"]),
|
|
175
|
+
];
|
|
176
|
+
|
|
177
|
+
for (author_field, expected_authors) in test_cases {
|
|
178
|
+
let bibtex = format!("@article{{test, {}, title={{Test}}, year={{2023}}}}", author_field);
|
|
179
|
+
|
|
180
|
+
let config = ExtractionConfig::default();
|
|
181
|
+
let result = extractor
|
|
182
|
+
.extract_bytes(bibtex.as_bytes(), "application/x-bibtex", &config)
|
|
183
|
+
.await;
|
|
184
|
+
|
|
185
|
+
assert!(result.is_ok());
|
|
186
|
+
let result = result.unwrap();
|
|
187
|
+
|
|
188
|
+
if let Some(authors) = result.metadata.additional.get("authors") {
|
|
189
|
+
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
190
|
+
|
|
191
|
+
for expected_author in &expected_authors {
|
|
192
|
+
let found = authors_array
|
|
193
|
+
.iter()
|
|
194
|
+
.any(|a| a.as_str().map(|s| s.contains(expected_author)).unwrap_or(false));
|
|
195
|
+
assert!(
|
|
196
|
+
found,
|
|
197
|
+
"Expected author '{}' not found in {:?}",
|
|
198
|
+
expected_author, authors_array
|
|
199
|
+
);
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
#[tokio::test]
|
|
206
|
+
async fn test_special_characters() {
|
|
207
|
+
let extractor = BibtexExtractor::new();
|
|
208
|
+
|
|
209
|
+
let bibtex_content = r#"
|
|
210
|
+
@article{special,
|
|
211
|
+
author = {M{\"u}ller, Hans and Sch{\"o}n, Anna and Garc{\'\i}a, Jos{\'e}},
|
|
212
|
+
title = {Special Characters in {BibTeX}: {\"O}berblick},
|
|
213
|
+
journal = {Test Journal},
|
|
214
|
+
year = {2022}
|
|
215
|
+
}
|
|
216
|
+
"#;
|
|
217
|
+
|
|
218
|
+
let config = ExtractionConfig::default();
|
|
219
|
+
let result = extractor
|
|
220
|
+
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
221
|
+
.await;
|
|
222
|
+
|
|
223
|
+
assert!(result.is_ok());
|
|
224
|
+
let result = result.unwrap();
|
|
225
|
+
|
|
226
|
+
assert_eq!(
|
|
227
|
+
result.metadata.additional.get("entry_count"),
|
|
228
|
+
Some(&serde_json::json!(1))
|
|
229
|
+
);
|
|
230
|
+
|
|
231
|
+
if let Some(authors) = result.metadata.additional.get("authors") {
|
|
232
|
+
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
233
|
+
assert!(authors_array.len() >= 3, "Should have 3 authors");
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
#[tokio::test]
|
|
238
|
+
async fn test_year_range_extraction() {
|
|
239
|
+
let extractor = BibtexExtractor::new();
|
|
240
|
+
|
|
241
|
+
let bibtex_content = r#"
|
|
242
|
+
@article{old, author={A}, title={Old}, year={1990}}
|
|
243
|
+
@article{mid, author={B}, title={Mid}, year={2005}}
|
|
244
|
+
@article{new, author={C}, title={New}, year={2023}}
|
|
245
|
+
"#;
|
|
246
|
+
|
|
247
|
+
let config = ExtractionConfig::default();
|
|
248
|
+
let result = extractor
|
|
249
|
+
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
250
|
+
.await;
|
|
251
|
+
|
|
252
|
+
assert!(result.is_ok());
|
|
253
|
+
let result = result.unwrap();
|
|
254
|
+
|
|
255
|
+
if let Some(year_range) = result.metadata.additional.get("year_range") {
|
|
256
|
+
assert_eq!(year_range.get("min"), Some(&serde_json::json!(1990)));
|
|
257
|
+
assert_eq!(year_range.get("max"), Some(&serde_json::json!(2023)));
|
|
258
|
+
|
|
259
|
+
if let Some(years) = year_range.get("years") {
|
|
260
|
+
let years_array = years.as_array().expect("Years should be an array");
|
|
261
|
+
assert_eq!(years_array.len(), 3, "Should have 3 unique years");
|
|
262
|
+
}
|
|
263
|
+
} else {
|
|
264
|
+
panic!("Year range not extracted");
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
#[tokio::test]
|
|
269
|
+
async fn test_citation_keys_extraction() {
|
|
270
|
+
let extractor = BibtexExtractor::new();
|
|
271
|
+
|
|
272
|
+
let bibtex_content = r#"
|
|
273
|
+
@article{key1, author={A}, title={T1}, year={2023}}
|
|
274
|
+
@book{key2, author={B}, title={T2}, year={2023}}
|
|
275
|
+
@inproceedings{key3, author={C}, title={T3}, year={2023}}
|
|
276
|
+
"#;
|
|
277
|
+
|
|
278
|
+
let config = ExtractionConfig::default();
|
|
279
|
+
let result = extractor
|
|
280
|
+
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
281
|
+
.await;
|
|
282
|
+
|
|
283
|
+
assert!(result.is_ok());
|
|
284
|
+
let result = result.unwrap();
|
|
285
|
+
|
|
286
|
+
if let Some(citation_keys) = result.metadata.additional.get("citation_keys") {
|
|
287
|
+
let keys_array = citation_keys.as_array().expect("Citation keys should be an array");
|
|
288
|
+
assert_eq!(keys_array.len(), 3);
|
|
289
|
+
|
|
290
|
+
let expected_keys = vec!["key1", "key2", "key3"];
|
|
291
|
+
for expected_key in expected_keys {
|
|
292
|
+
let found = keys_array.iter().any(|k| k.as_str() == Some(expected_key));
|
|
293
|
+
assert!(found, "Citation key '{}' not found", expected_key);
|
|
294
|
+
}
|
|
295
|
+
} else {
|
|
296
|
+
panic!("Citation keys not extracted");
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
#[tokio::test]
|
|
301
|
+
async fn test_entry_type_distribution() {
|
|
302
|
+
let extractor = BibtexExtractor::new();
|
|
303
|
+
|
|
304
|
+
let bibtex_content = r#"
|
|
305
|
+
@article{a1, author={A}, title={T1}, year={2023}}
|
|
306
|
+
@article{a2, author={B}, title={T2}, year={2023}}
|
|
307
|
+
@book{b1, author={C}, title={T3}, year={2023}}
|
|
308
|
+
@inproceedings{c1, author={D}, title={T4}, year={2023}}
|
|
309
|
+
@inproceedings{c2, author={E}, title={T5}, year={2023}}
|
|
310
|
+
@inproceedings{c3, author={F}, title={T6}, year={2023}}
|
|
311
|
+
"#;
|
|
312
|
+
|
|
313
|
+
let config = ExtractionConfig::default();
|
|
314
|
+
let result = extractor
|
|
315
|
+
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
316
|
+
.await;
|
|
317
|
+
|
|
318
|
+
assert!(result.is_ok());
|
|
319
|
+
let result = result.unwrap();
|
|
320
|
+
|
|
321
|
+
if let Some(entry_types) = result.metadata.additional.get("entry_types") {
|
|
322
|
+
let types_obj = entry_types.as_object().expect("Entry types should be an object");
|
|
323
|
+
|
|
324
|
+
assert_eq!(types_obj.get("article"), Some(&serde_json::json!(2)));
|
|
325
|
+
assert_eq!(types_obj.get("book"), Some(&serde_json::json!(1)));
|
|
326
|
+
assert_eq!(types_obj.get("inproceedings"), Some(&serde_json::json!(3)));
|
|
327
|
+
} else {
|
|
328
|
+
panic!("Entry types not extracted");
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
#[tokio::test]
|
|
333
|
+
async fn test_unicode_support() {
|
|
334
|
+
let extractor = BibtexExtractor::new();
|
|
335
|
+
|
|
336
|
+
let bibtex_content = r#"
|
|
337
|
+
@article{unicode,
|
|
338
|
+
author = {Müller, Hans and Søren, Kierkegård},
|
|
339
|
+
title = {Unicode in BibTeX: A Global Perspective},
|
|
340
|
+
journal = {International Journal},
|
|
341
|
+
year = {2023}
|
|
342
|
+
}
|
|
343
|
+
"#;
|
|
344
|
+
|
|
345
|
+
let config = ExtractionConfig::default();
|
|
346
|
+
let result = extractor
|
|
347
|
+
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
348
|
+
.await;
|
|
349
|
+
|
|
350
|
+
assert!(result.is_ok());
|
|
351
|
+
let result = result.unwrap();
|
|
352
|
+
|
|
353
|
+
assert_eq!(
|
|
354
|
+
result.metadata.additional.get("entry_count"),
|
|
355
|
+
Some(&serde_json::json!(1))
|
|
356
|
+
);
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
#[tokio::test]
|
|
360
|
+
async fn test_empty_fields() {
|
|
361
|
+
let extractor = BibtexExtractor::new();
|
|
362
|
+
|
|
363
|
+
let bibtex_content = r#"
|
|
364
|
+
@article{empty,
|
|
365
|
+
author = {Smith, John},
|
|
366
|
+
title = {Test},
|
|
367
|
+
journal = {},
|
|
368
|
+
year = {2023},
|
|
369
|
+
volume = {}
|
|
370
|
+
}
|
|
371
|
+
"#;
|
|
372
|
+
|
|
373
|
+
let config = ExtractionConfig::default();
|
|
374
|
+
let result = extractor
|
|
375
|
+
.extract_bytes(bibtex_content.as_bytes(), "application/x-bibtex", &config)
|
|
376
|
+
.await;
|
|
377
|
+
|
|
378
|
+
assert!(result.is_ok());
|
|
379
|
+
let result = result.unwrap();
|
|
380
|
+
assert_eq!(
|
|
381
|
+
result.metadata.additional.get("entry_count"),
|
|
382
|
+
Some(&serde_json::json!(1))
|
|
383
|
+
);
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
#[tokio::test]
|
|
387
|
+
async fn test_comprehensive_file() {
|
|
388
|
+
let extractor = BibtexExtractor::new();
|
|
389
|
+
|
|
390
|
+
let fixture_path = get_test_file_path("bibtex/comprehensive.bib");
|
|
391
|
+
let bibtex_content = std::fs::read(&fixture_path)
|
|
392
|
+
.unwrap_or_else(|err| panic!("Failed to read test file at {}: {}", fixture_path.display(), err));
|
|
393
|
+
|
|
394
|
+
let config = ExtractionConfig::default();
|
|
395
|
+
let result = extractor
|
|
396
|
+
.extract_bytes(&bibtex_content, "application/x-bibtex", &config)
|
|
397
|
+
.await;
|
|
398
|
+
|
|
399
|
+
assert!(result.is_ok());
|
|
400
|
+
let result = result.unwrap();
|
|
401
|
+
|
|
402
|
+
assert_eq!(
|
|
403
|
+
result.metadata.additional.get("entry_count"),
|
|
404
|
+
Some(&serde_json::json!(20))
|
|
405
|
+
);
|
|
406
|
+
|
|
407
|
+
if let Some(entry_types) = result.metadata.additional.get("entry_types") {
|
|
408
|
+
let types_obj = entry_types.as_object().expect("Entry types should be an object");
|
|
409
|
+
assert!(types_obj.len() >= 10, "Should have at least 10 different entry types");
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
if let Some(authors) = result.metadata.additional.get("authors") {
|
|
413
|
+
let authors_array = authors.as_array().expect("Authors should be an array");
|
|
414
|
+
assert!(authors_array.len() > 10, "Should have many unique authors");
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
if let Some(year_range) = result.metadata.additional.get("year_range") {
|
|
418
|
+
assert!(year_range.get("min").is_some());
|
|
419
|
+
assert!(year_range.get("max").is_some());
|
|
420
|
+
}
|
|
421
|
+
}
|