kreuzberg 4.0.0.rc2 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +396 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,600 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
RSpec.describe 'Keyword Extraction' do
|
|
4
|
+
describe 'basic keyword extraction' do
|
|
5
|
+
it 'extracts keywords from text' do
|
|
6
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
7
|
+
keywords: Kreuzberg::Config::Keywords.new(
|
|
8
|
+
algorithm: 'yake',
|
|
9
|
+
max_keywords: 10
|
|
10
|
+
)
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
text = 'Machine learning and artificial intelligence are transforming technology. Neural networks and deep learning are key areas of AI research. These technologies enable predictions and data analysis.'
|
|
14
|
+
result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
|
|
15
|
+
|
|
16
|
+
expect(result).not_to be_nil
|
|
17
|
+
expect(result.content).not_to be_nil
|
|
18
|
+
expect(result.content).to include('Machine learning')
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it 'returns keywords in metadata' do
|
|
22
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
23
|
+
keywords: Kreuzberg::Config::Keywords.new(
|
|
24
|
+
algorithm: 'yake',
|
|
25
|
+
max_keywords: 5
|
|
26
|
+
)
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
text = 'Artificial intelligence transforms technology development. Machine learning algorithms improve with training data.'
|
|
30
|
+
result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
|
|
31
|
+
|
|
32
|
+
expect(result).not_to be_nil
|
|
33
|
+
expect(result.metadata).to be_a(Hash)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
it 'respects max_keywords parameter' do
|
|
37
|
+
max_keywords_values = [1, 5, 10]
|
|
38
|
+
|
|
39
|
+
max_keywords_values.each do |max_kw|
|
|
40
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
41
|
+
keywords: Kreuzberg::Config::Keywords.new(
|
|
42
|
+
algorithm: 'yake',
|
|
43
|
+
max_keywords: max_kw
|
|
44
|
+
)
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
text = 'Machine learning and artificial intelligence are transforming technology. Neural networks and deep learning are key research areas. Data science enables predictions. Algorithms process information efficiently.'
|
|
48
|
+
result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
|
|
49
|
+
|
|
50
|
+
expect(result).not_to be_nil
|
|
51
|
+
expect(result.content).not_to be_nil
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
it 'returns content when keywords enabled' do
|
|
56
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
57
|
+
keywords: Kreuzberg::Config::Keywords.new(
|
|
58
|
+
algorithm: 'yake',
|
|
59
|
+
max_keywords: 5
|
|
60
|
+
)
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
text = 'Artificial intelligence is transforming the world.'
|
|
64
|
+
result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
|
|
65
|
+
|
|
66
|
+
expect(result).not_to be_nil
|
|
67
|
+
expect(result.content).not_to be_nil
|
|
68
|
+
expect(result.content).to include('Artificial intelligence')
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
describe 'multilingual keyword extraction' do
|
|
73
|
+
it 'extracts keywords from English text' do
|
|
74
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
75
|
+
keywords: Kreuzberg::Config::Keywords.new(
|
|
76
|
+
algorithm: 'yake',
|
|
77
|
+
language: 'en',
|
|
78
|
+
max_keywords: 5
|
|
79
|
+
)
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
text = 'Machine learning and artificial intelligence are transforming technology development globally.'
|
|
83
|
+
result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
|
|
84
|
+
|
|
85
|
+
expect(result).not_to be_nil
|
|
86
|
+
expect(result.content).not_to be_nil
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
it 'extracts keywords from German text' do
|
|
90
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
91
|
+
keywords: Kreuzberg::Config::Keywords.new(
|
|
92
|
+
algorithm: 'yake',
|
|
93
|
+
language: 'de',
|
|
94
|
+
max_keywords: 5
|
|
95
|
+
)
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
text = 'Maschinelles Lernen und künstliche Intelligenz transformieren die Technologieentwicklung.'
|
|
99
|
+
result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
|
|
100
|
+
|
|
101
|
+
expect(result).not_to be_nil
|
|
102
|
+
expect(result.content).not_to be_nil
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
it 'extracts keywords from French text' do
|
|
106
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
107
|
+
keywords: Kreuzberg::Config::Keywords.new(
|
|
108
|
+
algorithm: 'yake',
|
|
109
|
+
language: 'fr',
|
|
110
|
+
max_keywords: 5
|
|
111
|
+
)
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
text = "L'apprentissage automatique et l'intelligence artificielle transforment le développement technologique."
|
|
115
|
+
result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
|
|
116
|
+
|
|
117
|
+
expect(result).not_to be_nil
|
|
118
|
+
expect(result.content).not_to be_nil
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
it 'handles language parameter correctly' do
|
|
122
|
+
languages = %w[en de fr es it pt nl]
|
|
123
|
+
|
|
124
|
+
languages.each do |lang|
|
|
125
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
126
|
+
keywords: Kreuzberg::Config::Keywords.new(
|
|
127
|
+
algorithm: 'yake',
|
|
128
|
+
language: lang,
|
|
129
|
+
max_keywords: 3
|
|
130
|
+
)
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
text = 'Machine learning and artificial intelligence.'
|
|
134
|
+
result = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
|
|
135
|
+
|
|
136
|
+
expect(result).not_to be_nil
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
# Examples covering the min_score setting of the YAKE keyword configuration.
describe 'min_score filtering' do
  # Extracts the same text at three thresholds and compares content lengths
  # between the lowest and the highest threshold.
  it 'filters keywords with different score thresholds' do
    sample = 'Machine learning and artificial intelligence are transforming technology. Neural networks and deep learning are key research areas.'

    outcomes = [0.1, 0.5, 0.9].each_with_object({}) do |cutoff, collected|
      keyword_settings = Kreuzberg::Config::Keywords.new(
        algorithm: 'yake',
        max_keywords: 100,
        min_score: cutoff
      )
      extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

      outcome = Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)
      collected[cutoff] = outcome

      expect(outcome).not_to be_nil
      expect(outcome.content).not_to be_nil
    end

    # Lower thresholds should produce more or equal results
    loosest = outcomes[0.1]
    strictest = outcomes[0.9]
    expect(loosest.content.length).to be >= strictest.content.length
  end

  # Three extractions with one shared config must yield identical content.
  it 'produces consistent results with same score threshold' do
    keyword_settings = Kreuzberg::Config::Keywords.new(
      algorithm: 'yake',
      max_keywords: 50,
      min_score: 0.3
    )
    extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    sample = 'Artificial intelligence is transforming data science and machine learning research globally with neural networks.'
    first_run, second_run, third_run = Array.new(3) do
      Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)
    end

    # Results should be identical across runs
    expect(first_run.content).to eq(second_run.content)
    expect(second_run.content).to eq(third_run.content)
  end
end
|
|
189
|
+
|
|
190
|
+
# Examples covering the ngram_range setting of the YAKE keyword configuration.
describe 'ngram_range variations' do
  # Extracts the same text once per labelled n-gram range, checks each result
  # has non-empty content, then compares the widest range to unigrams only.
  it 'extracts keywords with different ngram configurations' do
    sample = 'Machine learning and artificial intelligence are transforming data science and technology development globally.'

    ranges_by_label = {
      'single_words' => [1, 1],
      'bigrams' => [2, 2],
      'unigram_bigram' => [1, 2],
      'unigram_trigram' => [1, 3]
    }

    outcomes = ranges_by_label.transform_values do |span|
      keyword_settings = Kreuzberg::Config::Keywords.new(
        algorithm: 'yake',
        max_keywords: 15,
        ngram_range: span
      )
      extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

      outcome = Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)

      expect(outcome).not_to be_nil
      expect(outcome.content).not_to be_nil
      expect(outcome.content.length).to be > 0

      outcome
    end

    # Wider n-gram ranges typically produce more content due to phrase inclusion
    widest = outcomes['unigram_trigram']
    narrowest = outcomes['single_words']
    expect(widest.content.length).to be >= narrowest.content.length
  end

  # Two extractions with one shared config must yield identical content.
  it 'produces consistent results with same ngram_range' do
    keyword_settings = Kreuzberg::Config::Keywords.new(
      algorithm: 'yake',
      max_keywords: 10,
      ngram_range: [1, 2]
    )
    extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    sample = 'Machine learning and artificial intelligence are transforming technology development across industry.'
    first_run, second_run = Array.new(2) do
      Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)
    end

    # Results should be identical
    expect(first_run.content).to eq(second_run.content)
  end
end
|
|
240
|
+
|
|
241
|
+
# Examples covering the choice of keyword algorithm ('yake' / 'rake') and the
# algorithm-specific parameter objects.
describe 'algorithm selection' do
  # Runs the same text through a YAKE config and a RAKE config and checks
  # that each result carries non-empty content.
  it 'both YAKE and RAKE algorithms produce results' do
    sample = 'Machine learning and artificial intelligence are transforming technology and neural networks enable deep learning.'

    # Build both configs up front, YAKE first, then extract in the same order.
    extractions = %w[yake rake].map do |algorithm_name|
      Kreuzberg::Config::Extraction.new(
        keywords: Kreuzberg::Config::Keywords.new(algorithm: algorithm_name, max_keywords: 10)
      )
    end

    outcomes = extractions.map do |extraction|
      Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)
    end

    outcomes.each do |outcome|
      expect(outcome).not_to be_nil
      expect(outcome.content).not_to be_nil
      expect(outcome.content.length).to be > 0
    end
  end

  # Runs YAKE with window_size 2 and window_size 4 and checks that both
  # extractions return non-empty content.
  it 'algorithm-specific parameters affect extraction' do
    sample = 'Machine learning and artificial intelligence are transforming technology development and research.'

    # YAKE with different window sizes
    narrow_window, wide_window = [2, 4].map do |window|
      Kreuzberg::Config::Extraction.new(
        keywords: Kreuzberg::Config::Keywords.new(
          algorithm: 'yake',
          max_keywords: 10,
          yake_params: Kreuzberg::Config::KeywordYakeParams.new(window_size: window)
        )
      )
    end

    narrow_outcome = Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: narrow_window)
    wide_outcome = Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: wide_window)

    expect(narrow_outcome).not_to be_nil
    expect(wide_outcome).not_to be_nil
    # Both should produce valid results
    expect(narrow_outcome.content.length).to be > 0
    expect(wide_outcome.content.length).to be > 0
  end

  # Runs RAKE with explicit min_word_length / max_words_per_phrase settings
  # and checks that the result carries non-empty content.
  it 'RAKE with min_word_length parameter works' do
    rake_settings = Kreuzberg::Config::KeywordRakeParams.new(
      min_word_length: 2,
      max_words_per_phrase: 4
    )
    keyword_settings = Kreuzberg::Config::Keywords.new(
      algorithm: 'rake',
      max_keywords: 10,
      rake_params: rake_settings
    )
    extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    sample = 'Machine learning and artificial intelligence are transforming technology.'
    outcome = Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)

    expect(outcome).not_to be_nil
    expect(outcome.content).not_to be_nil
    expect(outcome.content.length).to be > 0
  end
end
|
|
321
|
+
|
|
322
|
+
# Examples that run several extractions back to back, either over multiple
# texts with one config or over one text with multiple configs.
#
# Every call passes `data:` and `mime_type:` as keyword arguments, matching
# the rest of this spec file; two calls here previously passed them
# positionally.
describe 'batch keyword extraction' do
  # Extracts three different texts with a single shared YAKE config and checks
  # that every extraction returns a result with content.
  it 'processes multiple texts with same configuration' do
    config = Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(
        algorithm: 'yake',
        max_keywords: 5
      )
    )

    texts = [
      'Machine learning and artificial intelligence are transforming technology.',
      'Deep learning neural networks enable advanced data science applications.',
      'Artificial intelligence enables predictions and automation globally.'
    ]

    results = texts.map do |text|
      Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
    end

    expect(results.length).to eq(3)
    results.each do |result|
      expect(result).not_to be_nil
      expect(result.content).not_to be_nil
    end
  end

  # Extracts the same text twice with the same config and checks that the
  # content of both results is equal.
  it 'maintains consistency across batch processing' do
    config = Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(
        algorithm: 'yake',
        max_keywords: 10
      )
    )

    text = 'Machine learning and artificial intelligence are transforming technology development globally.'

    result1 = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)
    result2 = Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: config)

    expect(result1).not_to be_nil
    expect(result2).not_to be_nil
    expect(result1.content).to eq(result2.content)
  end

  # Extracts one text under three different keyword configs (YAKE/RAKE and
  # differing max_keywords) and checks that each call returns a result.
  it 'handles different configurations per batch item' do
    text = 'Machine learning and artificial intelligence are transforming technology.'

    configs = [
      Kreuzberg::Config::Extraction.new(
        keywords: Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 5)
      ),
      Kreuzberg::Config::Extraction.new(
        keywords: Kreuzberg::Config::Keywords.new(algorithm: 'rake', max_keywords: 5)
      ),
      Kreuzberg::Config::Extraction.new(
        keywords: Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 10)
      )
    ]

    results = configs.map do |cfg|
      Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: cfg)
    end

    expect(results.length).to eq(3)
    results.each do |result|
      expect(result).not_to be_nil
    end
  end
end
|
|
387
|
+
|
|
388
|
+
# Examples grouped under score normalization.
# NOTE(review): none of these examples reads keyword scores from the result;
# they assert only that a result (and, in two cases, its content) is present.
# Confirm whether score-level assertions were intended.
describe 'score normalization validation' do
  # Extracts with YAKE (max 20 keywords) and checks result and content exist.
  it 'returns normalized scores between 0 and 1' do
    keyword_settings = Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 20)
    extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    sample = 'Machine learning and artificial intelligence are transforming technology. Neural networks and deep learning enable data science applications.'
    outcome = Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)

    expect(outcome).not_to be_nil
    expect(outcome.content).not_to be_nil
  end

  # Extracts with YAKE (max 50 keywords) and checks result and content exist.
  it 'validates score values are realistic' do
    keyword_settings = Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 50)
    extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    sample = 'Artificial intelligence machine learning data science neural networks deep learning transforming technology.'
    outcome = Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)

    expect(outcome).not_to be_nil
    expect(outcome.content).not_to be_nil
  end

  # Extracts once per min_score value across the 0.0..1.0 range and checks
  # that each call returns a result.
  it 'handles score filtering with normalized ranges' do
    [0.0, 0.25, 0.5, 0.75, 1.0].each do |cutoff|
      keyword_settings = Kreuzberg::Config::Keywords.new(
        algorithm: 'yake',
        max_keywords: 100,
        min_score: cutoff
      )
      extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

      sample = 'Machine learning artificial intelligence data science neural networks deep learning technology.'
      outcome = Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)

      expect(outcome).not_to be_nil
    end
  end

  # Extracts the same text twice with one config and checks both results exist.
  it 'ensures score consistency for same text' do
    keyword_settings = Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 10)
    extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    sample = 'Machine learning and artificial intelligence transform technology.'
    first_run, second_run = Array.new(2) do
      Kreuzberg.extract_bytes_sync(data: sample, mime_type: 'text/plain', config: extraction)
    end

    expect(first_run).not_to be_nil
    expect(second_run).not_to be_nil
  end
end
|
|
454
|
+
|
|
455
|
+
# Examples feeding unusual inputs or settings into keyword extraction. Each
# one checks that a result comes back; some also check its content.
describe 'empty and edge cases' do
  # Builds an Extraction config whose keyword settings use the given
  # algorithm and max_keywords values.
  def edge_case_extraction(algorithm, max_keywords)
    Kreuzberg::Config::Extraction.new(
      keywords: Kreuzberg::Config::Keywords.new(
        algorithm: algorithm,
        max_keywords: max_keywords
      )
    )
  end

  # Runs a synchronous extraction of +text+ as text/plain under +extraction+.
  def extract_edge_case(text, extraction)
    Kreuzberg.extract_bytes_sync(data: text, mime_type: 'text/plain', config: extraction)
  end

  it 'handles very short text gracefully' do
    extraction = edge_case_extraction('yake', 5)
    outcome = extract_edge_case('AI', extraction)

    expect(outcome).not_to be_nil
    expect(outcome.content).not_to be_nil
  end

  it 'handles text with no obvious keywords' do
    extraction = edge_case_extraction('yake', 5)
    outcome = extract_edge_case('a b c d e', extraction)

    expect(outcome).not_to be_nil
  end

  it 'handles text with repeated keywords' do
    extraction = edge_case_extraction('yake', 5)
    outcome = extract_edge_case(
      'Machine machine machine learning learning learning artificial artificial artificial intelligence intelligence.',
      extraction
    )

    expect(outcome).not_to be_nil
    expect(outcome.content).not_to be_nil
  end

  it 'handles max_keywords of 0' do
    extraction = edge_case_extraction('yake', 0)
    outcome = extract_edge_case('Machine learning and artificial intelligence.', extraction)

    expect(outcome).not_to be_nil
  end

  it 'handles large max_keywords value' do
    extraction = edge_case_extraction('yake', 1000)
    outcome = extract_edge_case('Machine learning and artificial intelligence are transforming technology.', extraction)

    expect(outcome).not_to be_nil
  end

  # Uses an Extraction config constructed without any keyword settings.
  it 'handles disabled keyword extraction' do
    extraction = Kreuzberg::Config::Extraction.new
    outcome = extract_edge_case('Machine learning and artificial intelligence.', extraction)

    expect(outcome).not_to be_nil
    expect(outcome.content).not_to be_nil
  end

  # Passes nil explicitly for both algorithm and max_keywords.
  it 'handles keywords config with nil values' do
    extraction = edge_case_extraction(nil, nil)
    outcome = extract_edge_case('Machine learning and artificial intelligence.', extraction)

    expect(outcome).not_to be_nil
  end
end
|
|
551
|
+
|
|
552
|
+
# Examples checking how a Keywords config is stored on, and serialized from,
# an Extraction config. No extraction is run here.
describe 'integration with Extraction config' do
  # A Keywords instance passed in is exposed through #keywords.
  it 'accepts Keywords config in Extraction' do
    keyword_settings = Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 10)
    extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    stored = extraction.keywords
    expect(stored).to be_a(Kreuzberg::Config::Keywords)
    expect(stored.algorithm).to eq('yake')
  end

  # A plain Hash passed as keywords: is exposed as a Keywords instance.
  it 'accepts keywords config as hash in Extraction' do
    raw_settings = {
      algorithm: 'rake',
      max_keywords: 15,
      min_score: 0.3
    }
    extraction = Kreuzberg::Config::Extraction.new(keywords: raw_settings)

    stored = extraction.keywords
    expect(stored).to be_a(Kreuzberg::Config::Keywords)
    expect(stored.algorithm).to eq('rake')
    expect(stored.max_keywords).to eq(15)
  end

  # #to_h carries the keyword settings as a nested Hash under :keywords.
  it 'includes keywords config in to_h' do
    keyword_settings = Kreuzberg::Config::Keywords.new(algorithm: 'yake', max_keywords: 10)
    extraction = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)

    serialized = extraction.to_h

    expect(serialized).to include(:keywords)
    expect(serialized[:keywords]).to be_a(Hash)
    expect(serialized[:keywords][:algorithm]).to eq('yake')
  end

  # keywords: nil leaves both the accessor and the serialized entry nil.
  it 'handles nil keywords config' do
    extraction = Kreuzberg::Config::Extraction.new(keywords: nil)

    expect(extraction.keywords).to be_nil
    serialized = extraction.to_h
    expect(serialized[:keywords]).to be_nil
  end
end
|
|
600
|
+
end
|