kreuzberg 4.0.0.rc1 → 4.0.0.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -8
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -534
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -9
- data/Gemfile.lock +9 -109
- data/README.md +426 -421
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -340
- data/ext/kreuzberg_rb/extconf.rb +45 -35
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -17
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +148 -105
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +46 -45
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +691 -684
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -50
- data/lib/kreuzberg/extraction_api.rb +85 -84
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +216 -216
- data/lib/kreuzberg/setup_lib_path.rb +80 -79
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +103 -82
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +520 -468
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -87
- data/spec/binding/cli_spec.rb +55 -54
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -42
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/kreuzberg/Cargo.toml +204 -134
- data/vendor/kreuzberg/README.md +175 -175
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -460
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
- data/vendor/kreuzberg/src/core/config.rs +1032 -1032
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
- data/vendor/kreuzberg/src/core/io.rs +329 -327
- data/vendor/kreuzberg/src/core/mime.rs +605 -615
- data/vendor/kreuzberg/src/core/mod.rs +45 -42
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
- data/vendor/kreuzberg/src/embeddings.rs +432 -323
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +553 -553
- data/vendor/kreuzberg/src/extraction/image.rs +368 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
- data/vendor/kreuzberg/src/extractors/email.rs +143 -129
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -410
- data/vendor/kreuzberg/src/extractors/image.rs +198 -195
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
- data/vendor/kreuzberg/src/extractors/text.rs +260 -242
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
- data/vendor/kreuzberg/src/lib.rs +105 -102
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -122
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +393 -420
- data/vendor/kreuzberg/src/pdf/text.rs +158 -161
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +19 -19
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +903 -873
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
- data/vendor/kreuzberg/tests/config_features.rs +598 -580
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
- data/vendor/kreuzberg/tests/core_integration.rs +510 -493
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -404
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +90 -95
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/spec/examples.txt +0 -104
- data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
- data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
- data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
|
@@ -1,237 +1,237 @@
|
|
|
1
|
-
//! Keyword extraction module.
|
|
2
|
-
//!
|
|
3
|
-
//! Provides unified keyword extraction interface supporting multiple algorithms:
|
|
4
|
-
//! - YAKE (Yet Another Keyword Extractor) - statistical approach
|
|
5
|
-
//! - RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based
|
|
6
|
-
//!
|
|
7
|
-
//! # Feature Flags
|
|
8
|
-
//!
|
|
9
|
-
//! - `keywords-yake`: Enable YAKE algorithm
|
|
10
|
-
//! - `keywords-rake`: Enable RAKE algorithm
|
|
11
|
-
//! - `keywords`: Enable both algorithms (default in `full` feature)
|
|
12
|
-
//!
|
|
13
|
-
//! # Examples
|
|
14
|
-
//!
|
|
15
|
-
//! ```rust,no_run
|
|
16
|
-
//! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
|
|
17
|
-
//! let text = "Rust is a systems programming language focused on safety and performance.";
|
|
18
|
-
//!
|
|
19
|
-
//! // Use default algorithm (YAKE if available)
|
|
20
|
-
//! let config = KeywordConfig::default();
|
|
21
|
-
//! let keywords = extract_keywords(text, &config).unwrap();
|
|
22
|
-
//!
|
|
23
|
-
//! for keyword in keywords {
|
|
24
|
-
//! println!("{}: {:.3}", keyword.text, keyword.score);
|
|
25
|
-
//! }
|
|
26
|
-
//! ```
|
|
27
|
-
//!
|
|
28
|
-
//! ```rust,no_run
|
|
29
|
-
//! # #[cfg(feature = "keywords-rake")]
|
|
30
|
-
//! # {
|
|
31
|
-
//! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
|
|
32
|
-
//! // Use RAKE algorithm explicitly
|
|
33
|
-
//! let text = "Machine learning models require large datasets.";
|
|
34
|
-
//! let config = KeywordConfig::rake()
|
|
35
|
-
//! .with_max_keywords(5)
|
|
36
|
-
//! .with_min_score(0.3);
|
|
37
|
-
//!
|
|
38
|
-
//! let keywords = extract_keywords(text, &config).unwrap();
|
|
39
|
-
//! # }
|
|
40
|
-
//! ```
|
|
41
|
-
|
|
42
|
-
use crate::Result;
|
|
43
|
-
use crate::plugins::registry::get_post_processor_registry;
|
|
44
|
-
use once_cell::sync::Lazy;
|
|
45
|
-
use std::sync::Arc;
|
|
46
|
-
|
|
47
|
-
pub mod config;
|
|
48
|
-
pub mod processor;
|
|
49
|
-
pub mod types;
|
|
50
|
-
|
|
51
|
-
#[cfg(feature = "keywords-yake")]
|
|
52
|
-
mod yake;
|
|
53
|
-
|
|
54
|
-
#[cfg(feature = "keywords-rake")]
|
|
55
|
-
mod rake;
|
|
56
|
-
|
|
57
|
-
pub use config::KeywordConfig;
|
|
58
|
-
pub use processor::KeywordExtractor;
|
|
59
|
-
|
|
60
|
-
#[cfg(feature = "keywords-rake")]
|
|
61
|
-
pub use config::RakeParams;
|
|
62
|
-
|
|
63
|
-
#[cfg(feature = "keywords-yake")]
|
|
64
|
-
pub use config::YakeParams;
|
|
65
|
-
pub use types::{Keyword, KeywordAlgorithm};
|
|
66
|
-
|
|
67
|
-
/// Extract keywords from text using the specified algorithm.
|
|
68
|
-
///
|
|
69
|
-
/// This is the unified entry point for keyword extraction. The algorithm
|
|
70
|
-
/// used is determined by `config.algorithm`.
|
|
71
|
-
///
|
|
72
|
-
/// # Arguments
|
|
73
|
-
///
|
|
74
|
-
/// * `text` - The text to extract keywords from
|
|
75
|
-
/// * `config` - Keyword extraction configuration
|
|
76
|
-
///
|
|
77
|
-
/// # Returns
|
|
78
|
-
///
|
|
79
|
-
/// A vector of keywords sorted by relevance (highest score first).
|
|
80
|
-
///
|
|
81
|
-
/// # Errors
|
|
82
|
-
///
|
|
83
|
-
/// Returns an error if:
|
|
84
|
-
/// - The specified algorithm feature is not enabled
|
|
85
|
-
/// - Keyword extraction fails
|
|
86
|
-
///
|
|
87
|
-
/// # Examples
|
|
88
|
-
///
|
|
89
|
-
/// ```rust,no_run
|
|
90
|
-
/// # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
|
|
91
|
-
/// let text = "Document intelligence with Rust provides memory safety.";
|
|
92
|
-
/// let config = KeywordConfig::default()
|
|
93
|
-
/// .with_max_keywords(10)
|
|
94
|
-
/// .with_language("en");
|
|
95
|
-
///
|
|
96
|
-
/// let keywords = extract_keywords(text, &config)?;
|
|
97
|
-
///
|
|
98
|
-
/// for keyword in keywords {
|
|
99
|
-
/// println!("{}: {:.3}", keyword.text, keyword.score);
|
|
100
|
-
/// }
|
|
101
|
-
/// # Ok::<(), kreuzberg::KreuzbergError>(())
|
|
102
|
-
/// ```
|
|
103
|
-
pub fn extract_keywords(text: &str, config: &KeywordConfig) -> Result<Vec<Keyword>> {
|
|
104
|
-
match config.algorithm {
|
|
105
|
-
#[cfg(feature = "keywords-yake")]
|
|
106
|
-
KeywordAlgorithm::Yake => yake::extract_keywords_yake(text, config),
|
|
107
|
-
|
|
108
|
-
#[cfg(feature = "keywords-rake")]
|
|
109
|
-
KeywordAlgorithm::Rake => rake::extract_keywords_rake(text, config),
|
|
110
|
-
|
|
111
|
-
#[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
|
|
112
|
-
_ => Err(crate::KreuzbergError::Other(
|
|
113
|
-
"No keyword extraction algorithm feature enabled".to_string(),
|
|
114
|
-
)),
|
|
115
|
-
}
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
/// Lazy-initialized flag that ensures keyword processor is registered exactly once.
|
|
119
|
-
///
|
|
120
|
-
/// This static is accessed on first use to automatically register the
|
|
121
|
-
/// keyword extraction processor with the plugin registry.
|
|
122
|
-
static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_keyword_processor);
|
|
123
|
-
|
|
124
|
-
/// Ensure the keyword processor is registered.
|
|
125
|
-
///
|
|
126
|
-
/// This function is called automatically when needed.
|
|
127
|
-
/// It's safe to call multiple times - registration only happens once.
|
|
128
|
-
pub fn ensure_initialized() -> Result<()> {
|
|
129
|
-
PROCESSOR_INITIALIZED
|
|
130
|
-
.as_ref()
|
|
131
|
-
.map(|_| ())
|
|
132
|
-
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
133
|
-
message: format!("Failed to register keyword processor: {}", e),
|
|
134
|
-
plugin_name: "keyword-extraction".to_string(),
|
|
135
|
-
})
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
/// Register the keyword extraction processor with the global registry.
|
|
139
|
-
///
|
|
140
|
-
/// This function should be called once at application startup to register
|
|
141
|
-
/// the keyword extraction post-processor.
|
|
142
|
-
///
|
|
143
|
-
/// **Note:** This is called automatically on first use.
|
|
144
|
-
/// Explicit calling is optional.
|
|
145
|
-
///
|
|
146
|
-
/// # Example
|
|
147
|
-
///
|
|
148
|
-
/// ```rust
|
|
149
|
-
/// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
150
|
-
/// use kreuzberg::keywords::register_keyword_processor;
|
|
151
|
-
///
|
|
152
|
-
/// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
153
|
-
/// # fn main() -> kreuzberg::Result<()> {
|
|
154
|
-
/// register_keyword_processor()?;
|
|
155
|
-
/// # Ok(())
|
|
156
|
-
/// # }
|
|
157
|
-
/// # #[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
|
|
158
|
-
/// # fn main() {}
|
|
159
|
-
/// ```
|
|
160
|
-
pub fn register_keyword_processor() -> Result<()> {
|
|
161
|
-
let registry = get_post_processor_registry();
|
|
162
|
-
let mut registry = registry
|
|
163
|
-
.write()
|
|
164
|
-
.map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
|
|
165
|
-
|
|
166
|
-
registry.register(Arc::new(KeywordExtractor), 50)?;
|
|
167
|
-
|
|
168
|
-
Ok(())
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
#[cfg(test)]
|
|
172
|
-
mod tests {
|
|
173
|
-
use super::*;
|
|
174
|
-
|
|
175
|
-
#[test]
|
|
176
|
-
fn test_extract_keywords_default_algorithm() {
|
|
177
|
-
let text = "Rust programming language provides memory safety and performance.";
|
|
178
|
-
let config = KeywordConfig::default();
|
|
179
|
-
|
|
180
|
-
let keywords = extract_keywords(text, &config).unwrap();
|
|
181
|
-
|
|
182
|
-
assert!(!keywords.is_empty(), "Should extract keywords");
|
|
183
|
-
assert!(keywords.len() <= config.max_keywords);
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
#[cfg(feature = "keywords-yake")]
|
|
187
|
-
#[test]
|
|
188
|
-
fn test_extract_keywords_yake() {
|
|
189
|
-
let text = "Natural language processing using Rust is efficient and safe.";
|
|
190
|
-
let config = KeywordConfig::yake();
|
|
191
|
-
|
|
192
|
-
let keywords = extract_keywords(text, &config).unwrap();
|
|
193
|
-
|
|
194
|
-
assert!(!keywords.is_empty());
|
|
195
|
-
assert_eq!(keywords[0].algorithm, KeywordAlgorithm::Yake);
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
#[cfg(feature = "keywords-rake")]
|
|
199
|
-
#[test]
|
|
200
|
-
fn test_extract_keywords_rake() {
|
|
201
|
-
let text = "Natural language processing using Rust is efficient and safe.";
|
|
202
|
-
let config = KeywordConfig::rake();
|
|
203
|
-
|
|
204
|
-
let keywords = extract_keywords(text, &config).unwrap();
|
|
205
|
-
|
|
206
|
-
assert!(!keywords.is_empty());
|
|
207
|
-
assert_eq!(keywords[0].algorithm, KeywordAlgorithm::Rake);
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
#[cfg(all(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
211
|
-
#[test]
|
|
212
|
-
fn test_compare_algorithms() {
|
|
213
|
-
let text = "Machine learning and artificial intelligence are transforming technology. \
|
|
214
|
-
Deep learning models require substantial computational resources.";
|
|
215
|
-
|
|
216
|
-
let yake_config = KeywordConfig::yake().with_max_keywords(5);
|
|
217
|
-
let yake_keywords = extract_keywords(text, &yake_config).unwrap();
|
|
218
|
-
|
|
219
|
-
let rake_config = KeywordConfig::rake().with_max_keywords(5);
|
|
220
|
-
let rake_keywords = extract_keywords(text, &rake_config).unwrap();
|
|
221
|
-
|
|
222
|
-
assert!(!yake_keywords.is_empty());
|
|
223
|
-
assert!(!rake_keywords.is_empty());
|
|
224
|
-
|
|
225
|
-
assert!(yake_keywords.iter().all(|k| k.algorithm == KeywordAlgorithm::Yake));
|
|
226
|
-
assert!(rake_keywords.iter().all(|k| k.algorithm == KeywordAlgorithm::Rake));
|
|
227
|
-
|
|
228
|
-
println!(
|
|
229
|
-
"YAKE keywords: {:?}",
|
|
230
|
-
yake_keywords.iter().map(|k| &k.text).collect::<Vec<_>>()
|
|
231
|
-
);
|
|
232
|
-
println!(
|
|
233
|
-
"RAKE keywords: {:?}",
|
|
234
|
-
rake_keywords.iter().map(|k| &k.text).collect::<Vec<_>>()
|
|
235
|
-
);
|
|
236
|
-
}
|
|
237
|
-
}
|
|
1
|
+
//! Keyword extraction module.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides unified keyword extraction interface supporting multiple algorithms:
|
|
4
|
+
//! - YAKE (Yet Another Keyword Extractor) - statistical approach
|
|
5
|
+
//! - RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based
|
|
6
|
+
//!
|
|
7
|
+
//! # Feature Flags
|
|
8
|
+
//!
|
|
9
|
+
//! - `keywords-yake`: Enable YAKE algorithm
|
|
10
|
+
//! - `keywords-rake`: Enable RAKE algorithm
|
|
11
|
+
//! - `keywords`: Enable both algorithms (default in `full` feature)
|
|
12
|
+
//!
|
|
13
|
+
//! # Examples
|
|
14
|
+
//!
|
|
15
|
+
//! ```rust,no_run
|
|
16
|
+
//! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
|
|
17
|
+
//! let text = "Rust is a systems programming language focused on safety and performance.";
|
|
18
|
+
//!
|
|
19
|
+
//! // Use default algorithm (YAKE if available)
|
|
20
|
+
//! let config = KeywordConfig::default();
|
|
21
|
+
//! let keywords = extract_keywords(text, &config).unwrap();
|
|
22
|
+
//!
|
|
23
|
+
//! for keyword in keywords {
|
|
24
|
+
//! println!("{}: {:.3}", keyword.text, keyword.score);
|
|
25
|
+
//! }
|
|
26
|
+
//! ```
|
|
27
|
+
//!
|
|
28
|
+
//! ```rust,no_run
|
|
29
|
+
//! # #[cfg(feature = "keywords-rake")]
|
|
30
|
+
//! # {
|
|
31
|
+
//! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
|
|
32
|
+
//! // Use RAKE algorithm explicitly
|
|
33
|
+
//! let text = "Machine learning models require large datasets.";
|
|
34
|
+
//! let config = KeywordConfig::rake()
|
|
35
|
+
//! .with_max_keywords(5)
|
|
36
|
+
//! .with_min_score(0.3);
|
|
37
|
+
//!
|
|
38
|
+
//! let keywords = extract_keywords(text, &config).unwrap();
|
|
39
|
+
//! # }
|
|
40
|
+
//! ```
|
|
41
|
+
|
|
42
|
+
use crate::Result;
|
|
43
|
+
use crate::plugins::registry::get_post_processor_registry;
|
|
44
|
+
use once_cell::sync::Lazy;
|
|
45
|
+
use std::sync::Arc;
|
|
46
|
+
|
|
47
|
+
pub mod config;
|
|
48
|
+
pub mod processor;
|
|
49
|
+
pub mod types;
|
|
50
|
+
|
|
51
|
+
#[cfg(feature = "keywords-yake")]
|
|
52
|
+
mod yake;
|
|
53
|
+
|
|
54
|
+
#[cfg(feature = "keywords-rake")]
|
|
55
|
+
mod rake;
|
|
56
|
+
|
|
57
|
+
pub use config::KeywordConfig;
|
|
58
|
+
pub use processor::KeywordExtractor;
|
|
59
|
+
|
|
60
|
+
#[cfg(feature = "keywords-rake")]
|
|
61
|
+
pub use config::RakeParams;
|
|
62
|
+
|
|
63
|
+
#[cfg(feature = "keywords-yake")]
|
|
64
|
+
pub use config::YakeParams;
|
|
65
|
+
pub use types::{Keyword, KeywordAlgorithm};
|
|
66
|
+
|
|
67
|
+
/// Extract keywords from text using the specified algorithm.
|
|
68
|
+
///
|
|
69
|
+
/// This is the unified entry point for keyword extraction. The algorithm
|
|
70
|
+
/// used is determined by `config.algorithm`.
|
|
71
|
+
///
|
|
72
|
+
/// # Arguments
|
|
73
|
+
///
|
|
74
|
+
/// * `text` - The text to extract keywords from
|
|
75
|
+
/// * `config` - Keyword extraction configuration
|
|
76
|
+
///
|
|
77
|
+
/// # Returns
|
|
78
|
+
///
|
|
79
|
+
/// A vector of keywords sorted by relevance (highest score first).
|
|
80
|
+
///
|
|
81
|
+
/// # Errors
|
|
82
|
+
///
|
|
83
|
+
/// Returns an error if:
|
|
84
|
+
/// - The specified algorithm feature is not enabled
|
|
85
|
+
/// - Keyword extraction fails
|
|
86
|
+
///
|
|
87
|
+
/// # Examples
|
|
88
|
+
///
|
|
89
|
+
/// ```rust,no_run
|
|
90
|
+
/// # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
|
|
91
|
+
/// let text = "Document intelligence with Rust provides memory safety.";
|
|
92
|
+
/// let config = KeywordConfig::default()
|
|
93
|
+
/// .with_max_keywords(10)
|
|
94
|
+
/// .with_language("en");
|
|
95
|
+
///
|
|
96
|
+
/// let keywords = extract_keywords(text, &config)?;
|
|
97
|
+
///
|
|
98
|
+
/// for keyword in keywords {
|
|
99
|
+
/// println!("{}: {:.3}", keyword.text, keyword.score);
|
|
100
|
+
/// }
|
|
101
|
+
/// # Ok::<(), kreuzberg::KreuzbergError>(())
|
|
102
|
+
/// ```
|
|
103
|
+
pub fn extract_keywords(text: &str, config: &KeywordConfig) -> Result<Vec<Keyword>> {
|
|
104
|
+
match config.algorithm {
|
|
105
|
+
#[cfg(feature = "keywords-yake")]
|
|
106
|
+
KeywordAlgorithm::Yake => yake::extract_keywords_yake(text, config),
|
|
107
|
+
|
|
108
|
+
#[cfg(feature = "keywords-rake")]
|
|
109
|
+
KeywordAlgorithm::Rake => rake::extract_keywords_rake(text, config),
|
|
110
|
+
|
|
111
|
+
#[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
|
|
112
|
+
_ => Err(crate::KreuzbergError::Other(
|
|
113
|
+
"No keyword extraction algorithm feature enabled".to_string(),
|
|
114
|
+
)),
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
/// Lazy-initialized flag that ensures keyword processor is registered exactly once.
|
|
119
|
+
///
|
|
120
|
+
/// This static is accessed on first use to automatically register the
|
|
121
|
+
/// keyword extraction processor with the plugin registry.
|
|
122
|
+
static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_keyword_processor);
|
|
123
|
+
|
|
124
|
+
/// Ensure the keyword processor is registered.
|
|
125
|
+
///
|
|
126
|
+
/// This function is called automatically when needed.
|
|
127
|
+
/// It's safe to call multiple times - registration only happens once.
|
|
128
|
+
pub fn ensure_initialized() -> Result<()> {
|
|
129
|
+
PROCESSOR_INITIALIZED
|
|
130
|
+
.as_ref()
|
|
131
|
+
.map(|_| ())
|
|
132
|
+
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
133
|
+
message: format!("Failed to register keyword processor: {}", e),
|
|
134
|
+
plugin_name: "keyword-extraction".to_string(),
|
|
135
|
+
})
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/// Register the keyword extraction processor with the global registry.
|
|
139
|
+
///
|
|
140
|
+
/// This function should be called once at application startup to register
|
|
141
|
+
/// the keyword extraction post-processor.
|
|
142
|
+
///
|
|
143
|
+
/// **Note:** This is called automatically on first use.
|
|
144
|
+
/// Explicit calling is optional.
|
|
145
|
+
///
|
|
146
|
+
/// # Example
|
|
147
|
+
///
|
|
148
|
+
/// ```rust
|
|
149
|
+
/// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
150
|
+
/// use kreuzberg::keywords::register_keyword_processor;
|
|
151
|
+
///
|
|
152
|
+
/// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
153
|
+
/// # fn main() -> kreuzberg::Result<()> {
|
|
154
|
+
/// register_keyword_processor()?;
|
|
155
|
+
/// # Ok(())
|
|
156
|
+
/// # }
|
|
157
|
+
/// # #[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
|
|
158
|
+
/// # fn main() {}
|
|
159
|
+
/// ```
|
|
160
|
+
pub fn register_keyword_processor() -> Result<()> {
|
|
161
|
+
let registry = get_post_processor_registry();
|
|
162
|
+
let mut registry = registry
|
|
163
|
+
.write()
|
|
164
|
+
.map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
|
|
165
|
+
|
|
166
|
+
registry.register(Arc::new(KeywordExtractor), 50)?;
|
|
167
|
+
|
|
168
|
+
Ok(())
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration yields a non-empty result that respects `max_keywords`.
    #[test]
    fn test_extract_keywords_default_algorithm() {
        let config = KeywordConfig::default();
        let sample = "Rust programming language provides memory safety and performance.";

        let extracted = extract_keywords(sample, &config).unwrap();

        assert!(!extracted.is_empty(), "Should extract keywords");
        assert!(extracted.len() <= config.max_keywords);
    }

    /// Keywords produced through the YAKE configuration are tagged as YAKE.
    #[cfg(feature = "keywords-yake")]
    #[test]
    fn test_extract_keywords_yake() {
        let config = KeywordConfig::yake();
        let sample = "Natural language processing using Rust is efficient and safe.";

        let extracted = extract_keywords(sample, &config).unwrap();

        assert!(!extracted.is_empty());
        assert_eq!(extracted[0].algorithm, KeywordAlgorithm::Yake);
    }

    /// Keywords produced through the RAKE configuration are tagged as RAKE.
    #[cfg(feature = "keywords-rake")]
    #[test]
    fn test_extract_keywords_rake() {
        let config = KeywordConfig::rake();
        let sample = "Natural language processing using Rust is efficient and safe.";

        let extracted = extract_keywords(sample, &config).unwrap();

        assert!(!extracted.is_empty());
        assert_eq!(extracted[0].algorithm, KeywordAlgorithm::Rake);
    }

    /// Both backends handle the same input and tag every keyword with their own algorithm.
    #[cfg(all(feature = "keywords-yake", feature = "keywords-rake"))]
    #[test]
    fn test_compare_algorithms() {
        let text = "Machine learning and artificial intelligence are transforming technology. \
                    Deep learning models require substantial computational resources.";

        let yake_keywords = extract_keywords(text, &KeywordConfig::yake().with_max_keywords(5)).unwrap();
        let rake_keywords = extract_keywords(text, &KeywordConfig::rake().with_max_keywords(5)).unwrap();

        assert!(!yake_keywords.is_empty());
        assert!(!rake_keywords.is_empty());

        assert!(yake_keywords.iter().all(|kw| kw.algorithm == KeywordAlgorithm::Yake));
        assert!(rake_keywords.iter().all(|kw| kw.algorithm == KeywordAlgorithm::Rake));

        // Printed for manual inspection when running with `--nocapture`.
        let yake_texts: Vec<_> = yake_keywords.iter().map(|kw| &kw.text).collect();
        println!("YAKE keywords: {:?}", yake_texts);

        let rake_texts: Vec<_> = rake_keywords.iter().map(|kw| &kw.text).collect();
        println!("RAKE keywords: {:?}", rake_texts);
    }
}
|