kreuzberg 4.0.0.rc1 → 4.0.0.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -8
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -534
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -9
- data/Gemfile.lock +9 -109
- data/README.md +426 -421
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -340
- data/ext/kreuzberg_rb/extconf.rb +45 -35
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -17
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +148 -105
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +46 -45
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +691 -684
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -50
- data/lib/kreuzberg/extraction_api.rb +85 -84
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +216 -216
- data/lib/kreuzberg/setup_lib_path.rb +80 -79
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +103 -82
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +520 -468
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -87
- data/spec/binding/cli_spec.rb +55 -54
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -42
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/kreuzberg/Cargo.toml +204 -134
- data/vendor/kreuzberg/README.md +175 -175
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -460
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
- data/vendor/kreuzberg/src/core/config.rs +1032 -1032
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
- data/vendor/kreuzberg/src/core/io.rs +329 -327
- data/vendor/kreuzberg/src/core/mime.rs +605 -615
- data/vendor/kreuzberg/src/core/mod.rs +45 -42
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
- data/vendor/kreuzberg/src/embeddings.rs +432 -323
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +553 -553
- data/vendor/kreuzberg/src/extraction/image.rs +368 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
- data/vendor/kreuzberg/src/extractors/email.rs +143 -129
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -410
- data/vendor/kreuzberg/src/extractors/image.rs +198 -195
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
- data/vendor/kreuzberg/src/extractors/text.rs +260 -242
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
- data/vendor/kreuzberg/src/lib.rs +105 -102
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -122
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +393 -420
- data/vendor/kreuzberg/src/pdf/text.rs +158 -161
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +19 -19
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +903 -873
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
- data/vendor/kreuzberg/tests/config_features.rs +598 -580
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
- data/vendor/kreuzberg/tests/core_integration.rs +510 -493
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -404
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +90 -95
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/spec/examples.txt +0 -104
- data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
- data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
- data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
|
@@ -1,89 +1,89 @@
|
|
|
1
|
-
use crate::error::{KreuzbergError, Result};
|
|
2
|
-
use fast_image_resize::{FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer, images::Image as FirImage};
|
|
3
|
-
use image::{DynamicImage, ImageBuffer, Rgb};
|
|
4
|
-
|
|
5
|
-
/// Resize an image using fast_image_resize with appropriate algorithm based on scale factor
|
|
6
|
-
pub fn resize_image(image: &DynamicImage, new_width: u32, new_height: u32, scale_factor: f64) -> Result<DynamicImage> {
|
|
7
|
-
let rgb_image = image.to_rgb8();
|
|
8
|
-
let (width, height) = rgb_image.dimensions();
|
|
9
|
-
|
|
10
|
-
let src_image = FirImage::from_vec_u8(width, height, rgb_image.into_raw(), PixelType::U8x3)
|
|
11
|
-
.map_err(|e| KreuzbergError::parsing(format!("Failed to create source image: {e:?}")))?;
|
|
12
|
-
|
|
13
|
-
let mut dst_image = FirImage::new(new_width, new_height, PixelType::U8x3);
|
|
14
|
-
|
|
15
|
-
let algorithm = if scale_factor < 1.0 {
|
|
16
|
-
ResizeAlg::Convolution(FilterType::Lanczos3)
|
|
17
|
-
} else {
|
|
18
|
-
ResizeAlg::Convolution(FilterType::CatmullRom)
|
|
19
|
-
};
|
|
20
|
-
|
|
21
|
-
let mut resizer = Resizer::new();
|
|
22
|
-
resizer
|
|
23
|
-
.resize(&src_image, &mut dst_image, &ResizeOptions::new().resize_alg(algorithm))
|
|
24
|
-
.map_err(|e| KreuzbergError::parsing(format!("Resize failed: {e:?}")))?;
|
|
25
|
-
|
|
26
|
-
let buffer = dst_image.into_vec();
|
|
27
|
-
let img_buffer = ImageBuffer::<Rgb<u8>, Vec<u8>>::from_raw(new_width, new_height, buffer)
|
|
28
|
-
.ok_or_else(|| KreuzbergError::parsing("Failed to create image buffer".to_string()))?;
|
|
29
|
-
|
|
30
|
-
Ok(DynamicImage::ImageRgb8(img_buffer))
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
#[cfg(test)]
|
|
34
|
-
mod tests {
|
|
35
|
-
use super::*;
|
|
36
|
-
use image::Rgb;
|
|
37
|
-
|
|
38
|
-
fn create_test_image() -> DynamicImage {
|
|
39
|
-
let mut img = ImageBuffer::new(100, 100);
|
|
40
|
-
for y in 0..100 {
|
|
41
|
-
for x in 0..100 {
|
|
42
|
-
img.put_pixel(x, y, Rgb([255u8, 0u8, 0u8]));
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
DynamicImage::ImageRgb8(img)
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
#[test]
|
|
49
|
-
fn test_resize_image_downscale() {
|
|
50
|
-
let img = create_test_image();
|
|
51
|
-
let result = resize_image(&img, 50, 50, 0.5);
|
|
52
|
-
assert!(result.is_ok());
|
|
53
|
-
let resized = result.unwrap();
|
|
54
|
-
assert_eq!(resized.width(), 50);
|
|
55
|
-
assert_eq!(resized.height(), 50);
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
#[test]
|
|
59
|
-
fn test_resize_image_upscale() {
|
|
60
|
-
let img = create_test_image();
|
|
61
|
-
let result = resize_image(&img, 200, 200, 2.0);
|
|
62
|
-
assert!(result.is_ok());
|
|
63
|
-
let resized = result.unwrap();
|
|
64
|
-
assert_eq!(resized.width(), 200);
|
|
65
|
-
assert_eq!(resized.height(), 200);
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
#[test]
|
|
69
|
-
fn test_resize_image_no_scale() {
|
|
70
|
-
let img = create_test_image();
|
|
71
|
-
let result = resize_image(&img, 100, 100, 1.0);
|
|
72
|
-
assert!(result.is_ok());
|
|
73
|
-
let resized = result.unwrap();
|
|
74
|
-
assert_eq!(resized.width(), 100);
|
|
75
|
-
assert_eq!(resized.height(), 100);
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
#[test]
|
|
79
|
-
fn test_resize_preserves_aspect_ratio() {
|
|
80
|
-
let img = create_test_image();
|
|
81
|
-
let result = resize_image(&img, 50, 50, 0.5);
|
|
82
|
-
assert!(result.is_ok());
|
|
83
|
-
let resized = result.unwrap();
|
|
84
|
-
|
|
85
|
-
let original_aspect = img.width() as f64 / img.height() as f64;
|
|
86
|
-
let resized_aspect = resized.width() as f64 / resized.height() as f64;
|
|
87
|
-
assert!((original_aspect - resized_aspect).abs() < 0.01);
|
|
88
|
-
}
|
|
89
|
-
}
|
|
1
|
+
use crate::error::{KreuzbergError, Result};
|
|
2
|
+
use fast_image_resize::{FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer, images::Image as FirImage};
|
|
3
|
+
use image::{DynamicImage, ImageBuffer, Rgb};
|
|
4
|
+
|
|
5
|
+
/// Resize an image using fast_image_resize with appropriate algorithm based on scale factor
|
|
6
|
+
pub fn resize_image(image: &DynamicImage, new_width: u32, new_height: u32, scale_factor: f64) -> Result<DynamicImage> {
|
|
7
|
+
let rgb_image = image.to_rgb8();
|
|
8
|
+
let (width, height) = rgb_image.dimensions();
|
|
9
|
+
|
|
10
|
+
let src_image = FirImage::from_vec_u8(width, height, rgb_image.into_raw(), PixelType::U8x3)
|
|
11
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to create source image: {e:?}")))?;
|
|
12
|
+
|
|
13
|
+
let mut dst_image = FirImage::new(new_width, new_height, PixelType::U8x3);
|
|
14
|
+
|
|
15
|
+
let algorithm = if scale_factor < 1.0 {
|
|
16
|
+
ResizeAlg::Convolution(FilterType::Lanczos3)
|
|
17
|
+
} else {
|
|
18
|
+
ResizeAlg::Convolution(FilterType::CatmullRom)
|
|
19
|
+
};
|
|
20
|
+
|
|
21
|
+
let mut resizer = Resizer::new();
|
|
22
|
+
resizer
|
|
23
|
+
.resize(&src_image, &mut dst_image, &ResizeOptions::new().resize_alg(algorithm))
|
|
24
|
+
.map_err(|e| KreuzbergError::parsing(format!("Resize failed: {e:?}")))?;
|
|
25
|
+
|
|
26
|
+
let buffer = dst_image.into_vec();
|
|
27
|
+
let img_buffer = ImageBuffer::<Rgb<u8>, Vec<u8>>::from_raw(new_width, new_height, buffer)
|
|
28
|
+
.ok_or_else(|| KreuzbergError::parsing("Failed to create image buffer".to_string()))?;
|
|
29
|
+
|
|
30
|
+
Ok(DynamicImage::ImageRgb8(img_buffer))
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
#[cfg(test)]
|
|
34
|
+
mod tests {
|
|
35
|
+
use super::*;
|
|
36
|
+
use image::Rgb;
|
|
37
|
+
|
|
38
|
+
fn create_test_image() -> DynamicImage {
|
|
39
|
+
let mut img = ImageBuffer::new(100, 100);
|
|
40
|
+
for y in 0..100 {
|
|
41
|
+
for x in 0..100 {
|
|
42
|
+
img.put_pixel(x, y, Rgb([255u8, 0u8, 0u8]));
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
DynamicImage::ImageRgb8(img)
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
#[test]
|
|
49
|
+
fn test_resize_image_downscale() {
|
|
50
|
+
let img = create_test_image();
|
|
51
|
+
let result = resize_image(&img, 50, 50, 0.5);
|
|
52
|
+
assert!(result.is_ok());
|
|
53
|
+
let resized = result.unwrap();
|
|
54
|
+
assert_eq!(resized.width(), 50);
|
|
55
|
+
assert_eq!(resized.height(), 50);
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
#[test]
|
|
59
|
+
fn test_resize_image_upscale() {
|
|
60
|
+
let img = create_test_image();
|
|
61
|
+
let result = resize_image(&img, 200, 200, 2.0);
|
|
62
|
+
assert!(result.is_ok());
|
|
63
|
+
let resized = result.unwrap();
|
|
64
|
+
assert_eq!(resized.width(), 200);
|
|
65
|
+
assert_eq!(resized.height(), 200);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
#[test]
|
|
69
|
+
fn test_resize_image_no_scale() {
|
|
70
|
+
let img = create_test_image();
|
|
71
|
+
let result = resize_image(&img, 100, 100, 1.0);
|
|
72
|
+
assert!(result.is_ok());
|
|
73
|
+
let resized = result.unwrap();
|
|
74
|
+
assert_eq!(resized.width(), 100);
|
|
75
|
+
assert_eq!(resized.height(), 100);
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
#[test]
|
|
79
|
+
fn test_resize_preserves_aspect_ratio() {
|
|
80
|
+
let img = create_test_image();
|
|
81
|
+
let result = resize_image(&img, 50, 50, 0.5);
|
|
82
|
+
assert!(result.is_ok());
|
|
83
|
+
let resized = result.unwrap();
|
|
84
|
+
|
|
85
|
+
let original_aspect = img.width() as f64 / img.height() as f64;
|
|
86
|
+
let resized_aspect = resized.width() as f64 / resized.height() as f64;
|
|
87
|
+
assert!((original_aspect - resized_aspect).abs() < 0.01);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
@@ -1,154 +1,154 @@
|
|
|
1
|
-
//! Configuration for keyword extraction.
|
|
2
|
-
|
|
3
|
-
use super::types::KeywordAlgorithm;
|
|
4
|
-
use serde::{Deserialize, Serialize};
|
|
5
|
-
|
|
6
|
-
/// YAKE-specific parameters.
|
|
7
|
-
#[cfg(feature = "keywords-yake")]
|
|
8
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
9
|
-
pub struct YakeParams {
|
|
10
|
-
/// Window size for co-occurrence analysis (default: 2).
|
|
11
|
-
///
|
|
12
|
-
/// Controls the context window for computing co-occurrence statistics.
|
|
13
|
-
pub window_size: usize,
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
#[cfg(feature = "keywords-yake")]
|
|
17
|
-
impl Default for YakeParams {
|
|
18
|
-
fn default() -> Self {
|
|
19
|
-
Self { window_size: 2 }
|
|
20
|
-
}
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
/// RAKE-specific parameters.
|
|
24
|
-
#[cfg(feature = "keywords-rake")]
|
|
25
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
26
|
-
pub struct RakeParams {
|
|
27
|
-
/// Minimum word length to consider (default: 1).
|
|
28
|
-
pub min_word_length: usize,
|
|
29
|
-
|
|
30
|
-
/// Maximum words in a keyword phrase (default: 3).
|
|
31
|
-
pub max_words_per_phrase: usize,
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
#[cfg(feature = "keywords-rake")]
|
|
35
|
-
impl Default for RakeParams {
|
|
36
|
-
fn default() -> Self {
|
|
37
|
-
Self {
|
|
38
|
-
min_word_length: 1,
|
|
39
|
-
max_words_per_phrase: 3,
|
|
40
|
-
}
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
/// Keyword extraction configuration.
|
|
45
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
46
|
-
pub struct KeywordConfig {
|
|
47
|
-
/// Algorithm to use for extraction.
|
|
48
|
-
pub algorithm: KeywordAlgorithm,
|
|
49
|
-
|
|
50
|
-
/// Maximum number of keywords to extract (default: 10).
|
|
51
|
-
pub max_keywords: usize,
|
|
52
|
-
|
|
53
|
-
/// Minimum score threshold (0.0-1.0, default: 0.0).
|
|
54
|
-
///
|
|
55
|
-
/// Keywords with scores below this threshold are filtered out.
|
|
56
|
-
/// Note: Score ranges differ between algorithms.
|
|
57
|
-
pub min_score: f32,
|
|
58
|
-
|
|
59
|
-
/// N-gram range for keyword extraction (min, max).
|
|
60
|
-
///
|
|
61
|
-
/// (1, 1) = unigrams only
|
|
62
|
-
/// (1, 2) = unigrams and bigrams
|
|
63
|
-
/// (1, 3) = unigrams, bigrams, and trigrams (default)
|
|
64
|
-
pub ngram_range: (usize, usize),
|
|
65
|
-
|
|
66
|
-
/// Language code for stopword filtering (e.g., "en", "de", "fr").
|
|
67
|
-
///
|
|
68
|
-
/// If None, no stopword filtering is applied.
|
|
69
|
-
pub language: Option<String>,
|
|
70
|
-
|
|
71
|
-
/// YAKE-specific tuning parameters.
|
|
72
|
-
#[cfg(feature = "keywords-yake")]
|
|
73
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
74
|
-
pub yake_params: Option<YakeParams>,
|
|
75
|
-
|
|
76
|
-
/// RAKE-specific tuning parameters.
|
|
77
|
-
#[cfg(feature = "keywords-rake")]
|
|
78
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
79
|
-
pub rake_params: Option<RakeParams>,
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
impl Default for KeywordConfig {
|
|
83
|
-
fn default() -> Self {
|
|
84
|
-
Self {
|
|
85
|
-
algorithm: KeywordAlgorithm::default(),
|
|
86
|
-
max_keywords: 10,
|
|
87
|
-
min_score: 0.0,
|
|
88
|
-
ngram_range: (1, 3),
|
|
89
|
-
language: Some("en".to_string()),
|
|
90
|
-
#[cfg(feature = "keywords-yake")]
|
|
91
|
-
yake_params: None,
|
|
92
|
-
#[cfg(feature = "keywords-rake")]
|
|
93
|
-
rake_params: None,
|
|
94
|
-
}
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
impl KeywordConfig {
|
|
99
|
-
/// Create a new configuration with YAKE algorithm.
|
|
100
|
-
#[cfg(feature = "keywords-yake")]
|
|
101
|
-
pub fn yake() -> Self {
|
|
102
|
-
Self {
|
|
103
|
-
algorithm: KeywordAlgorithm::Yake,
|
|
104
|
-
..Default::default()
|
|
105
|
-
}
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
/// Create a new configuration with RAKE algorithm.
|
|
109
|
-
#[cfg(feature = "keywords-rake")]
|
|
110
|
-
pub fn rake() -> Self {
|
|
111
|
-
Self {
|
|
112
|
-
algorithm: KeywordAlgorithm::Rake,
|
|
113
|
-
..Default::default()
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
/// Set maximum number of keywords to extract.
|
|
118
|
-
pub fn with_max_keywords(mut self, max: usize) -> Self {
|
|
119
|
-
self.max_keywords = max;
|
|
120
|
-
self
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
/// Set minimum score threshold.
|
|
124
|
-
pub fn with_min_score(mut self, score: f32) -> Self {
|
|
125
|
-
self.min_score = score;
|
|
126
|
-
self
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
/// Set n-gram range.
|
|
130
|
-
pub fn with_ngram_range(mut self, min: usize, max: usize) -> Self {
|
|
131
|
-
self.ngram_range = (min, max);
|
|
132
|
-
self
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
/// Set language for stopword filtering.
|
|
136
|
-
pub fn with_language(mut self, lang: impl Into<String>) -> Self {
|
|
137
|
-
self.language = Some(lang.into());
|
|
138
|
-
self
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
/// Set YAKE-specific parameters.
|
|
142
|
-
#[cfg(feature = "keywords-yake")]
|
|
143
|
-
pub fn with_yake_params(mut self, params: YakeParams) -> Self {
|
|
144
|
-
self.yake_params = Some(params);
|
|
145
|
-
self
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
/// Set RAKE-specific parameters.
|
|
149
|
-
#[cfg(feature = "keywords-rake")]
|
|
150
|
-
pub fn with_rake_params(mut self, params: RakeParams) -> Self {
|
|
151
|
-
self.rake_params = Some(params);
|
|
152
|
-
self
|
|
153
|
-
}
|
|
154
|
-
}
|
|
1
|
+
//! Configuration for keyword extraction.
|
|
2
|
+
|
|
3
|
+
use super::types::KeywordAlgorithm;
|
|
4
|
+
use serde::{Deserialize, Serialize};
|
|
5
|
+
|
|
6
|
+
/// Tuning knobs that apply only to the YAKE algorithm.
#[cfg(feature = "keywords-yake")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct YakeParams {
    /// Size of the context window used when computing co-occurrence
    /// statistics (default: 2).
    pub window_size: usize,
}
|
|
15
|
+
|
|
16
|
+
#[cfg(feature = "keywords-yake")]
impl Default for YakeParams {
    /// Default YAKE parameters: a co-occurrence window of 2.
    fn default() -> Self {
        let window_size = 2;
        Self { window_size }
    }
}
|
|
22
|
+
|
|
23
|
+
/// Tuning knobs that apply only to the RAKE algorithm.
#[cfg(feature = "keywords-rake")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RakeParams {
    /// Shortest word length that is taken into account (default: 1).
    pub min_word_length: usize,

    /// Upper bound on the number of words in one keyword phrase (default: 3).
    pub max_words_per_phrase: usize,
}
|
|
33
|
+
|
|
34
|
+
#[cfg(feature = "keywords-rake")]
impl Default for RakeParams {
    /// Default RAKE parameters: words of length 1 or more, phrases of up to 3 words.
    fn default() -> Self {
        let (min_word_length, max_words_per_phrase) = (1, 3);
        Self {
            min_word_length,
            max_words_per_phrase,
        }
    }
}
|
|
43
|
+
|
|
44
|
+
/// Keyword extraction configuration.
|
|
45
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
46
|
+
pub struct KeywordConfig {
|
|
47
|
+
/// Algorithm to use for extraction.
|
|
48
|
+
pub algorithm: KeywordAlgorithm,
|
|
49
|
+
|
|
50
|
+
/// Maximum number of keywords to extract (default: 10).
|
|
51
|
+
pub max_keywords: usize,
|
|
52
|
+
|
|
53
|
+
/// Minimum score threshold (0.0-1.0, default: 0.0).
|
|
54
|
+
///
|
|
55
|
+
/// Keywords with scores below this threshold are filtered out.
|
|
56
|
+
/// Note: Score ranges differ between algorithms.
|
|
57
|
+
pub min_score: f32,
|
|
58
|
+
|
|
59
|
+
/// N-gram range for keyword extraction (min, max).
|
|
60
|
+
///
|
|
61
|
+
/// (1, 1) = unigrams only
|
|
62
|
+
/// (1, 2) = unigrams and bigrams
|
|
63
|
+
/// (1, 3) = unigrams, bigrams, and trigrams (default)
|
|
64
|
+
pub ngram_range: (usize, usize),
|
|
65
|
+
|
|
66
|
+
/// Language code for stopword filtering (e.g., "en", "de", "fr").
|
|
67
|
+
///
|
|
68
|
+
/// If None, no stopword filtering is applied.
|
|
69
|
+
pub language: Option<String>,
|
|
70
|
+
|
|
71
|
+
/// YAKE-specific tuning parameters.
|
|
72
|
+
#[cfg(feature = "keywords-yake")]
|
|
73
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
74
|
+
pub yake_params: Option<YakeParams>,
|
|
75
|
+
|
|
76
|
+
/// RAKE-specific tuning parameters.
|
|
77
|
+
#[cfg(feature = "keywords-rake")]
|
|
78
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
79
|
+
pub rake_params: Option<RakeParams>,
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
impl Default for KeywordConfig {
|
|
83
|
+
fn default() -> Self {
|
|
84
|
+
Self {
|
|
85
|
+
algorithm: KeywordAlgorithm::default(),
|
|
86
|
+
max_keywords: 10,
|
|
87
|
+
min_score: 0.0,
|
|
88
|
+
ngram_range: (1, 3),
|
|
89
|
+
language: Some("en".to_string()),
|
|
90
|
+
#[cfg(feature = "keywords-yake")]
|
|
91
|
+
yake_params: None,
|
|
92
|
+
#[cfg(feature = "keywords-rake")]
|
|
93
|
+
rake_params: None,
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
impl KeywordConfig {
|
|
99
|
+
/// Create a new configuration with YAKE algorithm.
|
|
100
|
+
#[cfg(feature = "keywords-yake")]
|
|
101
|
+
pub fn yake() -> Self {
|
|
102
|
+
Self {
|
|
103
|
+
algorithm: KeywordAlgorithm::Yake,
|
|
104
|
+
..Default::default()
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/// Create a new configuration with RAKE algorithm.
|
|
109
|
+
#[cfg(feature = "keywords-rake")]
|
|
110
|
+
pub fn rake() -> Self {
|
|
111
|
+
Self {
|
|
112
|
+
algorithm: KeywordAlgorithm::Rake,
|
|
113
|
+
..Default::default()
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
/// Set maximum number of keywords to extract.
|
|
118
|
+
pub fn with_max_keywords(mut self, max: usize) -> Self {
|
|
119
|
+
self.max_keywords = max;
|
|
120
|
+
self
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/// Set minimum score threshold.
|
|
124
|
+
pub fn with_min_score(mut self, score: f32) -> Self {
|
|
125
|
+
self.min_score = score;
|
|
126
|
+
self
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/// Set n-gram range.
|
|
130
|
+
pub fn with_ngram_range(mut self, min: usize, max: usize) -> Self {
|
|
131
|
+
self.ngram_range = (min, max);
|
|
132
|
+
self
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/// Set language for stopword filtering.
|
|
136
|
+
pub fn with_language(mut self, lang: impl Into<String>) -> Self {
|
|
137
|
+
self.language = Some(lang.into());
|
|
138
|
+
self
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/// Set YAKE-specific parameters.
|
|
142
|
+
#[cfg(feature = "keywords-yake")]
|
|
143
|
+
pub fn with_yake_params(mut self, params: YakeParams) -> Self {
|
|
144
|
+
self.yake_params = Some(params);
|
|
145
|
+
self
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/// Set RAKE-specific parameters.
|
|
149
|
+
#[cfg(feature = "keywords-rake")]
|
|
150
|
+
pub fn with_rake_params(mut self, params: RakeParams) -> Self {
|
|
151
|
+
self.rake_params = Some(params);
|
|
152
|
+
self
|
|
153
|
+
}
|
|
154
|
+
}
|