kreuzberg 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +534 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +157 -0
- data/README.md +421 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +340 -0
- data/ext/kreuzberg_rb/extconf.rb +35 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +17 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +105 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +45 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +684 -0
- data/lib/kreuzberg/errors.rb +50 -0
- data/lib/kreuzberg/extraction_api.rb +84 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +79 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +82 -0
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +468 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +87 -0
- data/spec/binding/cli_spec.rb +54 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +42 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +134 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/build.rs +460 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +903 -0
- data/vendor/kreuzberg/src/core/io.rs +327 -0
- data/vendor/kreuzberg/src/core/mime.rs +615 -0
- data/vendor/kreuzberg/src/core/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
- data/vendor/kreuzberg/src/embeddings.rs +323 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
- data/vendor/kreuzberg/src/extractors/email.rs +129 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
- data/vendor/kreuzberg/src/extractors/html.rs +410 -0
- data/vendor/kreuzberg/src/extractors/image.rs +195 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
- data/vendor/kreuzberg/src/extractors/text.rs +242 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +102 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +420 -0
- data/vendor/kreuzberg/src/pdf/text.rs +161 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +873 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
- data/vendor/kreuzberg/tests/config_features.rs +580 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
- data/vendor/kreuzberg/tests/core_integration.rs +493 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
- data/vendor/kreuzberg/tests/security_validation.rs +404 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- metadata +471 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
use crate::error::{KreuzbergError, Result};
|
|
2
|
+
use fast_image_resize::{FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer, images::Image as FirImage};
|
|
3
|
+
use image::{DynamicImage, ImageBuffer, Rgb};
|
|
4
|
+
|
|
5
|
+
/// Resize an image using fast_image_resize with appropriate algorithm based on scale factor
|
|
6
|
+
pub fn resize_image(image: &DynamicImage, new_width: u32, new_height: u32, scale_factor: f64) -> Result<DynamicImage> {
|
|
7
|
+
let rgb_image = image.to_rgb8();
|
|
8
|
+
let (width, height) = rgb_image.dimensions();
|
|
9
|
+
|
|
10
|
+
let src_image = FirImage::from_vec_u8(width, height, rgb_image.into_raw(), PixelType::U8x3)
|
|
11
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to create source image: {e:?}")))?;
|
|
12
|
+
|
|
13
|
+
let mut dst_image = FirImage::new(new_width, new_height, PixelType::U8x3);
|
|
14
|
+
|
|
15
|
+
let algorithm = if scale_factor < 1.0 {
|
|
16
|
+
ResizeAlg::Convolution(FilterType::Lanczos3)
|
|
17
|
+
} else {
|
|
18
|
+
ResizeAlg::Convolution(FilterType::CatmullRom)
|
|
19
|
+
};
|
|
20
|
+
|
|
21
|
+
let mut resizer = Resizer::new();
|
|
22
|
+
resizer
|
|
23
|
+
.resize(&src_image, &mut dst_image, &ResizeOptions::new().resize_alg(algorithm))
|
|
24
|
+
.map_err(|e| KreuzbergError::parsing(format!("Resize failed: {e:?}")))?;
|
|
25
|
+
|
|
26
|
+
let buffer = dst_image.into_vec();
|
|
27
|
+
let img_buffer = ImageBuffer::<Rgb<u8>, Vec<u8>>::from_raw(new_width, new_height, buffer)
|
|
28
|
+
.ok_or_else(|| KreuzbergError::parsing("Failed to create image buffer".to_string()))?;
|
|
29
|
+
|
|
30
|
+
Ok(DynamicImage::ImageRgb8(img_buffer))
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use image::Rgb;

    /// Builds the shared fixture: a 100x100 RGB image filled with pure red.
    fn create_test_image() -> DynamicImage {
        let red = Rgb([255u8, 0u8, 0u8]);
        DynamicImage::ImageRgb8(ImageBuffer::from_pixel(100, 100, red))
    }

    /// Resizes the fixture and checks the output has exactly the requested dimensions.
    fn assert_resizes_to(width: u32, height: u32, scale_factor: f64) {
        let source = create_test_image();
        let outcome = resize_image(&source, width, height, scale_factor);
        assert!(outcome.is_ok());
        let resized = outcome.unwrap();
        assert_eq!(resized.width(), width);
        assert_eq!(resized.height(), height);
    }

    #[test]
    fn test_resize_image_downscale() {
        assert_resizes_to(50, 50, 0.5);
    }

    #[test]
    fn test_resize_image_upscale() {
        assert_resizes_to(200, 200, 2.0);
    }

    #[test]
    fn test_resize_image_no_scale() {
        assert_resizes_to(100, 100, 1.0);
    }

    #[test]
    fn test_resize_preserves_aspect_ratio() {
        let source = create_test_image();
        let outcome = resize_image(&source, 50, 50, 0.5);
        assert!(outcome.is_ok());
        let resized = outcome.unwrap();

        // Width / height ratio before and after must agree to within 0.01.
        let ratio_of = |img: &DynamicImage| img.width() as f64 / img.height() as f64;
        let drift = (ratio_of(&source) - ratio_of(&resized)).abs();
        assert!(drift < 0.01);
    }
}
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
//! Configuration for keyword extraction.
|
|
2
|
+
|
|
3
|
+
use super::types::KeywordAlgorithm;
|
|
4
|
+
use serde::{Deserialize, Serialize};
|
|
5
|
+
|
|
6
|
+
/// YAKE-specific parameters.
///
/// Only compiled when the `keywords-yake` feature is enabled. Attached to a
/// `KeywordConfig` through its optional `yake_params` field.
#[cfg(feature = "keywords-yake")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct YakeParams {
    /// Window size for co-occurrence analysis (default: 2).
    ///
    /// Controls the context window for computing co-occurrence statistics.
    pub window_size: usize,
}
|
|
15
|
+
|
|
16
|
+
#[cfg(feature = "keywords-yake")]
impl Default for YakeParams {
    /// Returns the default YAKE parameters: a co-occurrence window of 2.
    fn default() -> Self {
        let window_size = 2;
        Self { window_size }
    }
}
|
|
22
|
+
|
|
23
|
+
/// RAKE-specific parameters.
///
/// Only compiled when the `keywords-rake` feature is enabled. Attached to a
/// `KeywordConfig` through its optional `rake_params` field.
#[cfg(feature = "keywords-rake")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RakeParams {
    /// Minimum word length to consider (default: 1).
    pub min_word_length: usize,

    /// Maximum words in a keyword phrase (default: 3).
    pub max_words_per_phrase: usize,
}
|
|
33
|
+
|
|
34
|
+
#[cfg(feature = "keywords-rake")]
impl Default for RakeParams {
    /// Returns the default RAKE parameters: words of at least 1 character and
    /// phrases of at most 3 words.
    fn default() -> Self {
        let (min_word_length, max_words_per_phrase) = (1, 3);
        Self {
            min_word_length,
            max_words_per_phrase,
        }
    }
}
|
|
43
|
+
|
|
44
|
+
/// Keyword extraction configuration.
///
/// Selects the algorithm and the limits applied to its output. Construct it
/// with `KeywordConfig::default()` (or the feature-gated `yake()` / `rake()`
/// constructors) and refine it with the `with_*` builder methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KeywordConfig {
    /// Algorithm to use for extraction.
    pub algorithm: KeywordAlgorithm,

    /// Maximum number of keywords to extract (default: 10).
    pub max_keywords: usize,

    /// Minimum score threshold (0.0-1.0, default: 0.0).
    ///
    /// Keywords with scores below this threshold are filtered out.
    /// Note: Score ranges differ between algorithms.
    pub min_score: f32,

    /// N-gram range for keyword extraction (min, max).
    ///
    /// (1, 1) = unigrams only
    /// (1, 2) = unigrams and bigrams
    /// (1, 3) = unigrams, bigrams, and trigrams (default)
    pub ngram_range: (usize, usize),

    /// Language code for stopword filtering (e.g., "en", "de", "fr").
    ///
    /// The default configuration uses "en".
    /// If None, no stopword filtering is applied.
    pub language: Option<String>,

    /// YAKE-specific tuning parameters.
    ///
    /// Present only with the `keywords-yake` feature; omitted from serialized
    /// output when `None`.
    #[cfg(feature = "keywords-yake")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub yake_params: Option<YakeParams>,

    /// RAKE-specific tuning parameters.
    ///
    /// Present only with the `keywords-rake` feature; omitted from serialized
    /// output when `None`.
    #[cfg(feature = "keywords-rake")]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rake_params: Option<RakeParams>,
}
|
|
81
|
+
|
|
82
|
+
impl Default for KeywordConfig {
|
|
83
|
+
fn default() -> Self {
|
|
84
|
+
Self {
|
|
85
|
+
algorithm: KeywordAlgorithm::default(),
|
|
86
|
+
max_keywords: 10,
|
|
87
|
+
min_score: 0.0,
|
|
88
|
+
ngram_range: (1, 3),
|
|
89
|
+
language: Some("en".to_string()),
|
|
90
|
+
#[cfg(feature = "keywords-yake")]
|
|
91
|
+
yake_params: None,
|
|
92
|
+
#[cfg(feature = "keywords-rake")]
|
|
93
|
+
rake_params: None,
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
impl KeywordConfig {
|
|
99
|
+
/// Create a new configuration with YAKE algorithm.
|
|
100
|
+
#[cfg(feature = "keywords-yake")]
|
|
101
|
+
pub fn yake() -> Self {
|
|
102
|
+
Self {
|
|
103
|
+
algorithm: KeywordAlgorithm::Yake,
|
|
104
|
+
..Default::default()
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/// Create a new configuration with RAKE algorithm.
|
|
109
|
+
#[cfg(feature = "keywords-rake")]
|
|
110
|
+
pub fn rake() -> Self {
|
|
111
|
+
Self {
|
|
112
|
+
algorithm: KeywordAlgorithm::Rake,
|
|
113
|
+
..Default::default()
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
/// Set maximum number of keywords to extract.
|
|
118
|
+
pub fn with_max_keywords(mut self, max: usize) -> Self {
|
|
119
|
+
self.max_keywords = max;
|
|
120
|
+
self
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/// Set minimum score threshold.
|
|
124
|
+
pub fn with_min_score(mut self, score: f32) -> Self {
|
|
125
|
+
self.min_score = score;
|
|
126
|
+
self
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
/// Set n-gram range.
|
|
130
|
+
pub fn with_ngram_range(mut self, min: usize, max: usize) -> Self {
|
|
131
|
+
self.ngram_range = (min, max);
|
|
132
|
+
self
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/// Set language for stopword filtering.
|
|
136
|
+
pub fn with_language(mut self, lang: impl Into<String>) -> Self {
|
|
137
|
+
self.language = Some(lang.into());
|
|
138
|
+
self
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/// Set YAKE-specific parameters.
|
|
142
|
+
#[cfg(feature = "keywords-yake")]
|
|
143
|
+
pub fn with_yake_params(mut self, params: YakeParams) -> Self {
|
|
144
|
+
self.yake_params = Some(params);
|
|
145
|
+
self
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/// Set RAKE-specific parameters.
|
|
149
|
+
#[cfg(feature = "keywords-rake")]
|
|
150
|
+
pub fn with_rake_params(mut self, params: RakeParams) -> Self {
|
|
151
|
+
self.rake_params = Some(params);
|
|
152
|
+
self
|
|
153
|
+
}
|
|
154
|
+
}
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
//! Keyword extraction module.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides unified keyword extraction interface supporting multiple algorithms:
|
|
4
|
+
//! - YAKE (Yet Another Keyword Extractor) - statistical approach
|
|
5
|
+
//! - RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based
|
|
6
|
+
//!
|
|
7
|
+
//! # Feature Flags
|
|
8
|
+
//!
|
|
9
|
+
//! - `keywords-yake`: Enable YAKE algorithm
|
|
10
|
+
//! - `keywords-rake`: Enable RAKE algorithm
|
|
11
|
+
//! - `keywords`: Enable both algorithms (default in `full` feature)
|
|
12
|
+
//!
|
|
13
|
+
//! # Examples
|
|
14
|
+
//!
|
|
15
|
+
//! ```rust,no_run
|
|
16
|
+
//! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
|
|
17
|
+
//! let text = "Rust is a systems programming language focused on safety and performance.";
|
|
18
|
+
//!
|
|
19
|
+
//! // Use default algorithm (YAKE if available)
|
|
20
|
+
//! let config = KeywordConfig::default();
|
|
21
|
+
//! let keywords = extract_keywords(text, &config).unwrap();
|
|
22
|
+
//!
|
|
23
|
+
//! for keyword in keywords {
|
|
24
|
+
//! println!("{}: {:.3}", keyword.text, keyword.score);
|
|
25
|
+
//! }
|
|
26
|
+
//! ```
|
|
27
|
+
//!
|
|
28
|
+
//! ```rust,no_run
|
|
29
|
+
//! # #[cfg(feature = "keywords-rake")]
|
|
30
|
+
//! # {
|
|
31
|
+
//! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
|
|
32
|
+
//! // Use RAKE algorithm explicitly
|
|
33
|
+
//! let text = "Machine learning models require large datasets.";
|
|
34
|
+
//! let config = KeywordConfig::rake()
|
|
35
|
+
//! .with_max_keywords(5)
|
|
36
|
+
//! .with_min_score(0.3);
|
|
37
|
+
//!
|
|
38
|
+
//! let keywords = extract_keywords(text, &config).unwrap();
|
|
39
|
+
//! # }
|
|
40
|
+
//! ```
|
|
41
|
+
|
|
42
|
+
use crate::Result;
|
|
43
|
+
use crate::plugins::registry::get_post_processor_registry;
|
|
44
|
+
use once_cell::sync::Lazy;
|
|
45
|
+
use std::sync::Arc;
|
|
46
|
+
|
|
47
|
+
pub mod config;
|
|
48
|
+
pub mod processor;
|
|
49
|
+
pub mod types;
|
|
50
|
+
|
|
51
|
+
#[cfg(feature = "keywords-yake")]
|
|
52
|
+
mod yake;
|
|
53
|
+
|
|
54
|
+
#[cfg(feature = "keywords-rake")]
|
|
55
|
+
mod rake;
|
|
56
|
+
|
|
57
|
+
pub use config::KeywordConfig;
|
|
58
|
+
pub use processor::KeywordExtractor;
|
|
59
|
+
|
|
60
|
+
#[cfg(feature = "keywords-rake")]
|
|
61
|
+
pub use config::RakeParams;
|
|
62
|
+
|
|
63
|
+
#[cfg(feature = "keywords-yake")]
|
|
64
|
+
pub use config::YakeParams;
|
|
65
|
+
pub use types::{Keyword, KeywordAlgorithm};
|
|
66
|
+
|
|
67
|
+
/// Extract keywords from text using the specified algorithm.
///
/// This is the unified entry point for keyword extraction. It dispatches on
/// `config.algorithm` to the YAKE or RAKE implementation; each dispatch arm is
/// only compiled when the corresponding Cargo feature is enabled.
///
/// # Arguments
///
/// * `text` - The text to extract keywords from
/// * `config` - Keyword extraction configuration
///
/// # Returns
///
/// A vector of keywords sorted by relevance (highest score first).
/// NOTE(review): the ordering is produced by the per-algorithm extractors,
/// which live in sibling modules — confirm it there.
///
/// # Errors
///
/// Returns an error if:
/// - Neither `keywords-yake` nor `keywords-rake` is enabled (the fallback arm
///   returns `KreuzbergError::Other`)
/// - Keyword extraction fails
///
/// # Examples
///
/// ```rust,no_run
/// # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
/// let text = "Document intelligence with Rust provides memory safety.";
/// let config = KeywordConfig::default()
///     .with_max_keywords(10)
///     .with_language("en");
///
/// let keywords = extract_keywords(text, &config)?;
///
/// for keyword in keywords {
///     println!("{}: {:.3}", keyword.text, keyword.score);
/// }
/// # Ok::<(), kreuzberg::KreuzbergError>(())
/// ```
pub fn extract_keywords(text: &str, config: &KeywordConfig) -> Result<Vec<Keyword>> {
    match config.algorithm {
        #[cfg(feature = "keywords-yake")]
        KeywordAlgorithm::Yake => yake::extract_keywords_yake(text, config),

        #[cfg(feature = "keywords-rake")]
        KeywordAlgorithm::Rake => rake::extract_keywords_rake(text, config),

        // Catch-all exists only in builds with no algorithm feature, so such
        // builds still compile and report the problem at runtime.
        #[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
        _ => Err(crate::KreuzbergError::Other(
            "No keyword extraction algorithm feature enabled".to_string(),
        )),
    }
}
|
|
117
|
+
|
|
118
|
+
/// Lazily evaluated outcome of registering the keyword processor.
///
/// The first access runs `register_keyword_processor` and stores its
/// `Result`; later accesses reuse the stored value, so registration through
/// this static is attempted only once, and a failure is remembered as well.
/// Direct calls to the public `register_keyword_processor` do not go through
/// this static.
static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_keyword_processor);
|
|
123
|
+
|
|
124
|
+
/// Ensure the keyword processor is registered.
|
|
125
|
+
///
|
|
126
|
+
/// This function is called automatically when needed.
|
|
127
|
+
/// It's safe to call multiple times - registration only happens once.
|
|
128
|
+
pub fn ensure_initialized() -> Result<()> {
|
|
129
|
+
PROCESSOR_INITIALIZED
|
|
130
|
+
.as_ref()
|
|
131
|
+
.map(|_| ())
|
|
132
|
+
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
133
|
+
message: format!("Failed to register keyword processor: {}", e),
|
|
134
|
+
plugin_name: "keyword-extraction".to_string(),
|
|
135
|
+
})
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/// Register the keyword extraction processor with the global registry.
|
|
139
|
+
///
|
|
140
|
+
/// This function should be called once at application startup to register
|
|
141
|
+
/// the keyword extraction post-processor.
|
|
142
|
+
///
|
|
143
|
+
/// **Note:** This is called automatically on first use.
|
|
144
|
+
/// Explicit calling is optional.
|
|
145
|
+
///
|
|
146
|
+
/// # Example
|
|
147
|
+
///
|
|
148
|
+
/// ```rust
|
|
149
|
+
/// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
150
|
+
/// use kreuzberg::keywords::register_keyword_processor;
|
|
151
|
+
///
|
|
152
|
+
/// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
153
|
+
/// # fn main() -> kreuzberg::Result<()> {
|
|
154
|
+
/// register_keyword_processor()?;
|
|
155
|
+
/// # Ok(())
|
|
156
|
+
/// # }
|
|
157
|
+
/// # #[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
|
|
158
|
+
/// # fn main() {}
|
|
159
|
+
/// ```
|
|
160
|
+
pub fn register_keyword_processor() -> Result<()> {
|
|
161
|
+
let registry = get_post_processor_registry();
|
|
162
|
+
let mut registry = registry
|
|
163
|
+
.write()
|
|
164
|
+
.map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
|
|
165
|
+
|
|
166
|
+
registry.register(Arc::new(KeywordExtractor), 50)?;
|
|
167
|
+
|
|
168
|
+
Ok(())
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration must yield a non-empty result capped at `max_keywords`.
    #[test]
    fn test_extract_keywords_default_algorithm() {
        let config = KeywordConfig::default();
        let text = "Rust programming language provides memory safety and performance.";

        let extracted = extract_keywords(text, &config).unwrap();

        assert!(!extracted.is_empty(), "Should extract keywords");
        assert!(extracted.len() <= config.max_keywords);
    }

    /// YAKE results must be non-empty and tagged with the YAKE algorithm.
    #[cfg(feature = "keywords-yake")]
    #[test]
    fn test_extract_keywords_yake() {
        let config = KeywordConfig::yake();
        let text = "Natural language processing using Rust is efficient and safe.";

        let extracted = extract_keywords(text, &config).unwrap();

        assert!(!extracted.is_empty());
        assert_eq!(extracted[0].algorithm, KeywordAlgorithm::Yake);
    }

    /// RAKE results must be non-empty and tagged with the RAKE algorithm.
    #[cfg(feature = "keywords-rake")]
    #[test]
    fn test_extract_keywords_rake() {
        let config = KeywordConfig::rake();
        let text = "Natural language processing using Rust is efficient and safe.";

        let extracted = extract_keywords(text, &config).unwrap();

        assert!(!extracted.is_empty());
        assert_eq!(extracted[0].algorithm, KeywordAlgorithm::Rake);
    }

    /// Both algorithms run on the same text; each result set carries only its own tag.
    #[cfg(all(feature = "keywords-yake", feature = "keywords-rake"))]
    #[test]
    fn test_compare_algorithms() {
        let text = "Machine learning and artificial intelligence are transforming technology. \
                    Deep learning models require substantial computational resources.";

        let from_yake = extract_keywords(text, &KeywordConfig::yake().with_max_keywords(5)).unwrap();
        let from_rake = extract_keywords(text, &KeywordConfig::rake().with_max_keywords(5)).unwrap();

        assert!(!from_yake.is_empty());
        assert!(!from_rake.is_empty());

        for keyword in &from_yake {
            assert!(keyword.algorithm == KeywordAlgorithm::Yake);
        }
        for keyword in &from_rake {
            assert!(keyword.algorithm == KeywordAlgorithm::Rake);
        }

        let yake_texts: Vec<_> = from_yake.iter().map(|k| &k.text).collect();
        println!("YAKE keywords: {:?}", yake_texts);

        let rake_texts: Vec<_> = from_rake.iter().map(|k| &k.text).collect();
        println!("RAKE keywords: {:?}", rake_texts);
    }
}
|