kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265):
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,89 @@
1
+ use crate::error::{KreuzbergError, Result};
2
+ use fast_image_resize::{FilterType, PixelType, ResizeAlg, ResizeOptions, Resizer, images::Image as FirImage};
3
+ use image::{DynamicImage, ImageBuffer, Rgb};
4
+
5
+ /// Resize an image using fast_image_resize with appropriate algorithm based on scale factor
6
+ pub fn resize_image(image: &DynamicImage, new_width: u32, new_height: u32, scale_factor: f64) -> Result<DynamicImage> {
7
+ let rgb_image = image.to_rgb8();
8
+ let (width, height) = rgb_image.dimensions();
9
+
10
+ let src_image = FirImage::from_vec_u8(width, height, rgb_image.into_raw(), PixelType::U8x3)
11
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to create source image: {e:?}")))?;
12
+
13
+ let mut dst_image = FirImage::new(new_width, new_height, PixelType::U8x3);
14
+
15
+ let algorithm = if scale_factor < 1.0 {
16
+ ResizeAlg::Convolution(FilterType::Lanczos3)
17
+ } else {
18
+ ResizeAlg::Convolution(FilterType::CatmullRom)
19
+ };
20
+
21
+ let mut resizer = Resizer::new();
22
+ resizer
23
+ .resize(&src_image, &mut dst_image, &ResizeOptions::new().resize_alg(algorithm))
24
+ .map_err(|e| KreuzbergError::parsing(format!("Resize failed: {e:?}")))?;
25
+
26
+ let buffer = dst_image.into_vec();
27
+ let img_buffer = ImageBuffer::<Rgb<u8>, Vec<u8>>::from_raw(new_width, new_height, buffer)
28
+ .ok_or_else(|| KreuzbergError::parsing("Failed to create image buffer".to_string()))?;
29
+
30
+ Ok(DynamicImage::ImageRgb8(img_buffer))
31
+ }
32
+
33
#[cfg(test)]
mod tests {
    use super::*;
    use image::Rgb;

    /// Build the shared fixture: a 100x100 RGB image filled with pure red.
    fn create_test_image() -> DynamicImage {
        DynamicImage::ImageRgb8(ImageBuffer::from_pixel(100, 100, Rgb([255u8, 0u8, 0u8])))
    }

    /// Resize `source` and assert the call succeeds with exactly the requested dimensions.
    fn assert_resizes_to(source: &DynamicImage, width: u32, height: u32, scale: f64) {
        let outcome = resize_image(source, width, height, scale);
        assert!(outcome.is_ok());
        let resized = outcome.unwrap();
        assert_eq!(resized.width(), width);
        assert_eq!(resized.height(), height);
    }

    #[test]
    fn test_resize_image_downscale() {
        assert_resizes_to(&create_test_image(), 50, 50, 0.5);
    }

    #[test]
    fn test_resize_image_upscale() {
        assert_resizes_to(&create_test_image(), 200, 200, 2.0);
    }

    #[test]
    fn test_resize_image_no_scale() {
        assert_resizes_to(&create_test_image(), 100, 100, 1.0);
    }

    #[test]
    fn test_resize_preserves_aspect_ratio() {
        let original = create_test_image();
        let outcome = resize_image(&original, 50, 50, 0.5);
        assert!(outcome.is_ok());
        let resized = outcome.unwrap();

        // Compare width/height ratios before and after the resize.
        let ratio_before = original.width() as f64 / original.height() as f64;
        let ratio_after = resized.width() as f64 / resized.height() as f64;
        assert!((ratio_before - ratio_after).abs() < 0.01);
    }
}
@@ -0,0 +1,154 @@
1
+ //! Configuration for keyword extraction.
2
+
3
+ use super::types::KeywordAlgorithm;
4
+ use serde::{Deserialize, Serialize};
5
+
6
/// Tuning parameters that apply only to the YAKE algorithm.
///
/// Only compiled when the `keywords-yake` feature is enabled.
#[cfg(feature = "keywords-yake")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct YakeParams {
    /// Size of the context window used for co-occurrence analysis (default: 2).
    ///
    /// Controls how much surrounding context is considered when computing
    /// co-occurrence statistics.
    pub window_size: usize,
}
15
+
16
#[cfg(feature = "keywords-yake")]
impl Default for YakeParams {
    /// Default YAKE parameters: a co-occurrence window of 2.
    fn default() -> Self {
        let window_size = 2;
        Self { window_size }
    }
}
22
+
23
/// Tuning parameters that apply only to the RAKE algorithm.
///
/// Only compiled when the `keywords-rake` feature is enabled.
#[cfg(feature = "keywords-rake")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RakeParams {
    /// Shortest word length that is still considered (default: 1).
    pub min_word_length: usize,

    /// Upper bound on the number of words in one keyword phrase (default: 3).
    pub max_words_per_phrase: usize,
}
33
+
34
#[cfg(feature = "keywords-rake")]
impl Default for RakeParams {
    /// Default RAKE parameters: minimum word length 1, at most 3 words per phrase.
    fn default() -> Self {
        let min_word_length = 1;
        let max_words_per_phrase = 3;
        Self {
            min_word_length,
            max_words_per_phrase,
        }
    }
}
43
+
44
+ /// Keyword extraction configuration.
45
+ #[derive(Debug, Clone, Serialize, Deserialize)]
46
+ pub struct KeywordConfig {
47
+ /// Algorithm to use for extraction.
48
+ pub algorithm: KeywordAlgorithm,
49
+
50
+ /// Maximum number of keywords to extract (default: 10).
51
+ pub max_keywords: usize,
52
+
53
+ /// Minimum score threshold (0.0-1.0, default: 0.0).
54
+ ///
55
+ /// Keywords with scores below this threshold are filtered out.
56
+ /// Note: Score ranges differ between algorithms.
57
+ pub min_score: f32,
58
+
59
+ /// N-gram range for keyword extraction (min, max).
60
+ ///
61
+ /// (1, 1) = unigrams only
62
+ /// (1, 2) = unigrams and bigrams
63
+ /// (1, 3) = unigrams, bigrams, and trigrams (default)
64
+ pub ngram_range: (usize, usize),
65
+
66
+ /// Language code for stopword filtering (e.g., "en", "de", "fr").
67
+ ///
68
+ /// If None, no stopword filtering is applied.
69
+ pub language: Option<String>,
70
+
71
+ /// YAKE-specific tuning parameters.
72
+ #[cfg(feature = "keywords-yake")]
73
+ #[serde(skip_serializing_if = "Option::is_none")]
74
+ pub yake_params: Option<YakeParams>,
75
+
76
+ /// RAKE-specific tuning parameters.
77
+ #[cfg(feature = "keywords-rake")]
78
+ #[serde(skip_serializing_if = "Option::is_none")]
79
+ pub rake_params: Option<RakeParams>,
80
+ }
81
+
82
+ impl Default for KeywordConfig {
83
+ fn default() -> Self {
84
+ Self {
85
+ algorithm: KeywordAlgorithm::default(),
86
+ max_keywords: 10,
87
+ min_score: 0.0,
88
+ ngram_range: (1, 3),
89
+ language: Some("en".to_string()),
90
+ #[cfg(feature = "keywords-yake")]
91
+ yake_params: None,
92
+ #[cfg(feature = "keywords-rake")]
93
+ rake_params: None,
94
+ }
95
+ }
96
+ }
97
+
98
+ impl KeywordConfig {
99
+ /// Create a new configuration with YAKE algorithm.
100
+ #[cfg(feature = "keywords-yake")]
101
+ pub fn yake() -> Self {
102
+ Self {
103
+ algorithm: KeywordAlgorithm::Yake,
104
+ ..Default::default()
105
+ }
106
+ }
107
+
108
+ /// Create a new configuration with RAKE algorithm.
109
+ #[cfg(feature = "keywords-rake")]
110
+ pub fn rake() -> Self {
111
+ Self {
112
+ algorithm: KeywordAlgorithm::Rake,
113
+ ..Default::default()
114
+ }
115
+ }
116
+
117
+ /// Set maximum number of keywords to extract.
118
+ pub fn with_max_keywords(mut self, max: usize) -> Self {
119
+ self.max_keywords = max;
120
+ self
121
+ }
122
+
123
+ /// Set minimum score threshold.
124
+ pub fn with_min_score(mut self, score: f32) -> Self {
125
+ self.min_score = score;
126
+ self
127
+ }
128
+
129
+ /// Set n-gram range.
130
+ pub fn with_ngram_range(mut self, min: usize, max: usize) -> Self {
131
+ self.ngram_range = (min, max);
132
+ self
133
+ }
134
+
135
+ /// Set language for stopword filtering.
136
+ pub fn with_language(mut self, lang: impl Into<String>) -> Self {
137
+ self.language = Some(lang.into());
138
+ self
139
+ }
140
+
141
+ /// Set YAKE-specific parameters.
142
+ #[cfg(feature = "keywords-yake")]
143
+ pub fn with_yake_params(mut self, params: YakeParams) -> Self {
144
+ self.yake_params = Some(params);
145
+ self
146
+ }
147
+
148
+ /// Set RAKE-specific parameters.
149
+ #[cfg(feature = "keywords-rake")]
150
+ pub fn with_rake_params(mut self, params: RakeParams) -> Self {
151
+ self.rake_params = Some(params);
152
+ self
153
+ }
154
+ }
@@ -0,0 +1,237 @@
1
+ //! Keyword extraction module.
2
+ //!
3
+ //! Provides unified keyword extraction interface supporting multiple algorithms:
4
+ //! - YAKE (Yet Another Keyword Extractor) - statistical approach
5
+ //! - RAKE (Rapid Automatic Keyword Extraction) - co-occurrence based
6
+ //!
7
+ //! # Feature Flags
8
+ //!
9
+ //! - `keywords-yake`: Enable YAKE algorithm
10
+ //! - `keywords-rake`: Enable RAKE algorithm
11
+ //! - `keywords`: Enable both algorithms (default in `full` feature)
12
+ //!
13
+ //! # Examples
14
+ //!
15
+ //! ```rust,no_run
16
+ //! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
17
+ //! let text = "Rust is a systems programming language focused on safety and performance.";
18
+ //!
19
+ //! // Use default algorithm (YAKE if available)
20
+ //! let config = KeywordConfig::default();
21
+ //! let keywords = extract_keywords(text, &config).unwrap();
22
+ //!
23
+ //! for keyword in keywords {
24
+ //! println!("{}: {:.3}", keyword.text, keyword.score);
25
+ //! }
26
+ //! ```
27
+ //!
28
+ //! ```rust,no_run
29
+ //! # #[cfg(feature = "keywords-rake")]
30
+ //! # {
31
+ //! # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
32
+ //! // Use RAKE algorithm explicitly
33
+ //! let text = "Machine learning models require large datasets.";
34
+ //! let config = KeywordConfig::rake()
35
+ //! .with_max_keywords(5)
36
+ //! .with_min_score(0.3);
37
+ //!
38
+ //! let keywords = extract_keywords(text, &config).unwrap();
39
+ //! # }
40
+ //! ```
41
+
42
+ use crate::Result;
43
+ use crate::plugins::registry::get_post_processor_registry;
44
+ use once_cell::sync::Lazy;
45
+ use std::sync::Arc;
46
+
47
+ pub mod config;
48
+ pub mod processor;
49
+ pub mod types;
50
+
51
+ #[cfg(feature = "keywords-yake")]
52
+ mod yake;
53
+
54
+ #[cfg(feature = "keywords-rake")]
55
+ mod rake;
56
+
57
+ pub use config::KeywordConfig;
58
+ pub use processor::KeywordExtractor;
59
+
60
+ #[cfg(feature = "keywords-rake")]
61
+ pub use config::RakeParams;
62
+
63
+ #[cfg(feature = "keywords-yake")]
64
+ pub use config::YakeParams;
65
+ pub use types::{Keyword, KeywordAlgorithm};
66
+
67
+ /// Extract keywords from text using the specified algorithm.
68
+ ///
69
+ /// This is the unified entry point for keyword extraction. The algorithm
70
+ /// used is determined by `config.algorithm`.
71
+ ///
72
+ /// # Arguments
73
+ ///
74
+ /// * `text` - The text to extract keywords from
75
+ /// * `config` - Keyword extraction configuration
76
+ ///
77
+ /// # Returns
78
+ ///
79
+ /// A vector of keywords sorted by relevance (highest score first).
80
+ ///
81
+ /// # Errors
82
+ ///
83
+ /// Returns an error if:
84
+ /// - The specified algorithm feature is not enabled
85
+ /// - Keyword extraction fails
86
+ ///
87
+ /// # Examples
88
+ ///
89
+ /// ```rust,no_run
90
+ /// # use kreuzberg::keywords::{extract_keywords, KeywordConfig};
91
+ /// let text = "Document intelligence with Rust provides memory safety.";
92
+ /// let config = KeywordConfig::default()
93
+ /// .with_max_keywords(10)
94
+ /// .with_language("en");
95
+ ///
96
+ /// let keywords = extract_keywords(text, &config)?;
97
+ ///
98
+ /// for keyword in keywords {
99
+ /// println!("{}: {:.3}", keyword.text, keyword.score);
100
+ /// }
101
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
102
+ /// ```
103
+ pub fn extract_keywords(text: &str, config: &KeywordConfig) -> Result<Vec<Keyword>> {
104
+ match config.algorithm {
105
+ #[cfg(feature = "keywords-yake")]
106
+ KeywordAlgorithm::Yake => yake::extract_keywords_yake(text, config),
107
+
108
+ #[cfg(feature = "keywords-rake")]
109
+ KeywordAlgorithm::Rake => rake::extract_keywords_rake(text, config),
110
+
111
+ #[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
112
+ _ => Err(crate::KreuzbergError::Other(
113
+ "No keyword extraction algorithm feature enabled".to_string(),
114
+ )),
115
+ }
116
+ }
117
+
118
+ /// Lazy-initialized flag that ensures keyword processor is registered exactly once.
119
+ ///
120
+ /// This static is accessed on first use to automatically register the
121
+ /// keyword extraction processor with the plugin registry.
122
+ static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_keyword_processor);
123
+
124
+ /// Ensure the keyword processor is registered.
125
+ ///
126
+ /// This function is called automatically when needed.
127
+ /// It's safe to call multiple times - registration only happens once.
128
+ pub fn ensure_initialized() -> Result<()> {
129
+ PROCESSOR_INITIALIZED
130
+ .as_ref()
131
+ .map(|_| ())
132
+ .map_err(|e| crate::KreuzbergError::Plugin {
133
+ message: format!("Failed to register keyword processor: {}", e),
134
+ plugin_name: "keyword-extraction".to_string(),
135
+ })
136
+ }
137
+
138
+ /// Register the keyword extraction processor with the global registry.
139
+ ///
140
+ /// This function should be called once at application startup to register
141
+ /// the keyword extraction post-processor.
142
+ ///
143
+ /// **Note:** This is called automatically on first use.
144
+ /// Explicit calling is optional.
145
+ ///
146
+ /// # Example
147
+ ///
148
+ /// ```rust
149
+ /// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
150
+ /// use kreuzberg::keywords::register_keyword_processor;
151
+ ///
152
+ /// # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
153
+ /// # fn main() -> kreuzberg::Result<()> {
154
+ /// register_keyword_processor()?;
155
+ /// # Ok(())
156
+ /// # }
157
+ /// # #[cfg(not(any(feature = "keywords-yake", feature = "keywords-rake")))]
158
+ /// # fn main() {}
159
+ /// ```
160
+ pub fn register_keyword_processor() -> Result<()> {
161
+ let registry = get_post_processor_registry();
162
+ let mut registry = registry
163
+ .write()
164
+ .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
165
+
166
+ registry.register(Arc::new(KeywordExtractor), 50)?;
167
+
168
+ Ok(())
169
+ }
170
+
171
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration yields a non-empty result no larger than `max_keywords`.
    #[test]
    fn test_extract_keywords_default_algorithm() {
        let config = KeywordConfig::default();
        let text = "Rust programming language provides memory safety and performance.";

        let keywords = extract_keywords(text, &config).unwrap();

        assert!(!keywords.is_empty(), "Should extract keywords");
        assert!(keywords.len() <= config.max_keywords);
    }

    /// YAKE extraction tags its results with the YAKE algorithm.
    #[cfg(feature = "keywords-yake")]
    #[test]
    fn test_extract_keywords_yake() {
        let config = KeywordConfig::yake();
        let text = "Natural language processing using Rust is efficient and safe.";

        let keywords = extract_keywords(text, &config).unwrap();

        assert!(!keywords.is_empty());
        assert_eq!(keywords[0].algorithm, KeywordAlgorithm::Yake);
    }

    /// RAKE extraction tags its results with the RAKE algorithm.
    #[cfg(feature = "keywords-rake")]
    #[test]
    fn test_extract_keywords_rake() {
        let config = KeywordConfig::rake();
        let text = "Natural language processing using Rust is efficient and safe.";

        let keywords = extract_keywords(text, &config).unwrap();

        assert!(!keywords.is_empty());
        assert_eq!(keywords[0].algorithm, KeywordAlgorithm::Rake);
    }

    /// Both algorithms produce results on the same text, each tagged with its own algorithm.
    #[cfg(all(feature = "keywords-yake", feature = "keywords-rake"))]
    #[test]
    fn test_compare_algorithms() {
        let text = "Machine learning and artificial intelligence are transforming technology. \
                    Deep learning models require substantial computational resources.";

        let yake_keywords = extract_keywords(text, &KeywordConfig::yake().with_max_keywords(5)).unwrap();
        let rake_keywords = extract_keywords(text, &KeywordConfig::rake().with_max_keywords(5)).unwrap();

        assert!(!yake_keywords.is_empty());
        assert!(!rake_keywords.is_empty());

        assert!(yake_keywords.iter().all(|k| k.algorithm == KeywordAlgorithm::Yake));
        assert!(rake_keywords.iter().all(|k| k.algorithm == KeywordAlgorithm::Rake));

        // Printed for manual comparison when running with `--nocapture`.
        let yake_texts: Vec<_> = yake_keywords.iter().map(|k| &k.text).collect();
        println!("YAKE keywords: {:?}", yake_texts);
        let rake_texts: Vec<_> = rake_keywords.iter().map(|k| &k.text).collect();
        println!("RAKE keywords: {:?}", rake_texts);
    }
}