kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,36 @@
1
+ //! Germanic language stopwords.
2
+ //!
3
+ //! Includes: English (en), German (de), Dutch (nl), Swedish (sv),
4
+ //! Norwegian (no), Danish (da), Afrikaans (af).
5
+
6
+ use ahash::{AHashMap, AHashSet};
7
+
8
+ /// Macro to generate embedded stopwords for Germanic languages.
9
+ macro_rules! embed_stopwords {
10
+ ($map:expr, $($lang:literal),* $(,)?) => {
11
+ $(
12
+ {
13
+ const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
14
+ match serde_json::from_str::<Vec<String>>(JSON) {
15
+ Ok(words) => {
16
+ let set: AHashSet<String> = words.into_iter().collect();
17
+ $map.insert($lang.to_string(), set);
18
+ }
19
+ Err(e) => {
20
+ panic!(
21
+ "Failed to parse embedded stopwords for language '{}': {}. \
22
+ This indicates corrupted or malformed JSON in the embedded stopwords data. \
23
+ Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
24
+ $lang, e
25
+ );
26
+ }
27
+ }
28
+ }
29
+ )*
30
+ };
31
+ }
32
+
33
+ /// Load Germanic language stopwords into the provided map.
34
+ pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
35
+ embed_stopwords!(map, "en", "de", "nl", "sv", "no", "da", "af");
36
+ }
@@ -0,0 +1,10 @@
1
+ //! Language family modules for stopword loading.
2
+ //!
3
+ //! Stopwords are organized by language family for easier maintenance
4
+ //! and navigation. Each module handles loading stopwords for related languages.
5
+
6
+ pub(super) mod asian;
7
+ pub(super) mod germanic;
8
+ pub(super) mod other;
9
+ pub(super) mod romance;
10
+ pub(super) mod slavic;
@@ -0,0 +1,44 @@
1
+ //! Other language stopwords.
2
+ //!
3
+ //! Includes: Arabic (ar), Hebrew (he), Turkish (tr), Persian (fa),
4
+ //! Kurdish (ku), Armenian (hy), Estonian (et), Basque (eu),
5
+ //! Breton (br), Esperanto (eo), Finnish (fi), Irish (ga),
6
+ //! Hungarian (hu), Indonesian (id), Latin (la), Lithuanian (lt),
7
+ //! Latvian (lv), Malay (ms), Tagalog (tl), Greek (el),
8
+ //! Hausa (ha), Swahili (sw), Yoruba (yo), Zulu (zu),
9
+ //! Somali (so), Sesotho (st).
10
+
11
+ use ahash::{AHashMap, AHashSet};
12
+
13
+ /// Macro to generate embedded stopwords for other languages.
14
+ macro_rules! embed_stopwords {
15
+ ($map:expr, $($lang:literal),* $(,)?) => {
16
+ $(
17
+ {
18
+ const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
19
+ match serde_json::from_str::<Vec<String>>(JSON) {
20
+ Ok(words) => {
21
+ let set: AHashSet<String> = words.into_iter().collect();
22
+ $map.insert($lang.to_string(), set);
23
+ }
24
+ Err(e) => {
25
+ panic!(
26
+ "Failed to parse embedded stopwords for language '{}': {}. \
27
+ This indicates corrupted or malformed JSON in the embedded stopwords data. \
28
+ Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
29
+ $lang, e
30
+ );
31
+ }
32
+ }
33
+ }
34
+ )*
35
+ };
36
+ }
37
+
38
+ /// Load other language stopwords into the provided map.
39
+ pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
40
+ embed_stopwords!(
41
+ map, "ar", "he", "tr", "fa", "ku", "hy", "et", "eu", "br", "eo", "fi", "ga", "hu", "id", "la", "lt", "lv",
42
+ "ms", "tl", "el", "ha", "sw", "yo", "zu", "so", "st"
43
+ );
44
+ }
@@ -0,0 +1,36 @@
1
+ //! Romance language stopwords.
2
+ //!
3
+ //! Includes: French (fr), Spanish (es), Italian (it), Portuguese (pt),
4
+ //! Romanian (ro), Catalan (ca), Galician (gl).
5
+
6
+ use ahash::{AHashMap, AHashSet};
7
+
8
+ /// Macro to generate embedded stopwords for Romance languages.
9
+ macro_rules! embed_stopwords {
10
+ ($map:expr, $($lang:literal),* $(,)?) => {
11
+ $(
12
+ {
13
+ const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
14
+ match serde_json::from_str::<Vec<String>>(JSON) {
15
+ Ok(words) => {
16
+ let set: AHashSet<String> = words.into_iter().collect();
17
+ $map.insert($lang.to_string(), set);
18
+ }
19
+ Err(e) => {
20
+ panic!(
21
+ "Failed to parse embedded stopwords for language '{}': {}. \
22
+ This indicates corrupted or malformed JSON in the embedded stopwords data. \
23
+ Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
24
+ $lang, e
25
+ );
26
+ }
27
+ }
28
+ }
29
+ )*
30
+ };
31
+ }
32
+
33
+ /// Load Romance language stopwords into the provided map.
34
+ pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
35
+ embed_stopwords!(map, "fr", "es", "it", "pt", "ro", "ca", "gl");
36
+ }
@@ -0,0 +1,36 @@
1
+ //! Slavic language stopwords.
2
+ //!
3
+ //! Includes: Russian (ru), Polish (pl), Czech (cs), Ukrainian (uk),
4
+ //! Bulgarian (bg), Slovak (sk), Croatian (hr), Slovenian (sl).
5
+
6
+ use ahash::{AHashMap, AHashSet};
7
+
8
+ /// Macro to generate embedded stopwords for Slavic languages.
9
+ macro_rules! embed_stopwords {
10
+ ($map:expr, $($lang:literal),* $(,)?) => {
11
+ $(
12
+ {
13
+ const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
14
+ match serde_json::from_str::<Vec<String>>(JSON) {
15
+ Ok(words) => {
16
+ let set: AHashSet<String> = words.into_iter().collect();
17
+ $map.insert($lang.to_string(), set);
18
+ }
19
+ Err(e) => {
20
+ panic!(
21
+ "Failed to parse embedded stopwords for language '{}': {}. \
22
+ This indicates corrupted or malformed JSON in the embedded stopwords data. \
23
+ Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
24
+ $lang, e
25
+ );
26
+ }
27
+ }
28
+ }
29
+ )*
30
+ };
31
+ }
32
+
33
+ /// Load Slavic language stopwords into the provided map.
34
+ pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
35
+ embed_stopwords!(map, "ru", "pl", "cs", "uk", "bg", "sk", "hr", "sl");
36
+ }
@@ -82,33 +82,7 @@
82
82
  use ahash::{AHashMap, AHashSet};
83
83
  use once_cell::sync::Lazy;
84
84
 
85
- /// Macro to generate embedded stopwords for all languages.
86
- ///
87
- /// This macro embeds the JSON files at compile time using `include_str!()` and
88
- /// generates code to parse and insert them into the stopwords map.
89
- macro_rules! embed_stopwords {
90
- ($map:expr, $($lang:literal),* $(,)?) => {
91
- $(
92
- {
93
- const JSON: &str = include_str!(concat!("../../stopwords/", $lang, "_stopwords.json"));
94
- match serde_json::from_str::<Vec<String>>(JSON) {
95
- Ok(words) => {
96
- let set: AHashSet<String> = words.into_iter().collect();
97
- $map.insert($lang.to_string(), set);
98
- }
99
- Err(e) => {
100
- panic!(
101
- "Failed to parse embedded stopwords for language '{}': {}. \
102
- This indicates corrupted or malformed JSON in the embedded stopwords data. \
103
- Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
104
- $lang, e
105
- );
106
- }
107
- }
108
- }
109
- )*
110
- };
111
- }
85
+ mod languages;
112
86
 
113
87
  /// Global stopwords registry.
114
88
  ///
@@ -146,12 +120,12 @@ macro_rules! embed_stopwords {
146
120
  pub static STOPWORDS: Lazy<AHashMap<String, AHashSet<String>>> = Lazy::new(|| {
147
121
  let mut map = AHashMap::new();
148
122
 
149
- embed_stopwords!(
150
- map, "af", "ar", "bg", "bn", "br", "ca", "cs", "da", "de", "el", "en", "eo", "es", "et", "eu", "fa", "fi",
151
- "fr", "ga", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", "it", "ja", "kn", "ko", "ku", "la", "lt",
152
- "lv", "ml", "mr", "ms", "ne", "nl", "no", "pl", "pt", "ro", "ru", "si", "sk", "sl", "so", "st", "sv", "sw",
153
- "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "yo", "zh", "zu",
154
- );
123
+ // Load stopwords by language family
124
+ languages::germanic::load_stopwords(&mut map);
125
+ languages::romance::load_stopwords(&mut map);
126
+ languages::slavic::load_stopwords(&mut map);
127
+ languages::asian::load_stopwords(&mut map);
128
+ languages::other::load_stopwords(&mut map);
155
129
 
156
130
  apply_stopword_whitelist(&mut map);
157
131
 
@@ -677,7 +677,7 @@ mod tests {
677
677
  fn test_normalize_whitespace_cow_no_changes() {
678
678
  let text = Cow::Borrowed("normaltext");
679
679
  let result = normalize_whitespace_cow(text);
680
- assert_eq!(result.as_ref(), "normaltext");
680
+ assert_eq!(&*result, "normaltext");
681
681
  }
682
682
 
683
683
  #[test]
@@ -123,6 +123,8 @@ mod tests {
123
123
  chunks: None,
124
124
  images: None,
125
125
  pages: None,
126
+ elements: None,
127
+ djot_content: None,
126
128
  };
127
129
 
128
130
  processor.process(&mut result, &config).await.unwrap();
@@ -148,7 +150,9 @@ mod tests {
148
150
  detected_languages: None,
149
151
  chunks: None,
150
152
  images: None,
153
+ djot_content: None,
151
154
  pages: None,
155
+ elements: None,
152
156
  };
153
157
 
154
158
  processor.process(&mut result, &config).await.unwrap();
@@ -181,7 +185,9 @@ mod tests {
181
185
  detected_languages: None,
182
186
  chunks: None,
183
187
  images: None,
188
+ djot_content: None,
184
189
  pages: None,
190
+ elements: None,
185
191
  };
186
192
 
187
193
  let config_with_quality = ExtractionConfig {
@@ -209,7 +215,9 @@ mod tests {
209
215
  detected_languages: None,
210
216
  chunks: None,
211
217
  images: None,
218
+ djot_content: None,
212
219
  pages: None,
220
+ elements: None,
213
221
  };
214
222
 
215
223
  let long_result = ExtractionResult {
@@ -220,7 +228,9 @@ mod tests {
220
228
  detected_languages: None,
221
229
  chunks: None,
222
230
  images: None,
231
+ djot_content: None,
223
232
  pages: None,
233
+ elements: None,
224
234
  };
225
235
 
226
236
  let short_duration = processor.estimated_duration_ms(&short_result);
@@ -0,0 +1,238 @@
1
+ use ahash::AHashMap;
2
+
3
// --- Fixed bonuses -----------------------------------------------------------

/// Flat amount added to the score of the first or the last sentence of a document.
const SENTENCE_EDGE_POSITION_BONUS: f32 = 0.3;

/// Flat amount added when a sentence's word count lies within
/// `MIN_IDEAL_WORD_COUNT..=MAX_IDEAL_WORD_COUNT`.
const IDEAL_WORD_COUNT_BONUS: f32 = 0.2;

// --- Thresholds --------------------------------------------------------------

/// Inclusive lower bound of the ideal sentence length, in whitespace-separated words.
const MIN_IDEAL_WORD_COUNT: usize = 3;

/// Inclusive upper bound of the ideal sentence length, in whitespace-separated words.
const MAX_IDEAL_WORD_COUNT: usize = 25;

/// A word counts as "long" when its `str::len()` (UTF-8 byte length) is strictly
/// greater than this value.
const LONG_WORD_THRESHOLD: usize = 8;

// --- Feature weights ---------------------------------------------------------

/// Multiplier for the fraction of words that contain at least one numeric character.
const NUMERIC_CONTENT_WEIGHT: f32 = 0.3;

/// Multiplier for the fraction of words written entirely in uppercase (acronym-like).
const CAPS_ACRONYM_WEIGHT: f32 = 0.25;

/// Multiplier for the fraction of words that exceed `LONG_WORD_THRESHOLD`.
const LONG_WORD_WEIGHT: f32 = 0.2;

/// Multiplier for ASCII punctuation density (punctuation characters per sentence byte).
const PUNCTUATION_DENSITY_WEIGHT: f32 = 0.15;

/// Multiplier for the word diversity ratio (distinct normalized words / total words).
const DIVERSITY_RATIO_WEIGHT: f32 = 0.15;

/// Multiplier for the character entropy of the sentence.
const CHAR_ENTROPY_WEIGHT: f32 = 0.1;
35
+
36
+ /// Analyzes text characteristics and scores content importance.
37
+ pub struct TextAnalyzer;
38
+
39
+ impl TextAnalyzer {
40
+ /// Scores the importance of a sentence based on various characteristics.
41
+ pub fn score_sentence_importance(sentence: &str, position: usize, total_sentences: usize) -> f32 {
42
+ let mut score = 0.0;
43
+
44
+ if position == 0 || position == total_sentences - 1 {
45
+ score += SENTENCE_EDGE_POSITION_BONUS;
46
+ }
47
+
48
+ let words: Vec<&str> = sentence.split_whitespace().collect();
49
+ if words.is_empty() {
50
+ return score;
51
+ }
52
+
53
+ let word_count = words.len();
54
+ if (MIN_IDEAL_WORD_COUNT..=MAX_IDEAL_WORD_COUNT).contains(&word_count) {
55
+ score += IDEAL_WORD_COUNT_BONUS;
56
+ }
57
+
58
+ let mut numeric_count = 0;
59
+ let mut caps_count = 0;
60
+ let mut long_word_count = 0;
61
+ let mut punct_density = 0;
62
+
63
+ for word in &words {
64
+ if word.chars().any(|c| c.is_numeric()) {
65
+ numeric_count += 1;
66
+ }
67
+
68
+ if word.len() > 1 && word.chars().all(|c| c.is_uppercase()) {
69
+ caps_count += 1;
70
+ }
71
+
72
+ if word.len() > LONG_WORD_THRESHOLD {
73
+ long_word_count += 1;
74
+ }
75
+
76
+ punct_density += word.chars().filter(|c| c.is_ascii_punctuation()).count();
77
+ }
78
+
79
+ score += (numeric_count as f32 / words.len() as f32) * NUMERIC_CONTENT_WEIGHT;
80
+ score += (caps_count as f32 / words.len() as f32) * CAPS_ACRONYM_WEIGHT;
81
+ score += (long_word_count as f32 / words.len() as f32) * LONG_WORD_WEIGHT;
82
+ score += (punct_density as f32 / sentence.len() as f32) * PUNCTUATION_DENSITY_WEIGHT;
83
+
84
+ let estimated_unique = (words.len() as f32 * 0.6).ceil() as usize;
85
+ let mut unique_words: ahash::AHashSet<String> = ahash::AHashSet::with_capacity(estimated_unique.max(10));
86
+
87
+ for w in &words {
88
+ let clean = w
89
+ .chars()
90
+ .filter(|c| c.is_alphabetic())
91
+ .collect::<String>()
92
+ .to_lowercase();
93
+ unique_words.insert(clean);
94
+
95
+ if unique_words.len() >= estimated_unique {
96
+ break;
97
+ }
98
+ }
99
+
100
+ let final_unique_count = if unique_words.len() >= estimated_unique {
101
+ unique_words.len()
102
+ } else {
103
+ for w in &words {
104
+ let clean = w
105
+ .chars()
106
+ .filter(|c| c.is_alphabetic())
107
+ .collect::<String>()
108
+ .to_lowercase();
109
+ unique_words.insert(clean);
110
+ }
111
+ unique_words.len()
112
+ };
113
+
114
+ let diversity_ratio = final_unique_count as f32 / words.len() as f32;
115
+ score += diversity_ratio * DIVERSITY_RATIO_WEIGHT;
116
+
117
+ let char_entropy = Self::calculate_char_entropy(sentence);
118
+ score += char_entropy * CHAR_ENTROPY_WEIGHT;
119
+
120
+ score
121
+ }
122
+
123
+ /// Calculates character entropy (measure of text randomness/information content).
124
+ pub fn calculate_char_entropy(text: &str) -> f32 {
125
+ let chars: Vec<char> = text.chars().collect();
126
+ if chars.is_empty() {
127
+ return 0.0;
128
+ }
129
+
130
+ let estimated_unique = (chars.len() as f32 * 0.1).ceil() as usize;
131
+ let mut char_freq = AHashMap::with_capacity(estimated_unique.max(26));
132
+
133
+ for &ch in &chars {
134
+ let lowercase_ch = ch
135
+ .to_lowercase()
136
+ .next()
137
+ .expect("to_lowercase() must yield at least one character for valid Unicode");
138
+ *char_freq.entry(lowercase_ch).or_insert(0) += 1;
139
+ }
140
+
141
+ let total_chars = chars.len() as f32;
142
+ char_freq
143
+ .values()
144
+ .map(|&freq| {
145
+ let p = freq as f32 / total_chars;
146
+ if p > 0.0 { -p * p.log2() } else { 0.0 }
147
+ })
148
+ .sum::<f32>()
149
+ .min(5.0)
150
+ }
151
+
152
+ /// Checks if a word has important characteristics that should be preserved.
153
+ pub fn has_important_characteristics(word: &str) -> bool {
154
+ if word.len() > 1 && word.chars().all(|c| c.is_uppercase()) {
155
+ return true;
156
+ }
157
+
158
+ if word.chars().any(|c| c.is_numeric()) {
159
+ return true;
160
+ }
161
+
162
+ if word.len() > 10 {
163
+ return true;
164
+ }
165
+
166
+ let uppercase_count = word.chars().filter(|c| c.is_uppercase()).count();
167
+ if uppercase_count > 1 && uppercase_count < word.len() {
168
+ return true;
169
+ }
170
+
171
+ if Self::has_cjk_importance(word) {
172
+ return true;
173
+ }
174
+
175
+ false
176
+ }
177
+
178
+ /// Checks if a CJK word has important characteristics.
179
+ pub fn has_cjk_importance(word: &str) -> bool {
180
+ let chars: Vec<char> = word.chars().collect();
181
+
182
+ let has_cjk = chars.iter().any(|&c| c as u32 >= 0x4E00 && (c as u32) <= 0x9FFF);
183
+ if !has_cjk {
184
+ return false;
185
+ }
186
+
187
+ let important_radicals = [
188
+ '学', '智', '能', '技', '术', '法', '算', '理', '科', '研', '究', '发', '展', '系', '统', '模', '型', '方',
189
+ '式', '过', '程', '结', '构', '功', '效', '应', '分', '析', '计', '算', '数', '据', '信', '息', '处', '理',
190
+ '语', '言', '文', '生', '成', '产', '用', '作', '为', '成', '变', '化', '转', '换', '提', '高', '网', '络',
191
+ '神', '经', '机', '器', '人', '工', '智', '能', '自', '然', '复',
192
+ ];
193
+
194
+ for &char in &chars {
195
+ if important_radicals.contains(&char) {
196
+ return true;
197
+ }
198
+ }
199
+
200
+ if chars.len() == 2 && has_cjk {
201
+ let has_technical = chars.iter().any(|&c| {
202
+ let code = c as u32;
203
+ (0x4E00..=0x4FFF).contains(&code)
204
+ || (0x5000..=0x51FF).contains(&code)
205
+ || (0x6700..=0x68FF).contains(&code)
206
+ || (0x7500..=0x76FF).contains(&code)
207
+ });
208
+
209
+ if has_technical {
210
+ return true;
211
+ }
212
+ }
213
+
214
+ false
215
+ }
216
+ }
217
+
218
#[cfg(test)]
mod tests {
    use super::*;

    /// A single repeated character must score lower than a varied string.
    #[test]
    fn test_calculate_char_entropy() {
        let uniform = TextAnalyzer::calculate_char_entropy("aaaaaaa");
        let varied = TextAnalyzer::calculate_char_entropy("abcdefg123");

        assert!(uniform < 1.0);
        assert!(varied > uniform);
    }

    /// All-caps, digit-bearing, mixed-case and long words are all flagged as important.
    #[test]
    fn test_important_word_characteristics() {
        for word in ["IMPORTANT", "COVID-19", "PyTorch", "verylongword123"] {
            assert!(TextAnalyzer::has_important_characteristics(word));
        }
    }
}
@@ -0,0 +1,8 @@
1
+ mod analysis;
2
+ mod punctuation;
3
+ mod reducer;
4
+ mod sentence_selection;
5
+ mod word_filtering;
6
+
7
+ // Re-export the main public interface
8
+ pub use reducer::TokenReducer;
@@ -0,0 +1,54 @@
1
+ use once_cell::sync::Lazy;
2
+ use regex::Regex;
3
+ use std::borrow::Cow;
4
+
5
+ static REPEATED_EXCLAMATION: Lazy<Regex> =
6
+ Lazy::new(|| Regex::new(r"[!]{2,}").expect("Repeated exclamation regex pattern is valid and should compile"));
7
+ static REPEATED_QUESTION: Lazy<Regex> =
8
+ Lazy::new(|| Regex::new(r"[?]{2,}").expect("Repeated question regex pattern is valid and should compile"));
9
+ static REPEATED_COMMA: Lazy<Regex> =
10
+ Lazy::new(|| Regex::new(r"[,]{2,}").expect("Repeated comma regex pattern is valid and should compile"));
11
+
12
+ /// Handles punctuation cleaning and normalization.
13
+ pub struct PunctuationCleaner;
14
+
15
+ impl PunctuationCleaner {
16
+ /// Cleans excessive punctuation from text using optimized Cow pattern.
17
+ pub fn clean_punctuation_optimized(text: &str) -> String {
18
+ let mut result = Cow::Borrowed(text);
19
+
20
+ if REPEATED_EXCLAMATION.is_match(&result) {
21
+ result = Cow::Owned(REPEATED_EXCLAMATION.replace_all(&result, "!").into_owned());
22
+ }
23
+ if REPEATED_QUESTION.is_match(&result) {
24
+ result = Cow::Owned(REPEATED_QUESTION.replace_all(&result, "?").into_owned());
25
+ }
26
+ if REPEATED_COMMA.is_match(&result) {
27
+ result = Cow::Owned(REPEATED_COMMA.replace_all(&result, ",").into_owned());
28
+ }
29
+
30
+ result.into_owned()
31
+ }
32
+ }
33
+
34
#[cfg(test)]
mod tests {
    use super::*;

    /// Long runs of `!`, `?` and `,` must not survive cleaning.
    #[test]
    fn test_punctuation_normalization() {
        let cleaned =
            PunctuationCleaner::clean_punctuation_optimized("Text!!!!!! with????? excessive,,,,,, punctuation");

        for run in ["!!!!!!", "?????", ",,,,,,"] {
            assert!(!cleaned.contains(run));
        }
    }

    /// Text without repeated punctuation comes back unchanged.
    #[test]
    fn test_punctuation_no_change() {
        let untouched = "Text with normal punctuation!";
        assert_eq!(PunctuationCleaner::clean_punctuation_optimized(untouched), untouched);
    }
}