kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -1,27 +1,29 @@
1
1
  use crate::error::{KreuzbergError, Result};
2
2
  use crate::stopwords::STOPWORDS;
3
3
  use crate::text::token_reduction::config::TokenReductionConfig;
4
- use crate::text::utf8_validation;
5
4
  use ahash::{AHashMap, AHashSet};
6
- use once_cell::sync::Lazy;
7
5
  use regex::Regex;
8
6
  use std::sync::Arc;
9
7
 
10
- static HTML_COMMENT_REGEX: Lazy<Regex> =
11
- Lazy::new(|| Regex::new(r"<!--.*?-->").expect("HTML comment regex pattern is valid and should compile"));
12
- static EXCESSIVE_NEWLINES_REGEX: Lazy<Regex> =
13
- Lazy::new(|| Regex::new(r"\n{3,}").expect("Excessive newlines regex pattern is valid and should compile"));
14
- static MULTIPLE_SPACES_REGEX: Lazy<Regex> =
15
- Lazy::new(|| Regex::new(r" {2,}").expect("Multiple spaces regex pattern is valid and should compile"));
16
- static MARKDOWN_CODE_BLOCK_REGEX: Lazy<Regex> =
17
- Lazy::new(|| Regex::new(r"```[\s\S]*?```").expect("Markdown code block regex pattern is valid and should compile"));
18
- static MARKDOWN_INLINE_CODE_REGEX: Lazy<Regex> =
19
- Lazy::new(|| Regex::new(r"`[^`\n]+`").expect("Markdown inline code regex pattern is valid and should compile"));
20
- static MARKDOWN_HEADERS_REGEX: Lazy<Regex> =
21
- Lazy::new(|| Regex::new(r"^#{1,6}\s+").expect("Markdown headers regex pattern is valid and should compile"));
22
- static MARKDOWN_LISTS_REGEX: Lazy<Regex> =
23
- Lazy::new(|| Regex::new(r"^[ \t]*[-*+]\s+").expect("Markdown lists regex pattern is valid and should compile"));
24
-
8
+ // Import filter modules
9
+ mod general;
10
+ mod html;
11
+ mod markdown;
12
+
13
+ // Re-export all filter functions for backward compatibility
14
+ pub use general::{normalize_newlines, normalize_spaces, remove_stopwords};
15
+ pub use html::remove_html_comments;
16
+ pub use markdown::{
17
+ extract_and_preserve_code, is_markdown_header, is_markdown_list, is_markdown_table, preserve_markdown_structure,
18
+ restore_preserved_blocks,
19
+ };
20
+
21
+ /// Main filter pipeline orchestrator that coordinates various text filtering operations.
22
+ ///
23
+ /// The `FilterPipeline` provides a high-level interface for applying different levels
24
+ /// of text filtering, from light cleaning (HTML comments, whitespace) to moderate
25
+ /// filtering (stopword removal) while respecting preservation rules for code,
26
+ /// markdown, and custom patterns.
25
27
  pub struct FilterPipeline {
26
28
  config: Arc<TokenReductionConfig>,
27
29
  stopwords: AHashSet<String>,
@@ -30,6 +32,17 @@ pub struct FilterPipeline {
30
32
  }
31
33
 
32
34
  impl FilterPipeline {
35
+ /// Creates a new `FilterPipeline` with the specified configuration and language.
36
+ ///
37
+ /// # Arguments
38
+ /// * `config` - Token reduction configuration
39
+ /// * `language` - Language code for stopword selection (e.g., "en", "es", "de")
40
+ ///
41
+ /// # Returns
42
+ /// A `Result` containing the new `FilterPipeline` or an error if regex patterns are invalid
43
+ ///
44
+ /// # Errors
45
+ /// Returns a `KreuzbergError::Validation` if any preserve patterns are invalid regex
33
46
  pub fn new(config: &Arc<TokenReductionConfig>, language: &str) -> Result<Self> {
34
47
  let mut stopwords = STOPWORDS.get(language).cloned().unwrap_or_else(|| {
35
48
  STOPWORDS
@@ -63,256 +76,145 @@ impl FilterPipeline {
63
76
  })
64
77
  }
65
78
 
79
+ /// Applies light filtering to text, removing HTML comments and normalizing whitespace.
80
+ ///
81
+ /// Light filters include:
82
+ /// - HTML comment removal
83
+ /// - Multiple space normalization
84
+ /// - Excessive newline reduction
85
+ /// - Markdown structure preservation (if enabled)
86
+ /// - Code preservation (if enabled)
87
+ ///
88
+ /// # Arguments
89
+ /// * `text` - The input text to filter
90
+ ///
91
+ /// # Returns
92
+ /// A new `String` with light filters applied
66
93
  pub fn apply_light_filters(&self, text: &str) -> String {
67
94
  use std::borrow::Cow;
68
95
 
69
96
  let mut result = Cow::Borrowed(text);
70
97
 
98
+ // Preserve markdown code blocks if configured
71
99
  let mut preserved_blocks: Option<AHashMap<String, String>> = None;
72
100
  if self.config.preserve_markdown {
73
101
  let mut blocks = AHashMap::new();
74
- result = Cow::Owned(self.extract_and_preserve_code(result.as_ref(), &mut blocks));
102
+ result = Cow::Owned(extract_and_preserve_code(result.as_ref(), &mut blocks));
75
103
  preserved_blocks = Some(blocks);
76
104
  }
77
105
 
78
- if HTML_COMMENT_REGEX.is_match(&result) {
79
- result = Cow::Owned(HTML_COMMENT_REGEX.replace_all(&result, "").into_owned());
80
- }
81
-
82
- if MULTIPLE_SPACES_REGEX.is_match(&result) {
83
- result = Cow::Owned(MULTIPLE_SPACES_REGEX.replace_all(&result, " ").into_owned());
84
- }
106
+ // Remove HTML comments
107
+ result = Cow::Owned(remove_html_comments(&result));
85
108
 
86
- if EXCESSIVE_NEWLINES_REGEX.is_match(&result) {
87
- result = Cow::Owned(EXCESSIVE_NEWLINES_REGEX.replace_all(&result, "\n\n").into_owned());
88
- }
109
+ // Normalize whitespace
110
+ result = Cow::Owned(normalize_spaces(&result));
111
+ result = Cow::Owned(normalize_newlines(&result));
89
112
 
113
+ // Preserve markdown structure if configured
90
114
  if self.config.preserve_markdown {
91
- result = Cow::Owned(self.preserve_markdown_structure(&result));
115
+ result = Cow::Owned(preserve_markdown_structure(&result));
92
116
  }
93
117
 
118
+ // Restore preserved code blocks
94
119
  if let Some(blocks) = &preserved_blocks {
95
- result = Cow::Owned(self.restore_preserved_blocks(&result, blocks));
120
+ result = Cow::Owned(restore_preserved_blocks(&result, blocks));
96
121
  }
97
122
 
98
123
  result.into_owned()
99
124
  }
100
125
 
126
+ /// Applies moderate filtering to text, including stopword removal.
127
+ ///
128
+ /// Moderate filters include all light filters plus:
129
+ /// - Stopword removal (with markdown awareness if enabled)
130
+ /// - Code preservation during stopword removal
131
+ ///
132
+ /// # Arguments
133
+ /// * `text` - The input text to filter
134
+ ///
135
+ /// # Returns
136
+ /// A new `String` with moderate filters applied
101
137
  pub fn apply_moderate_filters(&self, text: &str) -> String {
102
138
  let mut result = self.apply_light_filters(text);
103
139
 
140
+ // Preserve code blocks during stopword removal if configured
104
141
  let mut preserved_blocks: Option<AHashMap<String, String>> = None;
105
142
  if self.config.preserve_code {
106
143
  let mut blocks = AHashMap::new();
107
- result = self.extract_and_preserve_code(&result, &mut blocks);
144
+ result = extract_and_preserve_code(&result, &mut blocks);
108
145
  preserved_blocks = Some(blocks);
109
146
  }
110
147
 
148
+ // Remove stopwords with markdown awareness if configured
111
149
  if self.config.preserve_markdown {
112
150
  result = self.remove_stopwords_preserving_markdown(&result);
113
151
  } else {
114
- result = self.remove_stopwords(&result);
152
+ result = remove_stopwords(&result, &self.stopwords, &self.preserve_patterns);
115
153
  }
116
154
 
155
+ // Restore preserved code blocks
117
156
  if let Some(blocks) = &preserved_blocks {
118
- result = self.restore_preserved_blocks(&result, blocks);
157
+ result = restore_preserved_blocks(&result, blocks);
119
158
  }
120
159
 
121
160
  result
122
161
  }
123
162
 
163
+ /// Removes stopwords while preserving markdown structural elements.
164
+ ///
165
+ /// This function processes text line-by-line, preserving:
166
+ /// - Markdown headers
167
+ /// - List items
168
+ /// - Table rows
169
+ ///
170
+ /// # Arguments
171
+ /// * `text` - The input text to filter
172
+ ///
173
+ /// # Returns
174
+ /// A new `String` with stopwords removed but markdown structure preserved
124
175
  fn remove_stopwords_preserving_markdown(&self, text: &str) -> String {
125
176
  let lines: Vec<&str> = text.lines().collect();
126
177
  let mut processed_lines = Vec::with_capacity(lines.len());
127
178
 
128
179
  for line in lines {
129
- if MARKDOWN_HEADERS_REGEX.is_match(line) {
180
+ // Preserve markdown headers
181
+ if is_markdown_header(line) {
130
182
  processed_lines.push(line.to_string());
131
183
  continue;
132
184
  }
133
185
 
134
- if MARKDOWN_LISTS_REGEX.is_match(line) {
186
+ // Preserve markdown list items
187
+ if is_markdown_list(line) {
135
188
  processed_lines.push(line.to_string());
136
189
  continue;
137
190
  }
138
191
 
139
- if line.trim().starts_with('|') && line.trim().ends_with('|') {
192
+ // Preserve markdown table rows
193
+ if is_markdown_table(line) {
140
194
  processed_lines.push(line.to_string());
141
195
  continue;
142
196
  }
143
197
 
144
- let processed_line = self.remove_stopwords(line);
198
+ // Apply stopword removal to regular text lines
199
+ let processed_line = remove_stopwords(line, &self.stopwords, &self.preserve_patterns);
145
200
  processed_lines.push(processed_line);
146
201
  }
147
202
 
148
203
  processed_lines.join("\n")
149
204
  }
150
205
 
151
- fn remove_stopwords(&self, text: &str) -> String {
152
- let words: Vec<&str> = text.split_whitespace().collect();
153
- let mut filtered_words = Vec::with_capacity((words.len() as f32 * 0.7).ceil() as usize);
154
-
155
- for word in words {
156
- if word.is_empty() {
157
- continue;
158
- }
159
-
160
- if self.should_preserve_word(word) {
161
- filtered_words.push(word);
162
- continue;
163
- }
164
-
165
- if word.len() > 1 && word.bytes().all(|b| b.is_ascii_uppercase() || !b.is_ascii_alphabetic()) {
166
- filtered_words.push(word);
167
- continue;
168
- }
169
-
170
- if word.bytes().any(|b| b.is_ascii_digit()) {
171
- filtered_words.push(word);
172
- continue;
173
- }
174
-
175
- let clean_word = if word.is_ascii() {
176
- let clean_bytes: Vec<u8> = word
177
- .bytes()
178
- .filter(|&b| b.is_ascii_alphabetic())
179
- .map(|b| b.to_ascii_lowercase())
180
- .collect();
181
- utf8_validation::string_from_utf8(clean_bytes).unwrap_or_else(|_| {
182
- word.chars()
183
- .filter(|c| c.is_alphabetic())
184
- .collect::<String>()
185
- .to_lowercase()
186
- })
187
- } else {
188
- word.chars()
189
- .filter(|c| c.is_alphabetic())
190
- .collect::<String>()
191
- .to_lowercase()
192
- };
193
-
194
- if clean_word.is_empty() {
195
- filtered_words.push(word);
196
- continue;
197
- }
198
-
199
- if clean_word.len() <= 1 {
200
- filtered_words.push(word);
201
- continue;
202
- }
203
-
204
- if !self.stopwords.contains(&clean_word) {
205
- filtered_words.push(word);
206
- }
207
- }
208
-
209
- filtered_words.join(" ")
210
- }
211
-
212
- /// Get the language code for this filter pipeline.
206
+ /// Gets the language code for this filter pipeline.
213
207
  ///
214
208
  /// Primarily useful for testing and debugging to verify language configuration.
215
209
  #[cfg_attr(not(test), allow(dead_code))]
216
210
  pub fn language(&self) -> &str {
217
211
  &self.language
218
212
  }
219
-
220
- /// Check if a word should be preserved based on configured patterns.
221
- fn should_preserve_word(&self, word: &str) -> bool {
222
- self.preserve_patterns.iter().any(|pattern| pattern.is_match(word))
223
- }
224
-
225
- /// Split a word into prefix (non-alphanumeric), core (alphanumeric), and suffix (non-alphanumeric).
226
- ///
227
- /// This is useful for handling punctuation-wrapped words like "(hello)" or "world!".
228
- /// Currently used in tests; reserved for future word boundary-aware filtering.
229
- #[cfg_attr(not(test), allow(dead_code))]
230
- fn split_word_boundaries(&self, word: &str) -> (String, String, String) {
231
- let chars: Vec<char> = word.chars().collect();
232
- let mut start = 0;
233
- let mut end = chars.len();
234
-
235
- while start < chars.len() && !chars[start].is_alphanumeric() {
236
- start += 1;
237
- }
238
-
239
- while end > start && !chars[end - 1].is_alphanumeric() {
240
- end -= 1;
241
- }
242
-
243
- let prefix: String = chars[..start].iter().collect();
244
- let core: String = chars[start..end].iter().collect();
245
- let suffix: String = chars[end..].iter().collect();
246
-
247
- (prefix, core, suffix)
248
- }
249
-
250
- fn preserve_markdown_structure(&self, text: &str) -> String {
251
- let lines: Vec<&str> = text.lines().collect();
252
- let mut processed_lines = Vec::with_capacity(lines.len());
253
-
254
- for line in lines {
255
- if MARKDOWN_HEADERS_REGEX.is_match(line) {
256
- processed_lines.push(line);
257
- continue;
258
- }
259
-
260
- if MARKDOWN_LISTS_REGEX.is_match(line) {
261
- processed_lines.push(line);
262
- continue;
263
- }
264
-
265
- processed_lines.push(line);
266
- }
267
-
268
- processed_lines.join("\n")
269
- }
270
-
271
- fn extract_and_preserve_code(&self, text: &str, preserved: &mut AHashMap<String, String>) -> String {
272
- let mut result = text.to_string();
273
- let mut code_block_id = 0;
274
- let mut inline_code_id = 0;
275
-
276
- result = MARKDOWN_CODE_BLOCK_REGEX
277
- .replace_all(&result, |caps: &regex::Captures| {
278
- let code_block = caps[0].to_string();
279
- let placeholder = format!("__CODEBLOCK_{}__", code_block_id);
280
- code_block_id += 1;
281
- preserved.insert(placeholder.clone(), code_block);
282
- placeholder
283
- })
284
- .to_string();
285
-
286
- result = MARKDOWN_INLINE_CODE_REGEX
287
- .replace_all(&result, |caps: &regex::Captures| {
288
- let inline_code = caps[0].to_string();
289
- let placeholder = format!("__INLINECODE_{}__", inline_code_id);
290
- inline_code_id += 1;
291
- preserved.insert(placeholder.clone(), inline_code);
292
- placeholder
293
- })
294
- .to_string();
295
-
296
- result
297
- }
298
-
299
- fn restore_preserved_blocks(&self, text: &str, preserved: &AHashMap<String, String>) -> String {
300
- if preserved.is_empty() {
301
- return text.to_string();
302
- }
303
-
304
- let mut result = text.to_string();
305
-
306
- for (placeholder, original_content) in preserved {
307
- result = result.replace(placeholder, original_content);
308
- }
309
-
310
- result
311
- }
312
213
  }
313
214
 
314
215
  #[cfg(all(test, feature = "stopwords"))]
315
216
  mod tests {
217
+ use super::general::split_word_boundaries;
316
218
  use super::*;
317
219
 
318
220
  #[test]
@@ -321,7 +223,7 @@ mod tests {
321
223
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
322
224
 
323
225
  let input = "The quick brown fox is jumping over the lazy dog";
324
- let result = pipeline.remove_stopwords(input);
226
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
325
227
 
326
228
  assert!(!result.contains(" the "));
327
229
  assert!(!result.contains(" is "));
@@ -341,7 +243,7 @@ mod tests {
341
243
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
342
244
 
343
245
  let input = "The NASA mission is a success";
344
- let result = pipeline.remove_stopwords(input);
246
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
345
247
 
346
248
  assert!(result.contains("NASA"));
347
249
  assert!(result.contains("mission"));
@@ -411,7 +313,7 @@ mod tests {
411
313
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
412
314
 
413
315
  let input = "The API is working WITH the SDK";
414
- let result = pipeline.remove_stopwords(input);
316
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
415
317
 
416
318
  assert!(result.contains("API"));
417
319
  assert!(result.contains("SDK"));
@@ -426,7 +328,7 @@ mod tests {
426
328
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
427
329
 
428
330
  let input = "The version is 3.14 and the count is 42";
429
- let result = pipeline.remove_stopwords(input);
331
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
430
332
 
431
333
  assert!(result.contains("3.14"));
432
334
  assert!(result.contains("42"));
@@ -441,7 +343,7 @@ mod tests {
441
343
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
442
344
 
443
345
  let input = "Hello, the world! This is great.";
444
- let result = pipeline.remove_stopwords(input);
346
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
445
347
 
446
348
  assert!(result.contains("Hello,"));
447
349
  assert!(result.contains("world!"));
@@ -465,7 +367,7 @@ mod tests {
465
367
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
466
368
 
467
369
  let input = "This is a custom word test";
468
- let result = pipeline.remove_stopwords(input);
370
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
469
371
 
470
372
  assert!(!result.contains("custom"));
471
373
  assert!(!result.contains("word"));
@@ -478,7 +380,7 @@ mod tests {
478
380
  let pipeline = FilterPipeline::new(&config, "es").unwrap();
479
381
 
480
382
  let input = "El perro grande bonito tiene";
481
- let result = pipeline.remove_stopwords(input);
383
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
482
384
 
483
385
  assert!(result.contains("perro"));
484
386
  assert!(result.contains("grande"));
@@ -495,7 +397,7 @@ mod tests {
495
397
  let pipeline = FilterPipeline::new(&config, "unknown").unwrap();
496
398
 
497
399
  let input = "The quick test with unknown language";
498
- let result = pipeline.remove_stopwords(input);
400
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
499
401
 
500
402
  assert!(!result.contains("The "));
501
403
  assert!(result.contains("quick"));
@@ -561,11 +463,11 @@ mod tests {
561
463
  preserve_code: true,
562
464
  ..Default::default()
563
465
  });
564
- let pipeline = FilterPipeline::new(&config, "en").unwrap();
466
+ let _pipeline = FilterPipeline::new(&config, "en").unwrap();
565
467
 
566
468
  let mut preserved = AHashMap::new();
567
469
  let input = "Text before\n```rust\nfn main() {}\n```\nText after";
568
- let result = pipeline.extract_and_preserve_code(input, &mut preserved);
470
+ let result = extract_and_preserve_code(input, &mut preserved);
569
471
 
570
472
  assert_eq!(preserved.len(), 1);
571
473
  assert!(preserved.values().any(|v| v.contains("fn main()")));
@@ -578,11 +480,11 @@ mod tests {
578
480
  preserve_code: true,
579
481
  ..Default::default()
580
482
  });
581
- let pipeline = FilterPipeline::new(&config, "en").unwrap();
483
+ let _pipeline = FilterPipeline::new(&config, "en").unwrap();
582
484
 
583
485
  let mut preserved = AHashMap::new();
584
486
  let input = "Use the `println!` macro";
585
- let result = pipeline.extract_and_preserve_code(input, &mut preserved);
487
+ let result = extract_and_preserve_code(input, &mut preserved);
586
488
 
587
489
  assert_eq!(preserved.len(), 1);
588
490
  assert!(preserved.values().any(|v| v == "`println!`"));
@@ -592,13 +494,13 @@ mod tests {
592
494
  #[test]
593
495
  fn test_restore_preserved_blocks() {
594
496
  let config = Arc::new(TokenReductionConfig::default());
595
- let pipeline = FilterPipeline::new(&config, "en").unwrap();
497
+ let _pipeline = FilterPipeline::new(&config, "en").unwrap();
596
498
 
597
499
  let mut preserved = AHashMap::new();
598
500
  preserved.insert("__CODEBLOCK_0__".to_string(), "```code```".to_string());
599
501
  preserved.insert("__INLINECODE_0__".to_string(), "`inline`".to_string());
600
502
  let input = "Text __CODEBLOCK_0__ and __INLINECODE_0__ here";
601
- let result = pipeline.restore_preserved_blocks(input, &preserved);
503
+ let result = restore_preserved_blocks(input, &preserved);
602
504
 
603
505
  assert!(result.contains("```code```"));
604
506
  assert!(result.contains("`inline`"));
@@ -654,7 +556,7 @@ mod tests {
654
556
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
655
557
 
656
558
  let input = "I a x test";
657
- let result = pipeline.remove_stopwords(input);
559
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
658
560
 
659
561
  assert!(result.contains("I"));
660
562
  assert!(result.contains("x"));
@@ -667,7 +569,7 @@ mod tests {
667
569
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
668
570
 
669
571
  let input = "The Test Is Working";
670
- let result = pipeline.remove_stopwords(input);
572
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
671
573
 
672
574
  assert!(!result.contains("The"));
673
575
  assert!(!result.contains("Is"));
@@ -675,29 +577,18 @@ mod tests {
675
577
  assert!(result.contains("Working"));
676
578
  }
677
579
 
678
- #[test]
679
- fn test_lazy_regex_initialization() {
680
- let _ = &*HTML_COMMENT_REGEX;
681
- let _ = &*EXCESSIVE_NEWLINES_REGEX;
682
- let _ = &*MULTIPLE_SPACES_REGEX;
683
- let _ = &*MARKDOWN_CODE_BLOCK_REGEX;
684
- let _ = &*MARKDOWN_INLINE_CODE_REGEX;
685
- let _ = &*MARKDOWN_HEADERS_REGEX;
686
- let _ = &*MARKDOWN_LISTS_REGEX;
687
- }
688
-
689
580
  #[test]
690
581
  fn test_multiple_code_blocks_hashmap_approach() {
691
582
  let config = Arc::new(TokenReductionConfig {
692
583
  preserve_code: true,
693
584
  ..Default::default()
694
585
  });
695
- let pipeline = FilterPipeline::new(&config, "en").unwrap();
586
+ let _pipeline = FilterPipeline::new(&config, "en").unwrap();
696
587
 
697
588
  let input =
698
589
  "Start ```rust\nlet x = 1;\n``` middle `inline1` text ```python\nprint('hi')\n``` and `inline2` end";
699
590
  let mut preserved = AHashMap::new();
700
- let result = pipeline.extract_and_preserve_code(input, &mut preserved);
591
+ let result = extract_and_preserve_code(input, &mut preserved);
701
592
 
702
593
  assert_eq!(preserved.len(), 4);
703
594
  assert!(preserved.contains_key("__CODEBLOCK_0__"));
@@ -710,7 +601,7 @@ mod tests {
710
601
  assert_eq!(preserved.get("__INLINECODE_0__").unwrap(), "`inline1`");
711
602
  assert_eq!(preserved.get("__INLINECODE_1__").unwrap(), "`inline2`");
712
603
 
713
- let restored = pipeline.restore_preserved_blocks(&result, &preserved);
604
+ let restored = restore_preserved_blocks(&result, &preserved);
714
605
  assert!(restored.contains("```rust\nlet x = 1;\n```"));
715
606
  assert!(restored.contains("```python\nprint('hi')\n```"));
716
607
  assert!(restored.contains("`inline1`"));
@@ -725,14 +616,14 @@ mod tests {
725
616
  preserve_code: true,
726
617
  ..Default::default()
727
618
  });
728
- let pipeline = FilterPipeline::new(&config, "en").unwrap();
619
+ let _pipeline = FilterPipeline::new(&config, "en").unwrap();
729
620
 
730
621
  let input = "Text `a` and `b` and `c` here";
731
622
  let mut preserved = AHashMap::new();
732
- let result = pipeline.extract_and_preserve_code(input, &mut preserved);
623
+ let result = extract_and_preserve_code(input, &mut preserved);
733
624
 
734
625
  assert_eq!(preserved.len(), 3);
735
- let restored = pipeline.restore_preserved_blocks(&result, &preserved);
626
+ let restored = restore_preserved_blocks(&result, &preserved);
736
627
 
737
628
  assert!(restored.contains("`a`"));
738
629
  assert!(restored.contains("`b`"));
@@ -755,7 +646,7 @@ mod tests {
755
646
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
756
647
 
757
648
  let input = "The NASA and HTTP protocols version 1.2.3 by @john";
758
- let result = pipeline.remove_stopwords(input);
649
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
759
650
 
760
651
  assert!(result.contains("NASA"));
761
652
  assert!(result.contains("HTTP"));
@@ -774,7 +665,7 @@ mod tests {
774
665
  assert_eq!(pipeline_en.language(), "en");
775
666
 
776
667
  let input_en = "the quick brown fox";
777
- let result_en = pipeline_en.remove_stopwords(input_en);
668
+ let result_en = remove_stopwords(input_en, &pipeline_en.stopwords, &pipeline_en.preserve_patterns);
778
669
  assert!(!result_en.contains(" the "));
779
670
 
780
671
  let config_de = Arc::new(TokenReductionConfig::default());
@@ -782,7 +673,7 @@ mod tests {
782
673
  assert_eq!(pipeline_de.language(), "de");
783
674
 
784
675
  let input_de = "der schnelle braune fuchs";
785
- let result_de = pipeline_de.remove_stopwords(input_de);
676
+ let result_de = remove_stopwords(input_de, &pipeline_de.stopwords, &pipeline_de.preserve_patterns);
786
677
  assert!(!result_de.contains(" der "));
787
678
  assert!(result_de.contains("schnelle"));
788
679
  }
@@ -795,7 +686,7 @@ mod tests {
795
686
  assert_eq!(pipeline.language(), "unsupported_lang");
796
687
 
797
688
  let input = "the quick brown fox";
798
- let result = pipeline.remove_stopwords(input);
689
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
799
690
 
800
691
  assert!(!result.contains(" the "));
801
692
  assert!(result.contains("quick"));
@@ -803,30 +694,27 @@ mod tests {
803
694
 
804
695
  #[test]
805
696
  fn test_split_word_boundaries() {
806
- let config = Arc::new(TokenReductionConfig::default());
807
- let pipeline = FilterPipeline::new(&config, "en").unwrap();
808
-
809
- let (prefix, core, suffix) = pipeline.split_word_boundaries("(hello)");
697
+ let (prefix, core, suffix) = split_word_boundaries("(hello)");
810
698
  assert_eq!(prefix, "(");
811
699
  assert_eq!(core, "hello");
812
700
  assert_eq!(suffix, ")");
813
701
 
814
- let (prefix2, core2, suffix2) = pipeline.split_word_boundaries("world!");
702
+ let (prefix2, core2, suffix2) = split_word_boundaries("world!");
815
703
  assert_eq!(prefix2, "");
816
704
  assert_eq!(core2, "world");
817
705
  assert_eq!(suffix2, "!");
818
706
 
819
- let (prefix3, core3, suffix3) = pipeline.split_word_boundaries("'test");
707
+ let (prefix3, core3, suffix3) = split_word_boundaries("'test");
820
708
  assert_eq!(prefix3, "'");
821
709
  assert_eq!(core3, "test");
822
710
  assert_eq!(suffix3, "");
823
711
 
824
- let (prefix4, core4, suffix4) = pipeline.split_word_boundaries("simple");
712
+ let (prefix4, core4, suffix4) = split_word_boundaries("simple");
825
713
  assert_eq!(prefix4, "");
826
714
  assert_eq!(core4, "simple");
827
715
  assert_eq!(suffix4, "");
828
716
 
829
- let (prefix5, core5, suffix5) = pipeline.split_word_boundaries("\"example!!!\"");
717
+ let (prefix5, core5, suffix5) = split_word_boundaries("\"example!!!\"");
830
718
  assert_eq!(prefix5, "\"");
831
719
  assert_eq!(core5, "example");
832
720
  assert_eq!(suffix5, "!!!\"");
@@ -834,25 +722,22 @@ mod tests {
834
722
 
835
723
  #[test]
836
724
  fn test_split_word_boundaries_edge_cases() {
837
- let config = Arc::new(TokenReductionConfig::default());
838
- let pipeline = FilterPipeline::new(&config, "en").unwrap();
839
-
840
- let (prefix, core, suffix) = pipeline.split_word_boundaries("!!!");
725
+ let (prefix, core, suffix) = split_word_boundaries("!!!");
841
726
  assert_eq!(prefix, "!!!");
842
727
  assert_eq!(core, "");
843
728
  assert_eq!(suffix, "");
844
729
 
845
- let (prefix2, core2, suffix2) = pipeline.split_word_boundaries("");
730
+ let (prefix2, core2, suffix2) = split_word_boundaries("");
846
731
  assert_eq!(prefix2, "");
847
732
  assert_eq!(core2, "");
848
733
  assert_eq!(suffix2, "");
849
734
 
850
- let (prefix3, core3, suffix3) = pipeline.split_word_boundaries("a");
735
+ let (prefix3, core3, suffix3) = split_word_boundaries("a");
851
736
  assert_eq!(prefix3, "");
852
737
  assert_eq!(core3, "a");
853
738
  assert_eq!(suffix3, "");
854
739
 
855
- let (prefix4, core4, suffix4) = pipeline.split_word_boundaries("(café)");
740
+ let (prefix4, core4, suffix4) = split_word_boundaries("(café)");
856
741
  assert_eq!(prefix4, "(");
857
742
  assert_eq!(core4, "café");
858
743
  assert_eq!(suffix4, ")");
@@ -874,7 +759,7 @@ mod tests {
874
759
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
875
760
 
876
761
  let input = "this is a custom stopword test";
877
- let result = pipeline.remove_stopwords(input);
762
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
878
763
 
879
764
  assert!(!result.contains(" custom "));
880
765
  assert!(!result.contains(" stopword "));
@@ -894,7 +779,7 @@ mod tests {
894
779
  let pipeline = FilterPipeline::new(&config, "en").unwrap();
895
780
 
896
781
  let input = "The quick brown fox";
897
- let result = pipeline.remove_stopwords(input);
782
+ let result = remove_stopwords(input, &pipeline.stopwords, &pipeline.preserve_patterns);
898
783
 
899
784
  assert!(!result.contains(" The "));
900
785
  assert!(result.contains("quick"));