kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,377 @@
1
+ use crate::text::utf8_validation;
2
+ use ahash::AHashSet;
3
+ use once_cell::sync::Lazy;
4
+ use regex::Regex;
5
+
6
+ /// Regular expression for matching excessive newlines (3 or more consecutive newlines).
7
+ static EXCESSIVE_NEWLINES_REGEX: Lazy<Regex> =
8
+ Lazy::new(|| Regex::new(r"\n{3,}").expect("Excessive newlines regex pattern is valid and should compile"));
9
+
10
+ /// Regular expression for matching multiple consecutive spaces (2 or more).
11
+ static MULTIPLE_SPACES_REGEX: Lazy<Regex> =
12
+ Lazy::new(|| Regex::new(r" {2,}").expect("Multiple spaces regex pattern is valid and should compile"));
13
+
14
+ /// Normalizes whitespace in text by collapsing multiple spaces into a single space.
15
+ ///
16
+ /// # Arguments
17
+ /// * `text` - The input text with potentially multiple consecutive spaces
18
+ ///
19
+ /// # Returns
20
+ /// A new `String` with multiple spaces collapsed to single spaces
21
+ pub fn normalize_spaces(text: &str) -> String {
22
+ if MULTIPLE_SPACES_REGEX.is_match(text) {
23
+ MULTIPLE_SPACES_REGEX.replace_all(text, " ").into_owned()
24
+ } else {
25
+ text.to_string()
26
+ }
27
+ }
28
+
29
+ /// Reduces excessive newlines in text by collapsing 3+ consecutive newlines into 2.
30
+ ///
31
+ /// # Arguments
32
+ /// * `text` - The input text with potentially excessive newlines
33
+ ///
34
+ /// # Returns
35
+ /// A new `String` with excessive newlines normalized to at most 2 consecutive newlines
36
+ pub fn normalize_newlines(text: &str) -> String {
37
+ if EXCESSIVE_NEWLINES_REGEX.is_match(text) {
38
+ EXCESSIVE_NEWLINES_REGEX.replace_all(text, "\n\n").into_owned()
39
+ } else {
40
+ text.to_string()
41
+ }
42
+ }
43
+
44
+ /// Removes stopwords from text while preserving important patterns.
45
+ ///
46
+ /// This function intelligently filters out common stopwords while preserving:
47
+ /// - All-uppercase words (acronyms)
48
+ /// - Words containing digits
49
+ /// - Words matching custom preserve patterns
50
+ /// - Single-letter words
51
+ /// - Words with non-alphabetic characters
52
+ ///
53
+ /// # Arguments
54
+ /// * `text` - The input text to filter
55
+ /// * `stopwords` - Set of stopwords to remove (should be lowercase)
56
+ /// * `preserve_patterns` - Regex patterns for words that should never be removed
57
+ ///
58
+ /// # Returns
59
+ /// A new `String` with stopwords removed
60
+ pub fn remove_stopwords(text: &str, stopwords: &AHashSet<String>, preserve_patterns: &[Regex]) -> String {
61
+ let words: Vec<&str> = text.split_whitespace().collect();
62
+ let mut filtered_words = Vec::with_capacity((words.len() as f32 * 0.7).ceil() as usize);
63
+
64
+ for word in words {
65
+ if word.is_empty() {
66
+ continue;
67
+ }
68
+
69
+ // Check preserve patterns first
70
+ if should_preserve_word(word, preserve_patterns) {
71
+ filtered_words.push(word);
72
+ continue;
73
+ }
74
+
75
+ // Preserve all-uppercase words (acronyms like API, SDK, HTTP)
76
+ if word.len() > 1 && word.bytes().all(|b| b.is_ascii_uppercase() || !b.is_ascii_alphabetic()) {
77
+ filtered_words.push(word);
78
+ continue;
79
+ }
80
+
81
+ // Preserve words containing digits (version numbers, counts, etc.)
82
+ if word.bytes().any(|b| b.is_ascii_digit()) {
83
+ filtered_words.push(word);
84
+ continue;
85
+ }
86
+
87
+ // Extract the alphabetic core of the word for stopword matching
88
+ let clean_word = if word.is_ascii() {
89
+ let clean_bytes: Vec<u8> = word
90
+ .bytes()
91
+ .filter(|&b| b.is_ascii_alphabetic())
92
+ .map(|b| b.to_ascii_lowercase())
93
+ .collect();
94
+ utf8_validation::string_from_utf8(clean_bytes).unwrap_or_else(|_| {
95
+ word.chars()
96
+ .filter(|c| c.is_alphabetic())
97
+ .collect::<String>()
98
+ .to_lowercase()
99
+ })
100
+ } else {
101
+ word.chars()
102
+ .filter(|c| c.is_alphabetic())
103
+ .collect::<String>()
104
+ .to_lowercase()
105
+ };
106
+
107
+ // If the clean word is empty (word was all punctuation), preserve it
108
+ if clean_word.is_empty() {
109
+ filtered_words.push(word);
110
+ continue;
111
+ }
112
+
113
+ // Preserve single-letter words
114
+ if clean_word.len() <= 1 {
115
+ filtered_words.push(word);
116
+ continue;
117
+ }
118
+
119
+ // Check if the clean word is a stopword
120
+ if !stopwords.contains(&clean_word) {
121
+ filtered_words.push(word);
122
+ }
123
+ }
124
+
125
+ filtered_words.join(" ")
126
+ }
127
+
128
+ /// Checks if a word should be preserved based on configured patterns.
129
+ ///
130
+ /// # Arguments
131
+ /// * `word` - The word to check
132
+ /// * `preserve_patterns` - Regex patterns for words that should be preserved
133
+ ///
134
+ /// # Returns
135
+ /// `true` if the word matches any preserve pattern, `false` otherwise
136
+ #[inline]
137
+ pub fn should_preserve_word(word: &str, preserve_patterns: &[Regex]) -> bool {
138
+ preserve_patterns.iter().any(|pattern| pattern.is_match(word))
139
+ }
140
+
141
/// Splits a word into prefix (non-alphanumeric), core, and suffix (non-alphanumeric).
///
/// The prefix is the leading run of non-alphanumeric characters and the suffix is the
/// trailing run; the core is everything in between (it starts and ends with an
/// alphanumeric character but may contain other characters inside). A word with no
/// alphanumeric characters is returned entirely as the prefix.
///
/// This is useful for handling punctuation-wrapped words like "(hello)" or "world!".
/// Currently used in tests; reserved for future word boundary-aware filtering.
///
/// # Arguments
/// * `word` - The word to split
///
/// # Returns
/// A tuple of (prefix, core, suffix) strings
#[cfg(test)]
pub fn split_word_boundaries(word: &str) -> (String, String, String) {
    let not_alphanumeric = |c: char| !c.is_alphanumeric();

    // Strip the leading punctuation; whatever was removed is the prefix.
    let after_prefix = word.trim_start_matches(not_alphanumeric);
    let prefix = &word[..word.len() - after_prefix.len()];

    // Strip the trailing punctuation from the remainder; what is left is the core.
    let core = after_prefix.trim_end_matches(not_alphanumeric);
    let suffix = &after_prefix[core.len()..];

    (prefix.to_owned(), core.to_owned(), suffix.to_owned())
}
173
+
174
#[cfg(all(test, feature = "stopwords"))]
mod tests {
    use super::*;

    /// Builds the small lowercase stopword set shared by the tests below.
    fn create_test_stopwords() -> AHashSet<String> {
        let mut set = AHashSet::new();
        set.insert("the".to_string());
        set.insert("is".to_string());
        set.insert("a".to_string());
        set.insert("and".to_string());
        set.insert("with".to_string());
        set.insert("by".to_string());
        set
    }

    #[test]
    fn test_normalize_spaces() {
        // The input must contain real runs of spaces, and the negative assertion must
        // look for a double space; a single-space needle would match any sentence.
        let input = "Text  with   multiple    spaces";
        let result = normalize_spaces(input);
        assert!(!result.contains("  "));
        assert!(result.contains("Text with multiple spaces"));
    }

    #[test]
    fn test_normalize_spaces_no_change() {
        let input = "Text with single spaces";
        let result = normalize_spaces(input);
        assert_eq!(result, input);
    }

    #[test]
    fn test_normalize_newlines() {
        let input = "Paragraph 1\n\n\n\n\nParagraph 2";
        let result = normalize_newlines(input);
        assert!(!result.contains("\n\n\n"));
        assert!(result.contains("Paragraph 1"));
        assert!(result.contains("Paragraph 2"));
    }

    #[test]
    fn test_normalize_newlines_no_change() {
        let input = "Paragraph 1\n\nParagraph 2";
        let result = normalize_newlines(input);
        assert_eq!(result, input);
    }

    #[test]
    fn test_remove_stopwords() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        let input = "The quick brown fox is jumping over the lazy dog";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(!result.contains(" the "));
        assert!(!result.contains(" is "));
        assert!(result.contains("quick"));
        assert!(result.contains("brown"));
        assert!(result.contains("fox"));
    }

    #[test]
    fn test_remove_stopwords_preserves_uppercase() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        // "WITH" is a stopword when lowercased but must survive as an all-caps word.
        let input = "The API is working WITH the SDK";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("API"));
        assert!(result.contains("SDK"));
        assert!(result.contains("WITH"));
        assert!(!result.contains("The "));
        assert!(!result.contains(" is "));
    }

    #[test]
    fn test_remove_stopwords_preserves_numbers() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        let input = "The version is 3.14 and the count is 42";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("3.14"));
        assert!(result.contains("42"));
        assert!(result.contains("version"));
        assert!(result.contains("count"));
    }

    #[cfg_attr(coverage, ignore = "coverage instrumentation disables SIMD stopword paths")]
    #[test]
    fn test_remove_stopwords_handles_punctuation() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        let input = "Hello, the world! This is great.";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("Hello,"));
        assert!(result.contains("world!"));
        assert!(result.contains("great."));
    }

    #[test]
    fn test_remove_stopwords_single_letter() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        let input = "I a x test";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("I"));
        assert!(result.contains("x"));
    }

    #[test]
    fn test_preserve_patterns() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![
            Regex::new(r"\b[A-Z]{2,}\b").unwrap(),
            Regex::new(r"\b\d+\.\d+\.\d+\b").unwrap(),
            Regex::new(r"@\w+").unwrap(),
        ];

        let input = "The NASA and HTTP protocols version 1.2.3 by @john";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("NASA"));
        assert!(result.contains("HTTP"));
        assert!(result.contains("1.2.3"));
        assert!(result.contains("@john"));

        assert!(!result.contains(" the "));
        assert!(!result.contains(" and "));
        assert!(!result.contains(" by "));
    }

    #[test]
    fn test_should_preserve_word() {
        let patterns = vec![Regex::new(r"\b[A-Z]{2,}\b").unwrap()];

        assert!(should_preserve_word("NASA", &patterns));
        assert!(should_preserve_word("HTTP", &patterns));
        assert!(!should_preserve_word("hello", &patterns));
    }

    #[test]
    fn test_split_word_boundaries() {
        let (prefix, core, suffix) = split_word_boundaries("(hello)");
        assert_eq!(prefix, "(");
        assert_eq!(core, "hello");
        assert_eq!(suffix, ")");

        let (prefix2, core2, suffix2) = split_word_boundaries("world!");
        assert_eq!(prefix2, "");
        assert_eq!(core2, "world");
        assert_eq!(suffix2, "!");

        let (prefix3, core3, suffix3) = split_word_boundaries("'test");
        assert_eq!(prefix3, "'");
        assert_eq!(core3, "test");
        assert_eq!(suffix3, "");

        let (prefix4, core4, suffix4) = split_word_boundaries("simple");
        assert_eq!(prefix4, "");
        assert_eq!(core4, "simple");
        assert_eq!(suffix4, "");

        let (prefix5, core5, suffix5) = split_word_boundaries("\"example!!!\"");
        assert_eq!(prefix5, "\"");
        assert_eq!(core5, "example");
        assert_eq!(suffix5, "!!!\"");
    }

    #[test]
    fn test_split_word_boundaries_edge_cases() {
        // A word with no alphanumeric characters ends up entirely in the prefix.
        let (prefix, core, suffix) = split_word_boundaries("!!!");
        assert_eq!(prefix, "!!!");
        assert_eq!(core, "");
        assert_eq!(suffix, "");

        let (prefix2, core2, suffix2) = split_word_boundaries("");
        assert_eq!(prefix2, "");
        assert_eq!(core2, "");
        assert_eq!(suffix2, "");

        let (prefix3, core3, suffix3) = split_word_boundaries("a");
        assert_eq!(prefix3, "");
        assert_eq!(core3, "a");
        assert_eq!(suffix3, "");

        let (prefix4, core4, suffix4) = split_word_boundaries("(café)");
        assert_eq!(prefix4, "(");
        assert_eq!(core4, "café");
        assert_eq!(suffix4, ")");
    }

    #[test]
    fn test_lazy_regex_initialization() {
        // Dereferencing forces each `Lazy` to compile its pattern; a bad pattern panics here.
        let _ = &*EXCESSIVE_NEWLINES_REGEX;
        let _ = &*MULTIPLE_SPACES_REGEX;
    }
}
@@ -0,0 +1,51 @@
1
+ use once_cell::sync::Lazy;
2
+ use regex::Regex;
3
+
4
+ /// Regular expression for matching HTML comments.
5
+ /// Matches the pattern `<!-- ... -->` for removing HTML comments from text.
6
+ static HTML_COMMENT_REGEX: Lazy<Regex> =
7
+ Lazy::new(|| Regex::new(r"<!--.*?-->").expect("HTML comment regex pattern is valid and should compile"));
8
+
9
+ /// Removes HTML comments from the input text.
10
+ ///
11
+ /// This function uses a regex to strip out all HTML comment blocks (`<!-- ... -->`).
12
+ ///
13
+ /// # Arguments
14
+ /// * `text` - The input text that may contain HTML comments
15
+ ///
16
+ /// # Returns
17
+ /// A new `String` with all HTML comments removed
18
+ pub fn remove_html_comments(text: &str) -> String {
19
+ if HTML_COMMENT_REGEX.is_match(text) {
20
+ HTML_COMMENT_REGEX.replace_all(text, "").into_owned()
21
+ } else {
22
+ text.to_string()
23
+ }
24
+ }
25
+
26
#[cfg(test)]
mod tests {
    use super::*;

    /// An inline comment is stripped while the text on both sides survives.
    #[test]
    fn test_remove_html_comments() {
        let cleaned = remove_html_comments("Text before <!-- comment --> text after");

        assert!(!cleaned.contains("<!-- comment -->"));
        assert!(cleaned.contains("Text before"));
        assert!(cleaned.contains("text after"));
    }

    /// Text without any comment comes back unchanged.
    #[test]
    fn test_no_html_comments() {
        let plain = "Text without comments";
        assert_eq!(remove_html_comments(plain), plain);
    }

    /// Forces the lazy static so that an invalid pattern would panic in this test.
    #[test]
    fn test_lazy_regex_initialization() {
        let _forced = &*HTML_COMMENT_REGEX;
    }
}