kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,377 @@
1
+ use crate::text::utf8_validation;
2
+ use ahash::AHashSet;
3
+ use once_cell::sync::Lazy;
4
+ use regex::Regex;
5
+
6
+ /// Regular expression for matching excessive newlines (3 or more consecutive newlines).
7
+ static EXCESSIVE_NEWLINES_REGEX: Lazy<Regex> =
8
+ Lazy::new(|| Regex::new(r"\n{3,}").expect("Excessive newlines regex pattern is valid and should compile"));
9
+
10
+ /// Regular expression for matching multiple consecutive spaces (2 or more).
11
+ static MULTIPLE_SPACES_REGEX: Lazy<Regex> =
12
+ Lazy::new(|| Regex::new(r" {2,}").expect("Multiple spaces regex pattern is valid and should compile"));
13
+
14
+ /// Normalizes whitespace in text by collapsing multiple spaces into a single space.
15
+ ///
16
+ /// # Arguments
17
+ /// * `text` - The input text with potentially multiple consecutive spaces
18
+ ///
19
+ /// # Returns
20
+ /// A new `String` with multiple spaces collapsed to single spaces
21
+ pub fn normalize_spaces(text: &str) -> String {
22
+ if MULTIPLE_SPACES_REGEX.is_match(text) {
23
+ MULTIPLE_SPACES_REGEX.replace_all(text, " ").into_owned()
24
+ } else {
25
+ text.to_string()
26
+ }
27
+ }
28
+
29
+ /// Reduces excessive newlines in text by collapsing 3+ consecutive newlines into 2.
30
+ ///
31
+ /// # Arguments
32
+ /// * `text` - The input text with potentially excessive newlines
33
+ ///
34
+ /// # Returns
35
+ /// A new `String` with excessive newlines normalized to at most 2 consecutive newlines
36
+ pub fn normalize_newlines(text: &str) -> String {
37
+ if EXCESSIVE_NEWLINES_REGEX.is_match(text) {
38
+ EXCESSIVE_NEWLINES_REGEX.replace_all(text, "\n\n").into_owned()
39
+ } else {
40
+ text.to_string()
41
+ }
42
+ }
43
+
44
+ /// Removes stopwords from text while preserving important patterns.
45
+ ///
46
+ /// This function intelligently filters out common stopwords while preserving:
47
+ /// - All-uppercase words (acronyms)
48
+ /// - Words containing digits
49
+ /// - Words matching custom preserve patterns
50
+ /// - Single-letter words
51
+ /// - Words with non-alphabetic characters
52
+ ///
53
+ /// # Arguments
54
+ /// * `text` - The input text to filter
55
+ /// * `stopwords` - Set of stopwords to remove (should be lowercase)
56
+ /// * `preserve_patterns` - Regex patterns for words that should never be removed
57
+ ///
58
+ /// # Returns
59
+ /// A new `String` with stopwords removed
60
+ pub fn remove_stopwords(text: &str, stopwords: &AHashSet<String>, preserve_patterns: &[Regex]) -> String {
61
+ let words: Vec<&str> = text.split_whitespace().collect();
62
+ let mut filtered_words = Vec::with_capacity((words.len() as f32 * 0.7).ceil() as usize);
63
+
64
+ for word in words {
65
+ if word.is_empty() {
66
+ continue;
67
+ }
68
+
69
+ // Check preserve patterns first
70
+ if should_preserve_word(word, preserve_patterns) {
71
+ filtered_words.push(word);
72
+ continue;
73
+ }
74
+
75
+ // Preserve all-uppercase words (acronyms like API, SDK, HTTP)
76
+ if word.len() > 1 && word.bytes().all(|b| b.is_ascii_uppercase() || !b.is_ascii_alphabetic()) {
77
+ filtered_words.push(word);
78
+ continue;
79
+ }
80
+
81
+ // Preserve words containing digits (version numbers, counts, etc.)
82
+ if word.bytes().any(|b| b.is_ascii_digit()) {
83
+ filtered_words.push(word);
84
+ continue;
85
+ }
86
+
87
+ // Extract the alphabetic core of the word for stopword matching
88
+ let clean_word = if word.is_ascii() {
89
+ let clean_bytes: Vec<u8> = word
90
+ .bytes()
91
+ .filter(|&b| b.is_ascii_alphabetic())
92
+ .map(|b| b.to_ascii_lowercase())
93
+ .collect();
94
+ utf8_validation::string_from_utf8(clean_bytes).unwrap_or_else(|_| {
95
+ word.chars()
96
+ .filter(|c| c.is_alphabetic())
97
+ .collect::<String>()
98
+ .to_lowercase()
99
+ })
100
+ } else {
101
+ word.chars()
102
+ .filter(|c| c.is_alphabetic())
103
+ .collect::<String>()
104
+ .to_lowercase()
105
+ };
106
+
107
+ // If the clean word is empty (word was all punctuation), preserve it
108
+ if clean_word.is_empty() {
109
+ filtered_words.push(word);
110
+ continue;
111
+ }
112
+
113
+ // Preserve single-letter words
114
+ if clean_word.len() <= 1 {
115
+ filtered_words.push(word);
116
+ continue;
117
+ }
118
+
119
+ // Check if the clean word is a stopword
120
+ if !stopwords.contains(&clean_word) {
121
+ filtered_words.push(word);
122
+ }
123
+ }
124
+
125
+ filtered_words.join(" ")
126
+ }
127
+
128
+ /// Checks if a word should be preserved based on configured patterns.
129
+ ///
130
+ /// # Arguments
131
+ /// * `word` - The word to check
132
+ /// * `preserve_patterns` - Regex patterns for words that should be preserved
133
+ ///
134
+ /// # Returns
135
+ /// `true` if the word matches any preserve pattern, `false` otherwise
136
+ #[inline]
137
+ pub fn should_preserve_word(word: &str, preserve_patterns: &[Regex]) -> bool {
138
+ preserve_patterns.iter().any(|pattern| pattern.is_match(word))
139
+ }
140
+
141
/// Breaks a word into its leading non-alphanumeric part, its core, and its trailing
/// non-alphanumeric part.
///
/// The core spans from the first alphanumeric character to the last one, so punctuation
/// sitting between them stays inside the core. For example `"(hello)"` yields
/// `("(", "hello", ")")`. A word without any alphanumeric character ends up entirely in
/// the prefix.
///
/// Only compiled for tests at the moment.
///
/// # Arguments
/// * `word` - The word to split
///
/// # Returns
/// A tuple of owned `(prefix, core, suffix)` strings
#[cfg(test)]
pub fn split_word_boundaries(word: &str) -> (String, String, String) {
    let is_core = |c: char| c.is_alphanumeric();

    // Byte offset of the first alphanumeric char; the whole word is prefix when there is none.
    let core_start = word.find(is_core).unwrap_or(word.len());

    // Byte offset just past the last alphanumeric char; never smaller than `core_start`.
    let core_end = word
        .char_indices()
        .rev()
        .find(|&(_, c)| is_core(c))
        .map_or(core_start, |(idx, c)| idx + c.len_utf8());

    let (prefix, rest) = word.split_at(core_start);
    let (core, suffix) = rest.split_at(core_end - core_start);

    (prefix.to_owned(), core.to_owned(), suffix.to_owned())
}
173
+
174
/// Unit tests for the general text filters: whitespace normalization, stopword removal
/// and word-boundary splitting.
#[cfg(all(test, feature = "stopwords"))]
mod tests {
    use super::*;

    /// Builds the small lowercase English stopword set shared by the tests below.
    fn create_test_stopwords() -> AHashSet<String> {
        let mut set = AHashSet::new();
        set.insert("the".to_string());
        set.insert("is".to_string());
        set.insert("a".to_string());
        set.insert("and".to_string());
        set.insert("with".to_string());
        set.insert("by".to_string());
        set
    }

    /// Runs of spaces must be collapsed to a single space.
    #[test]
    fn test_normalize_spaces() {
        // The input has to contain runs of 2+ spaces, otherwise nothing is exercised.
        let input = "Text  with   multiple    spaces";
        let result = normalize_spaces(input);
        // No double space may survive; single spaces between words must remain.
        assert!(!result.contains("  "));
        assert!(result.contains("Text with multiple spaces"));
    }

    /// Text without repeated spaces is returned unchanged.
    #[test]
    fn test_normalize_spaces_no_change() {
        let input = "Text with single spaces";
        let result = normalize_spaces(input);
        assert_eq!(result, input);
    }

    /// Runs of three or more newlines must not survive normalization.
    #[test]
    fn test_normalize_newlines() {
        let input = "Paragraph 1\n\n\n\n\nParagraph 2";
        let result = normalize_newlines(input);
        assert!(!result.contains("\n\n\n"));
        assert!(result.contains("Paragraph 1"));
        assert!(result.contains("Paragraph 2"));
    }

    /// A double newline is within the allowed limit and is left alone.
    #[test]
    fn test_normalize_newlines_no_change() {
        let input = "Paragraph 1\n\nParagraph 2";
        let result = normalize_newlines(input);
        assert_eq!(result, input);
    }

    /// Plain lowercase stopwords are dropped while content words stay.
    #[test]
    fn test_remove_stopwords() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        let input = "The quick brown fox is jumping over the lazy dog";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(!result.contains(" the "));
        assert!(!result.contains(" is "));
        assert!(result.contains("quick"));
        assert!(result.contains("brown"));
        assert!(result.contains("fox"));
    }

    /// All-uppercase tokens are kept even when their lowercase form is a stopword.
    #[test]
    fn test_remove_stopwords_preserves_uppercase() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        let input = "The API is working WITH the SDK";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("API"));
        assert!(result.contains("SDK"));
        assert!(result.contains("WITH"));
        assert!(!result.contains("The "));
        assert!(!result.contains(" is "));
    }

    /// Tokens containing digits are always kept.
    #[test]
    fn test_remove_stopwords_preserves_numbers() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        let input = "The version is 3.14 and the count is 42";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("3.14"));
        assert!(result.contains("42"));
        assert!(result.contains("version"));
        assert!(result.contains("count"));
    }

    /// Punctuation attached to a kept word stays attached in the output.
    #[cfg_attr(coverage, ignore = "coverage instrumentation disables SIMD stopword paths")]
    #[test]
    fn test_remove_stopwords_handles_punctuation() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        let input = "Hello, the world! This is great.";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("Hello,"));
        assert!(result.contains("world!"));
        assert!(result.contains("great."));
    }

    /// Single-letter words are kept.
    #[test]
    fn test_remove_stopwords_single_letter() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![];

        let input = "I a x test";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("I"));
        assert!(result.contains("x"));
    }

    /// Tokens matching a preserve pattern survive; ordinary stopwords are still removed.
    #[test]
    fn test_preserve_patterns() {
        let stopwords = create_test_stopwords();
        let preserve_patterns = vec![
            Regex::new(r"\b[A-Z]{2,}\b").unwrap(),
            Regex::new(r"\b\d+\.\d+\.\d+\b").unwrap(),
            Regex::new(r"@\w+").unwrap(),
        ];

        let input = "The NASA and HTTP protocols version 1.2.3 by @john";
        let result = remove_stopwords(input, &stopwords, &preserve_patterns);

        assert!(result.contains("NASA"));
        assert!(result.contains("HTTP"));
        assert!(result.contains("1.2.3"));
        assert!(result.contains("@john"));

        assert!(!result.contains(" the "));
        assert!(!result.contains(" and "));
        assert!(!result.contains(" by "));
    }

    /// `should_preserve_word` reports a match only for words hitting a pattern.
    #[test]
    fn test_should_preserve_word() {
        let patterns = vec![Regex::new(r"\b[A-Z]{2,}\b").unwrap()];

        assert!(should_preserve_word("NASA", &patterns));
        assert!(should_preserve_word("HTTP", &patterns));
        assert!(!should_preserve_word("hello", &patterns));
    }

    /// Leading and trailing punctuation is separated from the alphanumeric core.
    #[test]
    fn test_split_word_boundaries() {
        let (prefix, core, suffix) = split_word_boundaries("(hello)");
        assert_eq!(prefix, "(");
        assert_eq!(core, "hello");
        assert_eq!(suffix, ")");

        let (prefix2, core2, suffix2) = split_word_boundaries("world!");
        assert_eq!(prefix2, "");
        assert_eq!(core2, "world");
        assert_eq!(suffix2, "!");

        let (prefix3, core3, suffix3) = split_word_boundaries("'test");
        assert_eq!(prefix3, "'");
        assert_eq!(core3, "test");
        assert_eq!(suffix3, "");

        let (prefix4, core4, suffix4) = split_word_boundaries("simple");
        assert_eq!(prefix4, "");
        assert_eq!(core4, "simple");
        assert_eq!(suffix4, "");

        let (prefix5, core5, suffix5) = split_word_boundaries("\"example!!!\"");
        assert_eq!(prefix5, "\"");
        assert_eq!(core5, "example");
        assert_eq!(suffix5, "!!!\"");
    }

    /// Punctuation-only, empty, single-letter and non-ASCII inputs.
    #[test]
    fn test_split_word_boundaries_edge_cases() {
        // Without any alphanumeric character everything lands in the prefix.
        let (prefix, core, suffix) = split_word_boundaries("!!!");
        assert_eq!(prefix, "!!!");
        assert_eq!(core, "");
        assert_eq!(suffix, "");

        let (prefix2, core2, suffix2) = split_word_boundaries("");
        assert_eq!(prefix2, "");
        assert_eq!(core2, "");
        assert_eq!(suffix2, "");

        let (prefix3, core3, suffix3) = split_word_boundaries("a");
        assert_eq!(prefix3, "");
        assert_eq!(core3, "a");
        assert_eq!(suffix3, "");

        let (prefix4, core4, suffix4) = split_word_boundaries("(café)");
        assert_eq!(prefix4, "(");
        assert_eq!(core4, "café");
        assert_eq!(suffix4, ")");
    }

    /// Forces both lazily compiled patterns so an invalid pattern would panic here.
    #[test]
    fn test_lazy_regex_initialization() {
        let _ = &*EXCESSIVE_NEWLINES_REGEX;
        let _ = &*MULTIPLE_SPACES_REGEX;
    }
}
@@ -0,0 +1,51 @@
1
+ use once_cell::sync::Lazy;
2
+ use regex::Regex;
3
+
4
+ /// Regular expression for matching HTML comments.
5
+ /// Matches the pattern `<!-- ... -->` for removing HTML comments from text.
6
+ static HTML_COMMENT_REGEX: Lazy<Regex> =
7
+ Lazy::new(|| Regex::new(r"<!--.*?-->").expect("HTML comment regex pattern is valid and should compile"));
8
+
9
+ /// Removes HTML comments from the input text.
10
+ ///
11
+ /// This function uses a regex to strip out all HTML comment blocks (`<!-- ... -->`).
12
+ ///
13
+ /// # Arguments
14
+ /// * `text` - The input text that may contain HTML comments
15
+ ///
16
+ /// # Returns
17
+ /// A new `String` with all HTML comments removed
18
+ pub fn remove_html_comments(text: &str) -> String {
19
+ if HTML_COMMENT_REGEX.is_match(text) {
20
+ HTML_COMMENT_REGEX.replace_all(text, "").into_owned()
21
+ } else {
22
+ text.to_string()
23
+ }
24
+ }
25
+
26
/// Unit tests for HTML comment removal.
#[cfg(test)]
mod tests {
    use super::*;

    /// A comment in the middle of a sentence is removed; the text on both sides stays.
    #[test]
    fn test_remove_html_comments() {
        let stripped = remove_html_comments("Text before <!-- comment --> text after");

        assert!(!stripped.contains("<!-- comment -->"));
        for fragment in ["Text before", "text after"] {
            assert!(stripped.contains(fragment));
        }
    }

    /// Text without any comment comes back unchanged.
    #[test]
    fn test_no_html_comments() {
        let plain = "Text without comments";
        assert_eq!(remove_html_comments(plain), plain);
    }

    /// Forces the lazily compiled pattern so an invalid pattern would panic here.
    #[test]
    fn test_lazy_regex_initialization() {
        let _forced: &Regex = &HTML_COMMENT_REGEX;
    }
}