kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312):
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,384 @@
1
+ use crate::error::Result;
2
+ use crate::text::token_reduction::{
3
+ config::{ReductionLevel, TokenReductionConfig},
4
+ filters::FilterPipeline,
5
+ semantic::SemanticAnalyzer,
6
+ simd_text::{SimdTextProcessor, chunk_text_for_parallel},
7
+ };
8
+ use rayon::prelude::*;
9
+ use std::sync::Arc;
10
+ use unicode_normalization::UnicodeNormalization;
11
+
12
+ use super::punctuation::PunctuationCleaner;
13
+ use super::sentence_selection::SentenceSelector;
14
+ use super::word_filtering::WordFilter;
15
+
16
+ pub struct TokenReducer {
17
+ config: Arc<TokenReductionConfig>,
18
+ text_processor: SimdTextProcessor,
19
+ filter_pipeline: FilterPipeline,
20
+ semantic_analyzer: Option<SemanticAnalyzer>,
21
+ word_filter: WordFilter,
22
+ language: String,
23
+ }
24
+
25
+ impl TokenReducer {
26
+ pub fn new(config: &TokenReductionConfig, language_hint: Option<&str>) -> Result<Self> {
27
+ let config = Arc::new(config.clone());
28
+ let language = language_hint
29
+ .or(config.language_hint.as_deref())
30
+ .unwrap_or("en")
31
+ .to_string();
32
+
33
+ let text_processor = SimdTextProcessor::new();
34
+ let filter_pipeline = FilterPipeline::new(&config, &language)?;
35
+
36
+ let semantic_analyzer = if matches!(config.level, ReductionLevel::Aggressive | ReductionLevel::Maximum) {
37
+ Some(SemanticAnalyzer::new(&language))
38
+ } else {
39
+ None
40
+ };
41
+
42
+ Ok(Self {
43
+ config,
44
+ text_processor,
45
+ filter_pipeline,
46
+ semantic_analyzer,
47
+ word_filter: WordFilter::new(),
48
+ language,
49
+ })
50
+ }
51
+
52
+ /// Get the language code being used for stopwords and semantic analysis.
53
+ pub fn language(&self) -> &str {
54
+ &self.language
55
+ }
56
+
57
+ pub fn reduce(&self, text: &str) -> String {
58
+ if text.is_empty() || matches!(self.config.level, ReductionLevel::Off) {
59
+ return text.to_string();
60
+ }
61
+
62
+ let nfc_string;
63
+ let working_text = if text.is_ascii() {
64
+ text
65
+ } else {
66
+ nfc_string = text.nfc().collect::<String>();
67
+ &nfc_string
68
+ };
69
+
70
+ match self.config.level {
71
+ ReductionLevel::Off => working_text.to_string(),
72
+ ReductionLevel::Light => self.apply_light_reduction_optimized(working_text),
73
+ ReductionLevel::Moderate => self.apply_moderate_reduction_optimized(working_text),
74
+ ReductionLevel::Aggressive => self.apply_aggressive_reduction_optimized(working_text),
75
+ ReductionLevel::Maximum => self.apply_maximum_reduction_optimized(working_text),
76
+ }
77
+ }
78
+
79
+ pub fn batch_reduce(&self, texts: &[&str]) -> Vec<String> {
80
+ if !self.config.enable_parallel || texts.len() < 2 {
81
+ return texts.iter().map(|text| self.reduce(text)).collect();
82
+ }
83
+
84
+ texts.par_iter().map(|text| self.reduce(text)).collect()
85
+ }
86
+
87
+ fn apply_light_reduction_optimized(&self, text: &str) -> String {
88
+ let mut result = if self.config.use_simd {
89
+ self.text_processor.clean_punctuation(text)
90
+ } else {
91
+ PunctuationCleaner::clean_punctuation_optimized(text)
92
+ };
93
+
94
+ result = self.filter_pipeline.apply_light_filters(&result);
95
+ result.trim().to_string()
96
+ }
97
+
98
+ fn apply_moderate_reduction_optimized(&self, text: &str) -> String {
99
+ let mut result = self.apply_light_reduction_optimized(text);
100
+
101
+ result = if self.config.enable_parallel && text.len() > 1000 {
102
+ self.apply_parallel_moderate_reduction(&result)
103
+ } else {
104
+ self.filter_pipeline.apply_moderate_filters(&result)
105
+ };
106
+
107
+ result
108
+ }
109
+
110
+ fn apply_aggressive_reduction_optimized(&self, text: &str) -> String {
111
+ let mut result = self.apply_moderate_reduction_optimized(text);
112
+
113
+ result = self.word_filter.remove_additional_common_words(&result);
114
+ result = SentenceSelector::apply_sentence_selection(&result);
115
+
116
+ if let Some(ref analyzer) = self.semantic_analyzer {
117
+ result = analyzer.apply_semantic_filtering(&result, self.config.semantic_threshold);
118
+ }
119
+
120
+ result
121
+ }
122
+
123
+ fn apply_maximum_reduction_optimized(&self, text: &str) -> String {
124
+ let mut result = self.apply_aggressive_reduction_optimized(text);
125
+
126
+ if let Some(ref analyzer) = self.semantic_analyzer
127
+ && self.config.enable_semantic_clustering
128
+ {
129
+ result = analyzer.apply_hypernym_compression(&result, self.config.target_reduction);
130
+ }
131
+
132
+ result
133
+ }
134
+
135
+ fn apply_parallel_moderate_reduction(&self, text: &str) -> String {
136
+ let num_threads = rayon::current_num_threads();
137
+ let chunks = chunk_text_for_parallel(text, num_threads);
138
+
139
+ let processed_chunks: Vec<String> = chunks
140
+ .par_iter()
141
+ .map(|chunk| self.filter_pipeline.apply_moderate_filters(chunk))
142
+ .collect();
143
+
144
+ processed_chunks.join(" ")
145
+ }
146
+ }
147
+
148
+ #[cfg(test)]
149
+ mod tests {
150
+ use super::*;
151
+
152
+ #[test]
153
+ fn test_light_reduction() {
154
+ let config = TokenReductionConfig {
155
+ level: ReductionLevel::Light,
156
+ use_simd: false,
157
+ ..Default::default()
158
+ };
159
+
160
+ let reducer = TokenReducer::new(&config, None).unwrap();
161
+ let input = "Hello world!!! How are you???";
162
+ let result = reducer.reduce(input);
163
+
164
+ assert!(result.len() < input.len());
165
+ assert!(!result.contains(" "));
166
+ }
167
+
168
+ #[test]
169
+ fn test_moderate_reduction() {
170
+ let config = TokenReductionConfig {
171
+ level: ReductionLevel::Moderate,
172
+ use_simd: false,
173
+ ..Default::default()
174
+ };
175
+
176
+ let reducer = TokenReducer::new(&config, Some("en")).unwrap();
177
+ let input = "The quick brown fox is jumping over the lazy dog";
178
+ let result = reducer.reduce(input);
179
+
180
+ assert!(result.len() < input.len());
181
+ assert!(result.contains("quick"));
182
+ assert!(result.contains("brown"));
183
+ assert!(result.contains("fox"));
184
+ }
185
+
186
+ #[test]
187
+ fn test_batch_processing() {
188
+ let config = TokenReductionConfig {
189
+ level: ReductionLevel::Light,
190
+ enable_parallel: false,
191
+ ..Default::default()
192
+ };
193
+
194
+ let reducer = TokenReducer::new(&config, None).unwrap();
195
+ let inputs = vec!["Hello world!", "How are you?", "Fine, thanks!"];
196
+ let results = reducer.batch_reduce(&inputs);
197
+
198
+ assert_eq!(results.len(), inputs.len());
199
+ for result in &results {
200
+ assert!(!result.contains(" "));
201
+ }
202
+ }
203
+
204
+ #[test]
205
+ fn test_aggressive_reduction() {
206
+ let config = TokenReductionConfig {
207
+ level: ReductionLevel::Aggressive,
208
+ use_simd: false,
209
+ ..Default::default()
210
+ };
211
+
212
+ let reducer = TokenReducer::new(&config, Some("en")).unwrap();
213
+ let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
214
+ let result = reducer.reduce(input);
215
+
216
+ assert!(result.len() < input.len());
217
+ assert!(!result.is_empty());
218
+ }
219
+
220
+ #[test]
221
+ fn test_maximum_reduction() {
222
+ let config = TokenReductionConfig {
223
+ level: ReductionLevel::Maximum,
224
+ use_simd: false,
225
+ enable_semantic_clustering: true,
226
+ ..Default::default()
227
+ };
228
+
229
+ let reducer = TokenReducer::new(&config, Some("en")).unwrap();
230
+ let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
231
+ let result = reducer.reduce(input);
232
+
233
+ assert!(result.len() < input.len());
234
+ assert!(!result.is_empty());
235
+ }
236
+
237
+ #[test]
238
+ fn test_empty_text_handling() {
239
+ let config = TokenReductionConfig {
240
+ level: ReductionLevel::Moderate,
241
+ ..Default::default()
242
+ };
243
+
244
+ let reducer = TokenReducer::new(&config, None).unwrap();
245
+ assert_eq!(reducer.reduce(""), "");
246
+ let result = reducer.reduce(" ");
247
+ assert!(result == " " || result.is_empty());
248
+ }
249
+
250
+ #[test]
251
+ fn test_off_mode_preserves_text() {
252
+ let config = TokenReductionConfig {
253
+ level: ReductionLevel::Off,
254
+ ..Default::default()
255
+ };
256
+
257
+ let reducer = TokenReducer::new(&config, None).unwrap();
258
+ let input = "Text with multiple spaces!!!";
259
+ assert_eq!(reducer.reduce(input), input);
260
+ }
261
+
262
+ #[test]
263
+ fn test_parallel_batch_processing() {
264
+ let config = TokenReductionConfig {
265
+ level: ReductionLevel::Light,
266
+ enable_parallel: true,
267
+ ..Default::default()
268
+ };
269
+
270
+ let reducer = TokenReducer::new(&config, None).unwrap();
271
+ let inputs = vec![
272
+ "First text with spaces",
273
+ "Second text with spaces",
274
+ "Third text with spaces",
275
+ ];
276
+ let results = reducer.batch_reduce(&inputs);
277
+
278
+ assert_eq!(results.len(), inputs.len());
279
+ for result in &results {
280
+ assert!(!result.contains(" "));
281
+ }
282
+ }
283
+
284
+ #[test]
285
+ fn test_cjk_text_handling() {
286
+ let config = TokenReductionConfig {
287
+ level: ReductionLevel::Moderate,
288
+ ..Default::default()
289
+ };
290
+
291
+ let reducer = TokenReducer::new(&config, Some("zh")).unwrap();
292
+ let input = "这是中文文本测试";
293
+ let result = reducer.reduce(input);
294
+
295
+ assert!(!result.is_empty());
296
+ }
297
+
298
+ #[test]
299
+ fn test_mixed_language_text() {
300
+ let config = TokenReductionConfig {
301
+ level: ReductionLevel::Moderate,
302
+ ..Default::default()
303
+ };
304
+
305
+ let reducer = TokenReducer::new(&config, None).unwrap();
306
+ let input = "This is English text 这是中文 and some more English";
307
+ let result = reducer.reduce(input);
308
+
309
+ assert!(!result.is_empty());
310
+ assert!(result.contains("English") || result.contains("中"));
311
+ }
312
+
313
+ #[test]
314
+ fn test_unicode_normalization_ascii() {
315
+ let config = TokenReductionConfig {
316
+ level: ReductionLevel::Light,
317
+ ..Default::default()
318
+ };
319
+
320
+ let reducer = TokenReducer::new(&config, None).unwrap();
321
+ let input = "Pure ASCII text without special characters";
322
+ let result = reducer.reduce(input);
323
+
324
+ assert!(result.contains("ASCII"));
325
+ }
326
+
327
+ #[test]
328
+ fn test_unicode_normalization_non_ascii() {
329
+ let config = TokenReductionConfig {
330
+ level: ReductionLevel::Light,
331
+ ..Default::default()
332
+ };
333
+
334
+ let reducer = TokenReducer::new(&config, None).unwrap();
335
+ let input = "Café naïve résumé";
336
+ let result = reducer.reduce(input);
337
+
338
+ assert!(result.contains("Café") || result.contains("Cafe"));
339
+ }
340
+
341
+ #[test]
342
+ fn test_single_text_vs_batch() {
343
+ let config = TokenReductionConfig {
344
+ level: ReductionLevel::Moderate,
345
+ ..Default::default()
346
+ };
347
+
348
+ let reducer = TokenReducer::new(&config, None).unwrap();
349
+ let text = "The quick brown fox jumps over the lazy dog";
350
+
351
+ let single_result = reducer.reduce(text);
352
+ let batch_results = reducer.batch_reduce(&[text]);
353
+
354
+ assert_eq!(single_result, batch_results[0]);
355
+ }
356
+
357
+ #[test]
358
+ fn test_important_word_preservation() {
359
+ let config = TokenReductionConfig {
360
+ level: ReductionLevel::Aggressive,
361
+ ..Default::default()
362
+ };
363
+
364
+ let reducer = TokenReducer::new(&config, None).unwrap();
365
+ let input = "The IMPORTANT word COVID-19 and 12345 numbers should be preserved";
366
+ let result = reducer.reduce(input);
367
+
368
+ assert!(result.contains("IMPORTANT") || result.contains("COVID") || result.contains("12345"));
369
+ }
370
+
371
+ #[test]
372
+ fn test_technical_terms_preservation() {
373
+ let config = TokenReductionConfig {
374
+ level: ReductionLevel::Aggressive,
375
+ ..Default::default()
376
+ };
377
+
378
+ let reducer = TokenReducer::new(&config, None).unwrap();
379
+ let input = "The implementation uses PyTorch and TensorFlow frameworks";
380
+ let result = reducer.reduce(input);
381
+
382
+ assert!(result.contains("PyTorch") || result.contains("TensorFlow"));
383
+ }
384
+ }
@@ -0,0 +1,68 @@
1
+ use super::analysis::TextAnalyzer;
2
+
3
+ /// Handles sentence selection and filtering based on importance scoring.
4
+ pub struct SentenceSelector;
5
+
6
+ impl SentenceSelector {
7
+ /// Applies sentence selection to keep only the most important sentences.
8
+ pub fn apply_sentence_selection(text: &str) -> String {
9
+ let sentences: Vec<&str> = text
10
+ .split(['.', '!', '?'])
11
+ .map(|s| s.trim())
12
+ .filter(|s| !s.is_empty())
13
+ .collect();
14
+
15
+ if sentences.len() <= 2 {
16
+ return text.to_string();
17
+ }
18
+
19
+ let mut scored_sentences: Vec<(usize, f32, &str)> = sentences
20
+ .iter()
21
+ .enumerate()
22
+ .map(|(i, sentence)| {
23
+ let score = TextAnalyzer::score_sentence_importance(sentence, i, sentences.len());
24
+ (i, score, *sentence)
25
+ })
26
+ .collect();
27
+
28
+ scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
29
+
30
+ let keep_count = ((sentences.len() as f32 * 0.4).ceil() as usize).max(1);
31
+ let mut selected_indices: Vec<usize> = scored_sentences[..keep_count].iter().map(|(i, _, _)| *i).collect();
32
+
33
+ selected_indices.sort();
34
+
35
+ let selected_sentences: Vec<&str> = selected_indices
36
+ .iter()
37
+ .filter_map(|&i| sentences.get(i))
38
+ .copied()
39
+ .collect();
40
+
41
+ if selected_sentences.is_empty() {
42
+ text.to_string()
43
+ } else {
44
+ selected_sentences.join(". ")
45
+ }
46
+ }
47
+ }
48
+
49
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sentence_selection() {
        let text = "First sentence here. Second sentence with more words. Third one. Fourth sentence is even longer than the others.";
        let reduced = SentenceSelector::apply_sentence_selection(text);

        // Selection must shrink the text and drop at least one sentence.
        assert!(reduced.len() < text.len());
        assert!(reduced.split(". ").count() < 4);
    }

    #[test]
    fn test_sentence_selection_short_text() {
        // Texts with two or fewer sentences are returned verbatim.
        let text = "Only one sentence.";
        assert_eq!(SentenceSelector::apply_sentence_selection(text), text);
    }
}
@@ -0,0 +1,156 @@
1
+ use crate::text::token_reduction::cjk_utils::CjkTokenizer;
2
+ use ahash::AHashMap;
3
+
4
+ use super::analysis::TextAnalyzer;
5
+
6
+ /// Handles word filtering and token removal operations.
7
+ pub struct WordFilter {
8
+ cjk_tokenizer: CjkTokenizer,
9
+ }
10
+
11
+ impl WordFilter {
12
+ pub fn new() -> Self {
13
+ Self {
14
+ cjk_tokenizer: CjkTokenizer::new(),
15
+ }
16
+ }
17
+
18
+ /// Removes additional common words based on frequency and characteristics.
19
+ pub fn remove_additional_common_words(&self, text: &str) -> String {
20
+ let words = self.universal_tokenize(text);
21
+
22
+ if words.len() < 4 {
23
+ return text.to_string();
24
+ }
25
+
26
+ let estimated_unique = (words.len() as f32 * 0.7).ceil() as usize;
27
+ let mut word_freq = AHashMap::with_capacity(estimated_unique);
28
+
29
+ let mut word_lengths = Vec::with_capacity(words.len());
30
+
31
+ for word in &words {
32
+ let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
33
+ word.to_lowercase()
34
+ } else {
35
+ word.chars()
36
+ .filter(|c| c.is_alphabetic())
37
+ .collect::<String>()
38
+ .to_lowercase()
39
+ };
40
+
41
+ if !clean_word.is_empty() {
42
+ *word_freq.entry(clean_word.clone()).or_insert(0) += 1;
43
+ word_lengths.push(clean_word.chars().count());
44
+ }
45
+ }
46
+
47
+ let avg_length = if !word_lengths.is_empty() {
48
+ word_lengths.iter().sum::<usize>() as f32 / word_lengths.len() as f32
49
+ } else {
50
+ 5.0
51
+ };
52
+
53
+ let original_count = words.len();
54
+ let has_cjk_content = text.chars().any(|c| c as u32 >= 0x4E00 && (c as u32) <= 0x9FFF);
55
+
56
+ let mut filtered_words = Vec::with_capacity(words.len());
57
+ for word in &words {
58
+ let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
59
+ word.to_lowercase()
60
+ } else {
61
+ word.chars()
62
+ .filter(|c| c.is_alphabetic())
63
+ .collect::<String>()
64
+ .to_lowercase()
65
+ };
66
+
67
+ if clean_word.is_empty() {
68
+ filtered_words.push(word.clone());
69
+ } else {
70
+ let freq = word_freq.get(&clean_word).unwrap_or(&0);
71
+ let word_len = clean_word.chars().count() as f32;
72
+
73
+ if TextAnalyzer::has_important_characteristics(word)
74
+ || (*freq <= 2 && word_len >= avg_length * 0.8)
75
+ || (word_len >= avg_length * 1.5)
76
+ {
77
+ filtered_words.push(word.clone());
78
+ }
79
+ }
80
+ }
81
+
82
+ let fallback_threshold = if has_cjk_content {
83
+ original_count / 5
84
+ } else {
85
+ original_count / 3
86
+ };
87
+
88
+ if filtered_words.len() < fallback_threshold {
89
+ let mut fallback_words = Vec::with_capacity(words.len());
90
+ for word in &words {
91
+ let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
92
+ word.to_lowercase()
93
+ } else {
94
+ word.chars().filter(|c| c.is_alphabetic()).collect::<String>()
95
+ };
96
+
97
+ if clean_word.is_empty()
98
+ || clean_word.chars().count() >= 3
99
+ || TextAnalyzer::has_important_characteristics(word)
100
+ {
101
+ fallback_words.push(word.clone());
102
+ }
103
+ }
104
+ self.smart_join(&fallback_words, has_cjk_content)
105
+ } else {
106
+ self.smart_join(&filtered_words, has_cjk_content)
107
+ }
108
+ }
109
+
110
+ /// Smart joins tokens based on language type (CJK vs. other).
111
+ pub fn smart_join(&self, tokens: &[String], has_cjk_content: bool) -> String {
112
+ if has_cjk_content {
113
+ tokens.join("")
114
+ } else {
115
+ tokens.join(" ")
116
+ }
117
+ }
118
+
119
+ /// Universal tokenizer that handles both CJK and non-CJK text.
120
+ pub fn universal_tokenize(&self, text: &str) -> Vec<String> {
121
+ self.cjk_tokenizer.tokenize_mixed_text(text)
122
+ }
123
+ }
124
+
125
+ impl Default for WordFilter {
126
+ fn default() -> Self {
127
+ Self::new()
128
+ }
129
+ }
130
+
131
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_universal_tokenize_english() {
        let tokens = WordFilter::new().universal_tokenize("hello world test");
        assert_eq!(tokens, vec!["hello", "world", "test"]);
    }

    #[test]
    fn test_universal_tokenize_cjk() {
        // CJK input must still yield at least one token.
        assert!(!WordFilter::new().universal_tokenize("中文").is_empty());
    }

    #[test]
    fn test_fallback_threshold() {
        // A stopword-only input triggers the fallback path; the result must
        // never be emptied out entirely.
        let reduced = WordFilter::new().remove_additional_common_words("a the is of to in for on at by");
        assert!(!reduced.is_empty());
    }
}