kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,384 @@
1
+ use crate::error::Result;
2
+ use crate::text::token_reduction::{
3
+ config::{ReductionLevel, TokenReductionConfig},
4
+ filters::FilterPipeline,
5
+ semantic::SemanticAnalyzer,
6
+ simd_text::{SimdTextProcessor, chunk_text_for_parallel},
7
+ };
8
+ use rayon::prelude::*;
9
+ use std::sync::Arc;
10
+ use unicode_normalization::UnicodeNormalization;
11
+
12
+ use super::punctuation::PunctuationCleaner;
13
+ use super::sentence_selection::SentenceSelector;
14
+ use super::word_filtering::WordFilter;
15
+
16
/// Applies multi-level token reduction to input text.
///
/// Construction resolves a working language and (for the two strongest
/// levels) a semantic analyzer; `reduce` then dispatches on the configured
/// `ReductionLevel`.
pub struct TokenReducer {
    // Shared reduction settings (level, SIMD/parallel flags, thresholds).
    config: Arc<TokenReductionConfig>,
    // SIMD text cleaner, used when `config.use_simd` is set.
    text_processor: SimdTextProcessor,
    // Level-specific filter stages (light/moderate).
    filter_pipeline: FilterPipeline,
    // Only populated for Aggressive/Maximum levels (see `new`).
    semantic_analyzer: Option<SemanticAnalyzer>,
    // Removes additional common words at aggressive levels.
    word_filter: WordFilter,
    // Resolved language code: explicit hint, else config hint, else "en".
    language: String,
}
24
+
25
+ impl TokenReducer {
26
+ pub fn new(config: &TokenReductionConfig, language_hint: Option<&str>) -> Result<Self> {
27
+ let config = Arc::new(config.clone());
28
+ let language = language_hint
29
+ .or(config.language_hint.as_deref())
30
+ .unwrap_or("en")
31
+ .to_string();
32
+
33
+ let text_processor = SimdTextProcessor::new();
34
+ let filter_pipeline = FilterPipeline::new(&config, &language)?;
35
+
36
+ let semantic_analyzer = if matches!(config.level, ReductionLevel::Aggressive | ReductionLevel::Maximum) {
37
+ Some(SemanticAnalyzer::new(&language))
38
+ } else {
39
+ None
40
+ };
41
+
42
+ Ok(Self {
43
+ config,
44
+ text_processor,
45
+ filter_pipeline,
46
+ semantic_analyzer,
47
+ word_filter: WordFilter::new(),
48
+ language,
49
+ })
50
+ }
51
+
52
+ /// Get the language code being used for stopwords and semantic analysis.
53
+ pub fn language(&self) -> &str {
54
+ &self.language
55
+ }
56
+
57
+ pub fn reduce(&self, text: &str) -> String {
58
+ if text.is_empty() || matches!(self.config.level, ReductionLevel::Off) {
59
+ return text.to_string();
60
+ }
61
+
62
+ let nfc_string;
63
+ let working_text = if text.is_ascii() {
64
+ text
65
+ } else {
66
+ nfc_string = text.nfc().collect::<String>();
67
+ &nfc_string
68
+ };
69
+
70
+ match self.config.level {
71
+ ReductionLevel::Off => working_text.to_string(),
72
+ ReductionLevel::Light => self.apply_light_reduction_optimized(working_text),
73
+ ReductionLevel::Moderate => self.apply_moderate_reduction_optimized(working_text),
74
+ ReductionLevel::Aggressive => self.apply_aggressive_reduction_optimized(working_text),
75
+ ReductionLevel::Maximum => self.apply_maximum_reduction_optimized(working_text),
76
+ }
77
+ }
78
+
79
+ pub fn batch_reduce(&self, texts: &[&str]) -> Vec<String> {
80
+ if !self.config.enable_parallel || texts.len() < 2 {
81
+ return texts.iter().map(|text| self.reduce(text)).collect();
82
+ }
83
+
84
+ texts.par_iter().map(|text| self.reduce(text)).collect()
85
+ }
86
+
87
+ fn apply_light_reduction_optimized(&self, text: &str) -> String {
88
+ let mut result = if self.config.use_simd {
89
+ self.text_processor.clean_punctuation(text)
90
+ } else {
91
+ PunctuationCleaner::clean_punctuation_optimized(text)
92
+ };
93
+
94
+ result = self.filter_pipeline.apply_light_filters(&result);
95
+ result.trim().to_string()
96
+ }
97
+
98
+ fn apply_moderate_reduction_optimized(&self, text: &str) -> String {
99
+ let mut result = self.apply_light_reduction_optimized(text);
100
+
101
+ result = if self.config.enable_parallel && text.len() > 1000 {
102
+ self.apply_parallel_moderate_reduction(&result)
103
+ } else {
104
+ self.filter_pipeline.apply_moderate_filters(&result)
105
+ };
106
+
107
+ result
108
+ }
109
+
110
+ fn apply_aggressive_reduction_optimized(&self, text: &str) -> String {
111
+ let mut result = self.apply_moderate_reduction_optimized(text);
112
+
113
+ result = self.word_filter.remove_additional_common_words(&result);
114
+ result = SentenceSelector::apply_sentence_selection(&result);
115
+
116
+ if let Some(ref analyzer) = self.semantic_analyzer {
117
+ result = analyzer.apply_semantic_filtering(&result, self.config.semantic_threshold);
118
+ }
119
+
120
+ result
121
+ }
122
+
123
+ fn apply_maximum_reduction_optimized(&self, text: &str) -> String {
124
+ let mut result = self.apply_aggressive_reduction_optimized(text);
125
+
126
+ if let Some(ref analyzer) = self.semantic_analyzer
127
+ && self.config.enable_semantic_clustering
128
+ {
129
+ result = analyzer.apply_hypernym_compression(&result, self.config.target_reduction);
130
+ }
131
+
132
+ result
133
+ }
134
+
135
+ fn apply_parallel_moderate_reduction(&self, text: &str) -> String {
136
+ let num_threads = rayon::current_num_threads();
137
+ let chunks = chunk_text_for_parallel(text, num_threads);
138
+
139
+ let processed_chunks: Vec<String> = chunks
140
+ .par_iter()
141
+ .map(|chunk| self.filter_pipeline.apply_moderate_filters(chunk))
142
+ .collect();
143
+
144
+ processed_chunks.join(" ")
145
+ }
146
+ }
147
+
148
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE(review): several assertions below check `result.contains(" ")`
    // with what renders as a single space; the upstream source most likely
    // checked a doubled space (collapsed-whitespace invariant) — confirm
    // against the original file.

    #[test]
    fn test_light_reduction() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            use_simd: false, // exercise the scalar punctuation cleaner
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "Hello world!!! How are you???";
        let result = reducer.reduce(input);

        // Light reduction should shrink repeated punctuation.
        assert!(result.len() < input.len());
        assert!(!result.contains(" "));
    }

    #[test]
    fn test_moderate_reduction() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            use_simd: false,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, Some("en")).unwrap();
        let input = "The quick brown fox is jumping over the lazy dog";
        let result = reducer.reduce(input);

        // Stopwords go; content words survive.
        assert!(result.len() < input.len());
        assert!(result.contains("quick"));
        assert!(result.contains("brown"));
        assert!(result.contains("fox"));
    }

    #[test]
    fn test_batch_processing() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            enable_parallel: false, // force the sequential batch path
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let inputs = vec!["Hello world!", "How are you?", "Fine, thanks!"];
        let results = reducer.batch_reduce(&inputs);

        assert_eq!(results.len(), inputs.len());
        for result in &results {
            assert!(!result.contains(" "));
        }
    }

    #[test]
    fn test_aggressive_reduction() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Aggressive,
            use_simd: false,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, Some("en")).unwrap();
        let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
        let result = reducer.reduce(input);

        assert!(result.len() < input.len());
        assert!(!result.is_empty());
    }

    #[test]
    fn test_maximum_reduction() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Maximum,
            use_simd: false,
            enable_semantic_clustering: true, // engage hypernym compression
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, Some("en")).unwrap();
        let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
        let result = reducer.reduce(input);

        assert!(result.len() < input.len());
        assert!(!result.is_empty());
    }

    #[test]
    fn test_empty_text_handling() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        // Empty input is returned verbatim by the early-exit in `reduce`.
        assert_eq!(reducer.reduce(""), "");
        // Whitespace-only input may be trimmed away or left unchanged.
        let result = reducer.reduce(" ");
        assert!(result == " " || result.is_empty());
    }

    #[test]
    fn test_off_mode_preserves_text() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Off,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "Text with multiple spaces!!!";
        assert_eq!(reducer.reduce(input), input);
    }

    #[test]
    fn test_parallel_batch_processing() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            enable_parallel: true, // force the rayon batch path
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let inputs = vec![
            "First text with spaces",
            "Second text with spaces",
            "Third text with spaces",
        ];
        let results = reducer.batch_reduce(&inputs);

        assert_eq!(results.len(), inputs.len());
        for result in &results {
            assert!(!result.contains(" "));
        }
    }

    #[test]
    fn test_cjk_text_handling() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, Some("zh")).unwrap();
        let input = "这是中文文本测试";
        let result = reducer.reduce(input);

        // CJK text must not be reduced to nothing.
        assert!(!result.is_empty());
    }

    #[test]
    fn test_mixed_language_text() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "This is English text 这是中文 and some more English";
        let result = reducer.reduce(input);

        assert!(!result.is_empty());
        assert!(result.contains("English") || result.contains("中"));
    }

    #[test]
    fn test_unicode_normalization_ascii() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        // Pure ASCII takes the fast path that skips NFC normalization.
        let input = "Pure ASCII text without special characters";
        let result = reducer.reduce(input);

        assert!(result.contains("ASCII"));
    }

    #[test]
    fn test_unicode_normalization_non_ascii() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        // Non-ASCII input goes through NFC normalization first.
        let input = "Café naïve résumé";
        let result = reducer.reduce(input);

        assert!(result.contains("Café") || result.contains("Cafe"));
    }

    #[test]
    fn test_single_text_vs_batch() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let text = "The quick brown fox jumps over the lazy dog";

        // Batch and single-item reduction must agree.
        let single_result = reducer.reduce(text);
        let batch_results = reducer.batch_reduce(&[text]);

        assert_eq!(single_result, batch_results[0]);
    }

    #[test]
    fn test_important_word_preservation() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Aggressive,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "The IMPORTANT word COVID-19 and 12345 numbers should be preserved";
        let result = reducer.reduce(input);

        assert!(result.contains("IMPORTANT") || result.contains("COVID") || result.contains("12345"));
    }

    #[test]
    fn test_technical_terms_preservation() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Aggressive,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "The implementation uses PyTorch and TensorFlow frameworks";
        let result = reducer.reduce(input);

        assert!(result.contains("PyTorch") || result.contains("TensorFlow"));
    }
}
@@ -0,0 +1,68 @@
1
+ use super::analysis::TextAnalyzer;
2
+
3
+ /// Handles sentence selection and filtering based on importance scoring.
4
+ pub struct SentenceSelector;
5
+
6
+ impl SentenceSelector {
7
+ /// Applies sentence selection to keep only the most important sentences.
8
+ pub fn apply_sentence_selection(text: &str) -> String {
9
+ let sentences: Vec<&str> = text
10
+ .split(['.', '!', '?'])
11
+ .map(|s| s.trim())
12
+ .filter(|s| !s.is_empty())
13
+ .collect();
14
+
15
+ if sentences.len() <= 2 {
16
+ return text.to_string();
17
+ }
18
+
19
+ let mut scored_sentences: Vec<(usize, f32, &str)> = sentences
20
+ .iter()
21
+ .enumerate()
22
+ .map(|(i, sentence)| {
23
+ let score = TextAnalyzer::score_sentence_importance(sentence, i, sentences.len());
24
+ (i, score, *sentence)
25
+ })
26
+ .collect();
27
+
28
+ scored_sentences.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
29
+
30
+ let keep_count = ((sentences.len() as f32 * 0.4).ceil() as usize).max(1);
31
+ let mut selected_indices: Vec<usize> = scored_sentences[..keep_count].iter().map(|(i, _, _)| *i).collect();
32
+
33
+ selected_indices.sort();
34
+
35
+ let selected_sentences: Vec<&str> = selected_indices
36
+ .iter()
37
+ .filter_map(|&i| sentences.get(i))
38
+ .copied()
39
+ .collect();
40
+
41
+ if selected_sentences.is_empty() {
42
+ text.to_string()
43
+ } else {
44
+ selected_sentences.join(". ")
45
+ }
46
+ }
47
+ }
48
+
49
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sentence_selection() {
        let input = "First sentence here. Second sentence with more words. Third one. Fourth sentence is even longer than the others.";
        let output = SentenceSelector::apply_sentence_selection(input);

        // Selection must shrink the text and drop at least one sentence.
        assert!(output.len() < input.len());
        assert!(output.split(". ").count() < 4);
    }

    #[test]
    fn test_sentence_selection_short_text() {
        // Two sentences or fewer pass through untouched.
        let input = "Only one sentence.";
        assert_eq!(SentenceSelector::apply_sentence_selection(input), input);
    }
}
+ }
@@ -0,0 +1,156 @@
1
+ use crate::text::token_reduction::cjk_utils::CjkTokenizer;
2
+ use ahash::AHashMap;
3
+
4
+ use super::analysis::TextAnalyzer;
5
+
6
+ /// Handles word filtering and token removal operations.
7
+ pub struct WordFilter {
8
+ cjk_tokenizer: CjkTokenizer,
9
+ }
10
+
11
+ impl WordFilter {
12
+ pub fn new() -> Self {
13
+ Self {
14
+ cjk_tokenizer: CjkTokenizer::new(),
15
+ }
16
+ }
17
+
18
+ /// Removes additional common words based on frequency and characteristics.
19
+ pub fn remove_additional_common_words(&self, text: &str) -> String {
20
+ let words = self.universal_tokenize(text);
21
+
22
+ if words.len() < 4 {
23
+ return text.to_string();
24
+ }
25
+
26
+ let estimated_unique = (words.len() as f32 * 0.7).ceil() as usize;
27
+ let mut word_freq = AHashMap::with_capacity(estimated_unique);
28
+
29
+ let mut word_lengths = Vec::with_capacity(words.len());
30
+
31
+ for word in &words {
32
+ let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
33
+ word.to_lowercase()
34
+ } else {
35
+ word.chars()
36
+ .filter(|c| c.is_alphabetic())
37
+ .collect::<String>()
38
+ .to_lowercase()
39
+ };
40
+
41
+ if !clean_word.is_empty() {
42
+ *word_freq.entry(clean_word.clone()).or_insert(0) += 1;
43
+ word_lengths.push(clean_word.chars().count());
44
+ }
45
+ }
46
+
47
+ let avg_length = if !word_lengths.is_empty() {
48
+ word_lengths.iter().sum::<usize>() as f32 / word_lengths.len() as f32
49
+ } else {
50
+ 5.0
51
+ };
52
+
53
+ let original_count = words.len();
54
+ let has_cjk_content = text.chars().any(|c| c as u32 >= 0x4E00 && (c as u32) <= 0x9FFF);
55
+
56
+ let mut filtered_words = Vec::with_capacity(words.len());
57
+ for word in &words {
58
+ let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
59
+ word.to_lowercase()
60
+ } else {
61
+ word.chars()
62
+ .filter(|c| c.is_alphabetic())
63
+ .collect::<String>()
64
+ .to_lowercase()
65
+ };
66
+
67
+ if clean_word.is_empty() {
68
+ filtered_words.push(word.clone());
69
+ } else {
70
+ let freq = word_freq.get(&clean_word).unwrap_or(&0);
71
+ let word_len = clean_word.chars().count() as f32;
72
+
73
+ if TextAnalyzer::has_important_characteristics(word)
74
+ || (*freq <= 2 && word_len >= avg_length * 0.8)
75
+ || (word_len >= avg_length * 1.5)
76
+ {
77
+ filtered_words.push(word.clone());
78
+ }
79
+ }
80
+ }
81
+
82
+ let fallback_threshold = if has_cjk_content {
83
+ original_count / 5
84
+ } else {
85
+ original_count / 3
86
+ };
87
+
88
+ if filtered_words.len() < fallback_threshold {
89
+ let mut fallback_words = Vec::with_capacity(words.len());
90
+ for word in &words {
91
+ let clean_word = if word.chars().all(|c| c.is_alphabetic()) {
92
+ word.to_lowercase()
93
+ } else {
94
+ word.chars().filter(|c| c.is_alphabetic()).collect::<String>()
95
+ };
96
+
97
+ if clean_word.is_empty()
98
+ || clean_word.chars().count() >= 3
99
+ || TextAnalyzer::has_important_characteristics(word)
100
+ {
101
+ fallback_words.push(word.clone());
102
+ }
103
+ }
104
+ self.smart_join(&fallback_words, has_cjk_content)
105
+ } else {
106
+ self.smart_join(&filtered_words, has_cjk_content)
107
+ }
108
+ }
109
+
110
+ /// Smart joins tokens based on language type (CJK vs. other).
111
+ pub fn smart_join(&self, tokens: &[String], has_cjk_content: bool) -> String {
112
+ if has_cjk_content {
113
+ tokens.join("")
114
+ } else {
115
+ tokens.join(" ")
116
+ }
117
+ }
118
+
119
+ /// Universal tokenizer that handles both CJK and non-CJK text.
120
+ pub fn universal_tokenize(&self, text: &str) -> Vec<String> {
121
+ self.cjk_tokenizer.tokenize_mixed_text(text)
122
+ }
123
+ }
124
+
125
+ impl Default for WordFilter {
126
+ fn default() -> Self {
127
+ Self::new()
128
+ }
129
+ }
130
+
131
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_universal_tokenize_english() {
        let filter = WordFilter::new();
        let expected = vec!["hello", "world", "test"];
        assert_eq!(filter.universal_tokenize("hello world test"), expected);
    }

    #[test]
    fn test_universal_tokenize_cjk() {
        // CJK input must yield at least one token.
        let tokens = WordFilter::new().universal_tokenize("中文");
        assert!(!tokens.is_empty());
    }

    #[test]
    fn test_fallback_threshold() {
        // Nothing but stopword-like short words triggers the fallback path;
        // the result must not be emptied out.
        let filter = WordFilter::new();
        let reduced = filter.remove_additional_common_words("a the is of to in for on at by");
        assert!(!reduced.is_empty());
    }
}