kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,238 @@
1
+ //! Validator registry implementation.
2
+
3
+ use crate::Result;
4
+ use crate::plugins::Validator;
5
+ use indexmap::IndexMap;
6
+ use std::collections::BTreeMap;
7
+ use std::sync::Arc;
8
+
9
+ /// Registry for validator plugins.
10
+ ///
11
+ /// Manages validators with priority-based execution order.
12
+ pub struct ValidatorRegistry {
13
+ validators: BTreeMap<i32, IndexMap<String, Arc<dyn Validator>>>,
14
+ }
15
+
16
+ impl ValidatorRegistry {
17
+ /// Create a new empty validator registry.
18
+ pub fn new() -> Self {
19
+ Self {
20
+ validators: BTreeMap::new(),
21
+ }
22
+ }
23
+
24
+ /// Register a validator.
25
+ ///
26
+ /// # Arguments
27
+ ///
28
+ /// * `validator` - The validator to register
29
+ pub fn register(&mut self, validator: Arc<dyn Validator>) -> Result<()> {
30
+ let name = validator.name().to_string();
31
+ let priority = validator.priority();
32
+
33
+ super::validate_plugin_name(&name)?;
34
+
35
+ validator.initialize()?;
36
+
37
+ self.validators.entry(priority).or_default().insert(name, validator);
38
+
39
+ Ok(())
40
+ }
41
+
42
+ /// Get all validators in priority order.
43
+ ///
44
+ /// # Returns
45
+ ///
46
+ /// Vector of validators in priority order (highest first).
47
+ pub fn get_all(&self) -> Vec<Arc<dyn Validator>> {
48
+ let mut result = Vec::new();
49
+
50
+ for (_priority, validators) in self.validators.iter().rev() {
51
+ for validator in validators.values() {
52
+ result.push(Arc::clone(validator));
53
+ }
54
+ }
55
+
56
+ result
57
+ }
58
+
59
+ /// List all registered validator names.
60
+ pub fn list(&self) -> Vec<String> {
61
+ let mut names = std::collections::HashSet::new();
62
+ for validators in self.validators.values() {
63
+ names.extend(validators.keys().cloned());
64
+ }
65
+ names.into_iter().collect()
66
+ }
67
+
68
+ /// Remove a validator from the registry.
69
+ pub fn remove(&mut self, name: &str) -> Result<()> {
70
+ let mut validator_to_shutdown: Option<Arc<dyn Validator>> = None;
71
+
72
+ for validators in self.validators.values_mut() {
73
+ if let Some(validator) = validators.shift_remove(name)
74
+ && validator_to_shutdown.is_none()
75
+ {
76
+ validator_to_shutdown = Some(validator);
77
+ }
78
+ }
79
+
80
+ if let Some(validator) = validator_to_shutdown {
81
+ validator.shutdown()?;
82
+ }
83
+
84
+ self.validators.retain(|_, validators| !validators.is_empty());
85
+
86
+ Ok(())
87
+ }
88
+
89
+ /// Shutdown all validators and clear the registry.
90
+ pub fn shutdown_all(&mut self) -> Result<()> {
91
+ let names = self.list();
92
+ for name in names {
93
+ self.remove(&name)?;
94
+ }
95
+ Ok(())
96
+ }
97
+ }
98
+
99
+ impl Default for ValidatorRegistry {
100
+ fn default() -> Self {
101
+ Self::new()
102
+ }
103
+ }
104
+
105
#[cfg(test)]
mod tests {
    use super::*;
    use crate::KreuzbergError;
    use crate::core::config::ExtractionConfig;
    use crate::plugins::Plugin;
    use crate::types::ExtractionResult;
    use async_trait::async_trait;

    /// Validator stub with a configurable name and priority; every hook succeeds.
    struct MockValidator {
        name: String,
        priority: i32,
    }

    impl Plugin for MockValidator {
        fn name(&self) -> &str {
            self.name.as_str()
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for MockValidator {
        async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn priority(&self) -> i32 {
            self.priority
        }
    }

    /// Build a shareable mock validator with the given name and priority.
    fn mock(name: &str, priority: i32) -> Arc<MockValidator> {
        Arc::new(MockValidator {
            name: name.to_string(),
            priority,
        })
    }

    /// Validators come back from `get_all` with the highest priority first.
    #[test]
    fn test_validator_registry() {
        let mut registry = ValidatorRegistry::new();

        registry.register(mock("high-priority", 100)).unwrap();
        registry.register(mock("low-priority", 10)).unwrap();

        let validators = registry.get_all();
        let names: Vec<&str> = validators.iter().map(|validator| validator.name()).collect();
        assert_eq!(names, ["high-priority", "low-priority"]);
    }

    /// Removing the only registered validator empties the registry.
    #[test]
    fn test_validator_registry_remove() {
        let mut registry = ValidatorRegistry::new();

        registry.register(mock("test-validator", 50)).unwrap();
        assert_eq!(registry.get_all().len(), 1);

        registry.remove("test-validator").unwrap();
        assert!(registry.get_all().is_empty());
    }

    /// `Default` yields an empty registry.
    #[test]
    fn test_validator_registry_default() {
        let registry = ValidatorRegistry::default();
        assert!(registry.get_all().is_empty());
    }

    /// An empty plugin name is rejected with a validation error.
    #[test]
    fn test_validator_registry_invalid_name_empty() {
        let mut registry = ValidatorRegistry::new();

        let outcome = registry.register(mock("", 50));
        assert!(matches!(outcome, Err(KreuzbergError::Validation { .. })));
    }

    /// A plugin name containing whitespace is rejected with a validation error.
    #[test]
    fn test_validator_registry_invalid_name_whitespace() {
        let mut registry = ValidatorRegistry::new();

        let outcome = registry.register(mock("my validator", 50));
        assert!(matches!(outcome, Err(KreuzbergError::Validation { .. })));
    }

    /// `shutdown_all` leaves no validators behind.
    #[test]
    fn test_validator_registry_shutdown_all() {
        let mut registry = ValidatorRegistry::new();

        registry.register(mock("validator1", 100)).unwrap();
        registry.register(mock("validator2", 50)).unwrap();
        assert_eq!(registry.get_all().len(), 2);

        registry.shutdown_all().unwrap();
        assert!(registry.get_all().is_empty());
    }
}
@@ -0,0 +1,424 @@
1
+ //! Validator plugin system.
2
+ //!
3
+ //! This module provides the trait and registry for implementing custom validators.
4
+
5
+ mod registry;
6
+ mod r#trait;
7
+
8
+ // Re-export trait for backward compatibility
9
+ pub use r#trait::Validator;
10
+
11
+ // Re-export registry functions for backward compatibility
12
+ pub use registry::{clear_validators, list_validators, register_validator, unregister_validator};
13
+
14
+ #[cfg(test)]
15
+ mod tests {
16
+ use super::*;
17
+ use crate::KreuzbergError;
18
+ use crate::Result;
19
+ use crate::core::config::ExtractionConfig;
20
+ use crate::plugins::Plugin;
21
+ use crate::types::ExtractionResult;
22
+ use async_trait::async_trait;
23
+ use std::collections::HashMap;
24
+
25
+ struct MockValidator {
26
+ should_fail: bool,
27
+ }
28
+
29
+ impl Plugin for MockValidator {
30
+ fn name(&self) -> &str {
31
+ "mock-validator"
32
+ }
33
+
34
+ fn version(&self) -> String {
35
+ "1.0.0".to_string()
36
+ }
37
+
38
+ fn initialize(&self) -> Result<()> {
39
+ Ok(())
40
+ }
41
+
42
+ fn shutdown(&self) -> Result<()> {
43
+ Ok(())
44
+ }
45
+ }
46
+
47
+ #[async_trait]
48
+ impl Validator for MockValidator {
49
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
50
+ if self.should_fail {
51
+ Err(KreuzbergError::validation("Validation failed".to_string()))
52
+ } else {
53
+ Ok(())
54
+ }
55
+ }
56
+ }
57
+
58
+ #[tokio::test]
59
+ async fn test_validator_success() {
60
+ let validator = MockValidator { should_fail: false };
61
+
62
+ let result = ExtractionResult {
63
+ content: "test content".to_string(),
64
+ mime_type: "text/plain".to_string(),
65
+ metadata: crate::types::Metadata::default(),
66
+ tables: vec![],
67
+ detected_languages: None,
68
+ chunks: None,
69
+ images: None,
70
+ djot_content: None,
71
+ pages: None,
72
+ elements: None,
73
+ };
74
+
75
+ let config = ExtractionConfig::default();
76
+ assert!(validator.validate(&result, &config).await.is_ok());
77
+ }
78
+
79
+ #[tokio::test]
80
+ async fn test_validator_failure() {
81
+ let validator = MockValidator { should_fail: true };
82
+
83
+ let result = ExtractionResult {
84
+ content: "test content".to_string(),
85
+ mime_type: "text/plain".to_string(),
86
+ metadata: crate::types::Metadata::default(),
87
+ tables: vec![],
88
+ detected_languages: None,
89
+ chunks: None,
90
+ images: None,
91
+ djot_content: None,
92
+ pages: None,
93
+ elements: None,
94
+ };
95
+
96
+ let config = ExtractionConfig::default();
97
+ let validation_result = validator.validate(&result, &config).await;
98
+
99
+ assert!(matches!(validation_result, Err(KreuzbergError::Validation { .. })));
100
+ }
101
+
102
/// `should_validate` is not overridden by the mock, so this exercises the
/// trait's provided implementation, which is expected to return `true`.
#[test]
fn test_validator_should_validate_default() {
    let config = ExtractionConfig::default();
    let result = ExtractionResult {
        content: String::from("test"),
        mime_type: String::from("text/plain"),
        metadata: crate::types::Metadata::default(),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let validator = MockValidator { should_fail: false };
    let applies = validator.should_validate(&result, &config);

    assert!(applies);
}
123
+
124
/// The mock does not override `priority`, so the trait's provided value (50)
/// is what gets reported.
#[test]
fn test_validator_priority_default() {
    let priority = MockValidator { should_fail: false }.priority();
    assert_eq!(priority, 50);
}
129
+
130
+ #[tokio::test]
131
+ async fn test_validator_plugin_interface() {
132
+ let validator = MockValidator { should_fail: false };
133
+
134
+ assert_eq!(validator.name(), "mock-validator");
135
+ assert_eq!(validator.version(), "1.0.0");
136
+ assert!(validator.initialize().is_ok());
137
+ assert!(validator.shutdown().is_ok());
138
+ }
139
+
140
+ #[tokio::test]
141
+ async fn test_validator_empty_content() {
142
+ let validator = MockValidator { should_fail: false };
143
+
144
+ let result = ExtractionResult {
145
+ content: String::new(),
146
+ mime_type: "text/plain".to_string(),
147
+ metadata: crate::types::Metadata::default(),
148
+ tables: vec![],
149
+ detected_languages: None,
150
+ chunks: None,
151
+ images: None,
152
+ djot_content: None,
153
+ pages: None,
154
+ elements: None,
155
+ };
156
+
157
+ let config = ExtractionConfig::default();
158
+ assert!(validator.validate(&result, &config).await.is_ok());
159
+ }
160
+
161
/// A validator may override `should_validate` to opt in only for some results;
/// here the override accepts PDF results and rejects everything else.
#[test]
fn test_validator_should_validate_conditional() {
    /// Validator that only applies to results whose MIME type is `application/pdf`.
    struct PdfOnlyValidator;

    impl Plugin for PdfOnlyValidator {
        fn name(&self) -> &str {
            "pdf-only"
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for PdfOnlyValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn should_validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
            result.mime_type == "application/pdf"
        }
    }

    // The two fixtures differ only in their MIME type.
    let result_with_mime = |mime: &str| ExtractionResult {
        content: String::from("test"),
        mime_type: mime.to_string(),
        metadata: crate::types::Metadata::default(),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let validator = PdfOnlyValidator;
    let config = ExtractionConfig::default();
    let pdf_result = result_with_mime("application/pdf");
    let txt_result = result_with_mime("text/plain");

    assert!(validator.should_validate(&pdf_result, &config));
    assert!(!validator.should_validate(&txt_result, &config));
}
223
+
224
/// Asserts that validators can report distinct priorities through an
/// overridden `priority` and that those values compare as expected.
#[test]
fn test_validator_priority_ranges() {
    struct HighPriorityValidator;
    struct LowPriorityValidator;

    // Both stubs need the same `Plugin` boilerplate; only the reported name differs.
    macro_rules! stub_plugin {
        ($validator:ident, $plugin_name:literal) => {
            impl Plugin for $validator {
                fn name(&self) -> &str {
                    $plugin_name
                }

                fn version(&self) -> String {
                    String::from("1.0.0")
                }

                fn initialize(&self) -> Result<()> {
                    Ok(())
                }

                fn shutdown(&self) -> Result<()> {
                    Ok(())
                }
            }
        };
    }

    stub_plugin!(HighPriorityValidator, "high-priority");
    stub_plugin!(LowPriorityValidator, "low-priority");

    #[async_trait]
    impl Validator for HighPriorityValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn priority(&self) -> i32 {
            100
        }
    }

    #[async_trait]
    impl Validator for LowPriorityValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn priority(&self) -> i32 {
            10
        }
    }

    let high_priority = HighPriorityValidator.priority();
    let low_priority = LowPriorityValidator.priority();

    assert_eq!(high_priority, 100);
    assert_eq!(low_priority, 10);
    assert!(high_priority > low_priority);
}
288
+
289
+ #[tokio::test]
290
+ async fn test_validator_error_message() {
291
+ let validator = MockValidator { should_fail: true };
292
+
293
+ let result = ExtractionResult {
294
+ content: "test".to_string(),
295
+ mime_type: "text/plain".to_string(),
296
+ metadata: crate::types::Metadata::default(),
297
+ tables: vec![],
298
+ detected_languages: None,
299
+ chunks: None,
300
+ images: None,
301
+ djot_content: None,
302
+ pages: None,
303
+ elements: None,
304
+ };
305
+
306
+ let config = ExtractionConfig::default();
307
+ let err = validator.validate(&result, &config).await.unwrap_err();
308
+
309
+ match err {
310
+ KreuzbergError::Validation { message: msg, .. } => {
311
+ assert_eq!(msg, "Validation failed");
312
+ }
313
+ _ => panic!("Expected Validation error"),
314
+ }
315
+ }
316
+
317
+ #[tokio::test]
318
+ async fn test_validator_with_metadata() {
319
+ let validator = MockValidator { should_fail: false };
320
+
321
+ let mut additional = HashMap::new();
322
+ additional.insert("quality_score".to_string(), serde_json::json!(0.95));
323
+
324
+ let result = ExtractionResult {
325
+ content: "test".to_string(),
326
+ mime_type: "text/plain".to_string(),
327
+ metadata: crate::types::Metadata {
328
+ additional,
329
+ ..Default::default()
330
+ },
331
+ pages: None,
332
+ tables: vec![],
333
+ detected_languages: None,
334
+ chunks: None,
335
+ images: None,
336
+ djot_content: None,
337
+ elements: None,
338
+ };
339
+
340
+ let config = ExtractionConfig::default();
341
+ assert!(validator.validate(&result, &config).await.is_ok());
342
+ }
343
+
344
+ #[tokio::test]
345
+ async fn test_validator_with_tables() {
346
+ use crate::types::Table;
347
+
348
+ let validator = MockValidator { should_fail: false };
349
+
350
+ let table = Table {
351
+ cells: vec![vec!["A".to_string(), "B".to_string()]],
352
+ markdown: "| A | B |".to_string(),
353
+ page_number: 0,
354
+ };
355
+
356
+ let result = ExtractionResult {
357
+ content: "test".to_string(),
358
+ mime_type: "text/plain".to_string(),
359
+ metadata: crate::types::Metadata::default(),
360
+ tables: vec![table],
361
+ detected_languages: None,
362
+ chunks: None,
363
+ images: None,
364
+ djot_content: None,
365
+ pages: None,
366
+ elements: None,
367
+ };
368
+
369
+ let config = ExtractionConfig::default();
370
+ assert!(validator.validate(&result, &config).await.is_ok());
371
+ }
372
+
373
+ #[tokio::test]
374
+ async fn test_validator_different_mime_types() {
375
+ let validator = MockValidator { should_fail: false };
376
+ let config = ExtractionConfig::default();
377
+
378
+ let mime_types = vec![
379
+ "text/plain",
380
+ "application/pdf",
381
+ "application/json",
382
+ "text/html",
383
+ "image/png",
384
+ ];
385
+
386
+ for mime_type in mime_types {
387
+ let result = ExtractionResult {
388
+ content: "test".to_string(),
389
+ mime_type: mime_type.to_string(),
390
+ metadata: crate::types::Metadata::default(),
391
+ tables: vec![],
392
+ detected_languages: None,
393
+ chunks: None,
394
+ images: None,
395
+ djot_content: None,
396
+ pages: None,
397
+ elements: None,
398
+ };
399
+
400
+ assert!(validator.validate(&result, &config).await.is_ok());
401
+ }
402
+ }
403
+
404
+ #[tokio::test]
405
+ async fn test_validator_long_content() {
406
+ let validator = MockValidator { should_fail: false };
407
+
408
+ let result = ExtractionResult {
409
+ content: "test content ".repeat(10000),
410
+ mime_type: "text/plain".to_string(),
411
+ metadata: crate::types::Metadata::default(),
412
+ tables: vec![],
413
+ detected_languages: None,
414
+ chunks: None,
415
+ images: None,
416
+ djot_content: None,
417
+ pages: None,
418
+ elements: None,
419
+ };
420
+
421
+ let config = ExtractionConfig::default();
422
+ assert!(validator.validate(&result, &config).await.is_ok());
423
+ }
424
+ }