kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,238 @@
1
+ //! Validator registry implementation.
2
+
3
+ use crate::Result;
4
+ use crate::plugins::Validator;
5
+ use indexmap::IndexMap;
6
+ use std::collections::BTreeMap;
7
+ use std::sync::Arc;
8
+
9
+ /// Registry for validator plugins.
10
+ ///
11
+ /// Manages validators with priority-based execution order.
12
+ pub struct ValidatorRegistry {
13
+ validators: BTreeMap<i32, IndexMap<String, Arc<dyn Validator>>>,
14
+ }
15
+
16
+ impl ValidatorRegistry {
17
+ /// Create a new empty validator registry.
18
+ pub fn new() -> Self {
19
+ Self {
20
+ validators: BTreeMap::new(),
21
+ }
22
+ }
23
+
24
+ /// Register a validator.
25
+ ///
26
+ /// # Arguments
27
+ ///
28
+ /// * `validator` - The validator to register
29
+ pub fn register(&mut self, validator: Arc<dyn Validator>) -> Result<()> {
30
+ let name = validator.name().to_string();
31
+ let priority = validator.priority();
32
+
33
+ super::validate_plugin_name(&name)?;
34
+
35
+ validator.initialize()?;
36
+
37
+ self.validators.entry(priority).or_default().insert(name, validator);
38
+
39
+ Ok(())
40
+ }
41
+
42
+ /// Get all validators in priority order.
43
+ ///
44
+ /// # Returns
45
+ ///
46
+ /// Vector of validators in priority order (highest first).
47
+ pub fn get_all(&self) -> Vec<Arc<dyn Validator>> {
48
+ let mut result = Vec::new();
49
+
50
+ for (_priority, validators) in self.validators.iter().rev() {
51
+ for validator in validators.values() {
52
+ result.push(Arc::clone(validator));
53
+ }
54
+ }
55
+
56
+ result
57
+ }
58
+
59
+ /// List all registered validator names.
60
+ pub fn list(&self) -> Vec<String> {
61
+ let mut names = std::collections::HashSet::new();
62
+ for validators in self.validators.values() {
63
+ names.extend(validators.keys().cloned());
64
+ }
65
+ names.into_iter().collect()
66
+ }
67
+
68
+ /// Remove a validator from the registry.
69
+ pub fn remove(&mut self, name: &str) -> Result<()> {
70
+ let mut validator_to_shutdown: Option<Arc<dyn Validator>> = None;
71
+
72
+ for validators in self.validators.values_mut() {
73
+ if let Some(validator) = validators.shift_remove(name)
74
+ && validator_to_shutdown.is_none()
75
+ {
76
+ validator_to_shutdown = Some(validator);
77
+ }
78
+ }
79
+
80
+ if let Some(validator) = validator_to_shutdown {
81
+ validator.shutdown()?;
82
+ }
83
+
84
+ self.validators.retain(|_, validators| !validators.is_empty());
85
+
86
+ Ok(())
87
+ }
88
+
89
+ /// Shutdown all validators and clear the registry.
90
+ pub fn shutdown_all(&mut self) -> Result<()> {
91
+ let names = self.list();
92
+ for name in names {
93
+ self.remove(&name)?;
94
+ }
95
+ Ok(())
96
+ }
97
+ }
98
+
99
+ impl Default for ValidatorRegistry {
100
+ fn default() -> Self {
101
+ Self::new()
102
+ }
103
+ }
104
+
105
#[cfg(test)]
mod tests {
    use super::*;
    use crate::KreuzbergError;
    use crate::core::config::ExtractionConfig;
    use crate::plugins::Plugin;
    use crate::types::ExtractionResult;
    use async_trait::async_trait;

    /// Test double: a validator that always passes, with a configurable name
    /// and priority.
    struct MockValidator {
        name: String,
        priority: i32,
    }

    impl Plugin for MockValidator {
        fn name(&self) -> &str {
            self.name.as_str()
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for MockValidator {
        async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn priority(&self) -> i32 {
            self.priority
        }
    }

    /// Build a shared `MockValidator` with the given name and priority.
    fn mock(name: &str, priority: i32) -> Arc<MockValidator> {
        Arc::new(MockValidator {
            name: name.to_string(),
            priority,
        })
    }

    /// Validators come back highest priority first.
    #[test]
    fn test_validator_registry() {
        let mut registry = ValidatorRegistry::new();

        registry.register(mock("high-priority", 100)).unwrap();
        registry.register(mock("low-priority", 10)).unwrap();

        let ordered: Vec<String> = registry
            .get_all()
            .iter()
            .map(|validator| validator.name().to_string())
            .collect();
        assert_eq!(ordered, ["high-priority", "low-priority"]);
    }

    /// Removing the only validator leaves the registry empty.
    #[test]
    fn test_validator_registry_remove() {
        let mut registry = ValidatorRegistry::new();

        registry.register(mock("test-validator", 50)).unwrap();
        assert_eq!(registry.get_all().len(), 1);

        registry.remove("test-validator").unwrap();
        assert!(registry.get_all().is_empty());
    }

    /// `Default` yields an empty registry.
    #[test]
    fn test_validator_registry_default() {
        let registry = ValidatorRegistry::default();
        assert!(registry.get_all().is_empty());
    }

    /// An empty plugin name is rejected with a validation error.
    #[test]
    fn test_validator_registry_invalid_name_empty() {
        let mut registry = ValidatorRegistry::new();

        let outcome = registry.register(mock("", 50));
        assert!(matches!(outcome, Err(KreuzbergError::Validation { .. })));
    }

    /// A plugin name containing whitespace is rejected with a validation error.
    #[test]
    fn test_validator_registry_invalid_name_whitespace() {
        let mut registry = ValidatorRegistry::new();

        let outcome = registry.register(mock("my validator", 50));
        assert!(matches!(outcome, Err(KreuzbergError::Validation { .. })));
    }

    /// `shutdown_all` empties a registry holding several validators.
    #[test]
    fn test_validator_registry_shutdown_all() {
        let mut registry = ValidatorRegistry::new();

        registry.register(mock("validator1", 100)).unwrap();
        registry.register(mock("validator2", 50)).unwrap();
        assert_eq!(registry.get_all().len(), 2);

        registry.shutdown_all().unwrap();
        assert!(registry.get_all().is_empty());
    }
}
@@ -0,0 +1,424 @@
1
+ //! Validator plugin system.
2
+ //!
3
+ //! This module provides the trait and registry for implementing custom validators.
4
+
5
+ mod registry;
6
+ mod r#trait;
7
+
8
+ // Re-export trait for backward compatibility
9
+ pub use r#trait::Validator;
10
+
11
+ // Re-export registry functions for backward compatibility
12
+ pub use registry::{clear_validators, list_validators, register_validator, unregister_validator};
13
+
14
+ #[cfg(test)]
15
+ mod tests {
16
+ use super::*;
17
+ use crate::KreuzbergError;
18
+ use crate::Result;
19
+ use crate::core::config::ExtractionConfig;
20
+ use crate::plugins::Plugin;
21
+ use crate::types::ExtractionResult;
22
+ use async_trait::async_trait;
23
+ use std::collections::HashMap;
24
+
25
+ struct MockValidator {
26
+ should_fail: bool,
27
+ }
28
+
29
+ impl Plugin for MockValidator {
30
+ fn name(&self) -> &str {
31
+ "mock-validator"
32
+ }
33
+
34
+ fn version(&self) -> String {
35
+ "1.0.0".to_string()
36
+ }
37
+
38
+ fn initialize(&self) -> Result<()> {
39
+ Ok(())
40
+ }
41
+
42
+ fn shutdown(&self) -> Result<()> {
43
+ Ok(())
44
+ }
45
+ }
46
+
47
+ #[async_trait]
48
+ impl Validator for MockValidator {
49
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
50
+ if self.should_fail {
51
+ Err(KreuzbergError::validation("Validation failed".to_string()))
52
+ } else {
53
+ Ok(())
54
+ }
55
+ }
56
+ }
57
+
58
+ #[tokio::test]
59
+ async fn test_validator_success() {
60
+ let validator = MockValidator { should_fail: false };
61
+
62
+ let result = ExtractionResult {
63
+ content: "test content".to_string(),
64
+ mime_type: "text/plain".to_string(),
65
+ metadata: crate::types::Metadata::default(),
66
+ tables: vec![],
67
+ detected_languages: None,
68
+ chunks: None,
69
+ images: None,
70
+ djot_content: None,
71
+ pages: None,
72
+ elements: None,
73
+ };
74
+
75
+ let config = ExtractionConfig::default();
76
+ assert!(validator.validate(&result, &config).await.is_ok());
77
+ }
78
+
79
+ #[tokio::test]
80
+ async fn test_validator_failure() {
81
+ let validator = MockValidator { should_fail: true };
82
+
83
+ let result = ExtractionResult {
84
+ content: "test content".to_string(),
85
+ mime_type: "text/plain".to_string(),
86
+ metadata: crate::types::Metadata::default(),
87
+ tables: vec![],
88
+ detected_languages: None,
89
+ chunks: None,
90
+ images: None,
91
+ djot_content: None,
92
+ pages: None,
93
+ elements: None,
94
+ };
95
+
96
+ let config = ExtractionConfig::default();
97
+ let validation_result = validator.validate(&result, &config).await;
98
+
99
+ assert!(matches!(validation_result, Err(KreuzbergError::Validation { .. })));
100
+ }
101
+
102
/// The mock does not override `should_validate`, so the trait default applies
/// and accepts a plain-text result.
#[test]
fn test_validator_should_validate_default() {
    let config = ExtractionConfig::default();
    let extraction = ExtractionResult {
        content: "test".to_string(),
        mime_type: "text/plain".to_string(),
        metadata: crate::types::Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let validator = MockValidator { should_fail: false };
    let applies = validator.should_validate(&extraction, &config);

    assert!(applies);
}
123
+
124
/// The mock does not override `priority`, so the trait default of 50 applies.
#[test]
fn test_validator_priority_default() {
    let reported = MockValidator { should_fail: false }.priority();
    assert_eq!(reported, 50);
}
129
+
130
/// The mock exposes the expected `Plugin` metadata and lifecycle hooks.
///
/// Every call here is synchronous, so this is a plain `#[test]`; no async
/// runtime is needed.
#[test]
fn test_validator_plugin_interface() {
    let validator = MockValidator { should_fail: false };

    assert_eq!(validator.name(), "mock-validator");
    assert_eq!(validator.version(), "1.0.0");
    assert!(validator.initialize().is_ok());
    assert!(validator.shutdown().is_ok());
}
139
+
140
+ #[tokio::test]
141
+ async fn test_validator_empty_content() {
142
+ let validator = MockValidator { should_fail: false };
143
+
144
+ let result = ExtractionResult {
145
+ content: String::new(),
146
+ mime_type: "text/plain".to_string(),
147
+ metadata: crate::types::Metadata::default(),
148
+ tables: vec![],
149
+ detected_languages: None,
150
+ chunks: None,
151
+ images: None,
152
+ djot_content: None,
153
+ pages: None,
154
+ elements: None,
155
+ };
156
+
157
+ let config = ExtractionConfig::default();
158
+ assert!(validator.validate(&result, &config).await.is_ok());
159
+ }
160
+
161
/// A validator may override `should_validate` to opt in by MIME type.
#[test]
fn test_validator_should_validate_conditional() {
    /// Validator that only applies to PDF results.
    struct PdfOnlyValidator;

    impl Plugin for PdfOnlyValidator {
        fn name(&self) -> &str {
            "pdf-only"
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for PdfOnlyValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn should_validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
            result.mime_type == "application/pdf"
        }
    }

    // Builds an otherwise identical result that differs only in its MIME type.
    let result_with_mime = |mime: &str| ExtractionResult {
        content: "test".to_string(),
        mime_type: mime.to_string(),
        metadata: crate::types::Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let config = ExtractionConfig::default();
    let validator = PdfOnlyValidator;

    let pdf_result = result_with_mime("application/pdf");
    let txt_result = result_with_mime("text/plain");

    assert!(validator.should_validate(&pdf_result, &config));
    assert!(!validator.should_validate(&txt_result, &config));
}
223
+
224
/// Verifies that `priority` overrides are reported as written and that the two
/// stub validators compare in the expected order (100 > 10).
#[test]
fn test_validator_priority_ranges() {
    // --- High-priority stub -------------------------------------------------
    struct HighPriorityValidator;

    impl Plugin for HighPriorityValidator {
        fn name(&self) -> &str {
            "high-priority"
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for HighPriorityValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn priority(&self) -> i32 {
            100
        }
    }

    // --- Low-priority stub --------------------------------------------------
    struct LowPriorityValidator;

    impl Plugin for LowPriorityValidator {
        fn name(&self) -> &str {
            "low-priority"
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for LowPriorityValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn priority(&self) -> i32 {
            10
        }
    }

    // Read each priority once, then check absolute values and relative order.
    let high_priority = HighPriorityValidator.priority();
    let low_priority = LowPriorityValidator.priority();

    assert_eq!(high_priority, 100);
    assert_eq!(low_priority, 10);
    assert!(high_priority > low_priority);
}
288
+
289
+ #[tokio::test]
290
+ async fn test_validator_error_message() {
291
+ let validator = MockValidator { should_fail: true };
292
+
293
+ let result = ExtractionResult {
294
+ content: "test".to_string(),
295
+ mime_type: "text/plain".to_string(),
296
+ metadata: crate::types::Metadata::default(),
297
+ tables: vec![],
298
+ detected_languages: None,
299
+ chunks: None,
300
+ images: None,
301
+ djot_content: None,
302
+ pages: None,
303
+ elements: None,
304
+ };
305
+
306
+ let config = ExtractionConfig::default();
307
+ let err = validator.validate(&result, &config).await.unwrap_err();
308
+
309
+ match err {
310
+ KreuzbergError::Validation { message: msg, .. } => {
311
+ assert_eq!(msg, "Validation failed");
312
+ }
313
+ _ => panic!("Expected Validation error"),
314
+ }
315
+ }
316
+
317
+ #[tokio::test]
318
+ async fn test_validator_with_metadata() {
319
+ let validator = MockValidator { should_fail: false };
320
+
321
+ let mut additional = HashMap::new();
322
+ additional.insert("quality_score".to_string(), serde_json::json!(0.95));
323
+
324
+ let result = ExtractionResult {
325
+ content: "test".to_string(),
326
+ mime_type: "text/plain".to_string(),
327
+ metadata: crate::types::Metadata {
328
+ additional,
329
+ ..Default::default()
330
+ },
331
+ pages: None,
332
+ tables: vec![],
333
+ detected_languages: None,
334
+ chunks: None,
335
+ images: None,
336
+ djot_content: None,
337
+ elements: None,
338
+ };
339
+
340
+ let config = ExtractionConfig::default();
341
+ assert!(validator.validate(&result, &config).await.is_ok());
342
+ }
343
+
344
+ #[tokio::test]
345
+ async fn test_validator_with_tables() {
346
+ use crate::types::Table;
347
+
348
+ let validator = MockValidator { should_fail: false };
349
+
350
+ let table = Table {
351
+ cells: vec![vec!["A".to_string(), "B".to_string()]],
352
+ markdown: "| A | B |".to_string(),
353
+ page_number: 0,
354
+ };
355
+
356
+ let result = ExtractionResult {
357
+ content: "test".to_string(),
358
+ mime_type: "text/plain".to_string(),
359
+ metadata: crate::types::Metadata::default(),
360
+ tables: vec![table],
361
+ detected_languages: None,
362
+ chunks: None,
363
+ images: None,
364
+ djot_content: None,
365
+ pages: None,
366
+ elements: None,
367
+ };
368
+
369
+ let config = ExtractionConfig::default();
370
+ assert!(validator.validate(&result, &config).await.is_ok());
371
+ }
372
+
373
+ #[tokio::test]
374
+ async fn test_validator_different_mime_types() {
375
+ let validator = MockValidator { should_fail: false };
376
+ let config = ExtractionConfig::default();
377
+
378
+ let mime_types = vec![
379
+ "text/plain",
380
+ "application/pdf",
381
+ "application/json",
382
+ "text/html",
383
+ "image/png",
384
+ ];
385
+
386
+ for mime_type in mime_types {
387
+ let result = ExtractionResult {
388
+ content: "test".to_string(),
389
+ mime_type: mime_type.to_string(),
390
+ metadata: crate::types::Metadata::default(),
391
+ tables: vec![],
392
+ detected_languages: None,
393
+ chunks: None,
394
+ images: None,
395
+ djot_content: None,
396
+ pages: None,
397
+ elements: None,
398
+ };
399
+
400
+ assert!(validator.validate(&result, &config).await.is_ok());
401
+ }
402
+ }
403
+
404
+ #[tokio::test]
405
+ async fn test_validator_long_content() {
406
+ let validator = MockValidator { should_fail: false };
407
+
408
+ let result = ExtractionResult {
409
+ content: "test content ".repeat(10000),
410
+ mime_type: "text/plain".to_string(),
411
+ metadata: crate::types::Metadata::default(),
412
+ tables: vec![],
413
+ detected_languages: None,
414
+ chunks: None,
415
+ images: None,
416
+ djot_content: None,
417
+ pages: None,
418
+ elements: None,
419
+ };
420
+
421
+ let config = ExtractionConfig::default();
422
+ assert!(validator.validate(&result, &config).await.is_ok());
423
+ }
424
+ }