kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,355 @@
1
+ //! Validator registry management.
2
+ //!
3
+ //! This module provides functions for managing the global validator registry.
4
+
5
+ use super::r#trait::Validator;
6
+ use std::sync::Arc;
7
+
8
+ /// Register a validator with the global registry.
9
+ ///
10
+ /// The validator will be registered with its default priority and will be called
11
+ /// during extraction validation. The validator's `name()` method is used as the
12
+ /// registration name.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `validator` - The validator implementation wrapped in Arc
17
+ ///
18
+ /// # Returns
19
+ ///
20
+ /// - `Ok(())` if registration succeeded
21
+ /// - `Err(...)` if validation failed or initialization failed
22
+ ///
23
+ /// # Errors
24
+ ///
25
+ /// - `KreuzbergError::Validation` - Invalid validator name (empty or contains whitespace)
26
+ /// - Any error from the validator's `initialize()` method
27
+ ///
28
+ /// # Example
29
+ ///
30
+ /// ```rust
31
+ /// use kreuzberg::plugins::{Plugin, Validator, register_validator};
32
+ /// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
33
+ /// use async_trait::async_trait;
34
+ /// use std::sync::Arc;
35
+ ///
36
+ /// struct MinLengthValidator { min_length: usize }
37
+ ///
38
+ /// impl Plugin for MinLengthValidator {
39
+ /// fn name(&self) -> &str { "min-length" }
40
+ /// fn version(&self) -> String { "1.0.0".to_string() }
41
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
42
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
43
+ /// }
44
+ ///
45
+ /// #[async_trait]
46
+ /// impl Validator for MinLengthValidator {
47
+ /// async fn validate(&self, result: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
48
+ /// if result.content.len() < self.min_length {
49
+ /// return Err(KreuzbergError::validation(
50
+ /// format!("Content too short: {} < {}", result.content.len(), self.min_length)
51
+ /// ));
52
+ /// }
53
+ /// Ok(())
54
+ /// }
55
+ /// }
56
+ ///
57
+ /// # tokio_test::block_on(async {
58
+ /// let validator = Arc::new(MinLengthValidator { min_length: 10 });
59
+ /// register_validator(validator)?;
60
+ /// # Ok::<(), KreuzbergError>(())
61
+ /// # });
62
+ /// ```
63
+ pub fn register_validator(validator: Arc<dyn Validator>) -> crate::Result<()> {
64
+ // Obtain the handle to the global validator registry.
65
+ let shared = crate::plugins::registry::get_validator_registry();
66
+
67
+ // `expect` panics if the write lock cannot be acquired.
68
+ let mut guard = shared
69
+ .write()
70
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
71
+ guard.register(validator)
72
+ }
73
+
74
+ /// Unregister a validator by name.
75
+ ///
76
+ /// Removes the validator from the global registry and calls its `shutdown()` method.
77
+ ///
78
+ /// # Arguments
79
+ ///
80
+ /// * `name` - Name of the validator to unregister
81
+ ///
82
+ /// # Returns
83
+ ///
84
+ /// - `Ok(())` if the validator was unregistered or didn't exist
85
+ /// - `Err(...)` if the shutdown method failed
86
+ ///
87
+ /// # Example
88
+ ///
89
+ /// ```rust
90
+ /// use kreuzberg::plugins::unregister_validator;
91
+ ///
92
+ /// # tokio_test::block_on(async {
93
+ /// unregister_validator("min-length")?;
94
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
95
+ /// # });
96
+ /// ```
97
+ pub fn unregister_validator(name: &str) -> crate::Result<()> {
98
+ // Removal goes through the global validator registry.
99
+ let shared = crate::plugins::registry::get_validator_registry();
100
+
101
+ // Write access is requested; `expect` panics if the lock cannot be acquired.
102
+ let mut guard = shared
103
+ .write()
104
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
105
+ guard.remove(name)
106
+ }
107
+
108
+ /// List all registered validators.
109
+ ///
110
+ /// Returns the names of all validators currently registered in the global registry.
111
+ ///
112
+ /// # Returns
113
+ ///
114
+ /// `Ok` containing a vector of the registered validator names.
115
+ ///
116
+ /// # Example
117
+ ///
118
+ /// ```rust
119
+ /// use kreuzberg::plugins::list_validators;
120
+ ///
121
+ /// # tokio_test::block_on(async {
122
+ /// let validators = list_validators()?;
123
+ /// for name in validators {
124
+ /// println!("Registered validator: {}", name);
125
+ /// }
126
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
127
+ /// # });
128
+ /// ```
129
+ pub fn list_validators() -> crate::Result<Vec<String>> {
130
+ // Listing takes the read lock rather than the write lock.
131
+ let shared = crate::plugins::registry::get_validator_registry();
132
+ let guard = shared
133
+ .read()
134
+ .expect("~keep Failed to acquire read lock on validator registry"); // ~keep
135
+
136
+ let names = guard.list();
137
+ Ok(names)
138
+ }
139
+
140
+ /// Clear all validators from the global registry.
141
+ ///
142
+ /// Removes all validators and calls their `shutdown()` methods.
143
+ ///
144
+ /// # Returns
145
+ ///
146
+ /// - `Ok(())` if all validators were cleared successfully
147
+ /// - `Err(...)` if any shutdown method failed
148
+ ///
149
+ /// # Example
150
+ ///
151
+ /// ```rust
152
+ /// use kreuzberg::plugins::clear_validators;
153
+ ///
154
+ /// # tokio_test::block_on(async {
155
+ /// clear_validators()?;
156
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
157
+ /// # });
158
+ /// ```
159
+ pub fn clear_validators() -> crate::Result<()> {
160
+ // Clearing operates on the global validator registry.
161
+ let shared = crate::plugins::registry::get_validator_registry();
162
+
163
+ // `expect` panics if the write lock cannot be acquired.
164
+ let mut guard = shared
165
+ .write()
166
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
167
+ guard.shutdown_all()
168
+ }
169
+
170
+ #[cfg(test)] // Unit tests for the registry helper functions of this module; every test is marked #[serial_test::serial].
171
+ mod tests {
172
+ use super::*;
173
+ use crate::KreuzbergError;
174
+ use crate::Result;
175
+ use crate::core::config::ExtractionConfig;
176
+ use crate::plugins::Plugin;
177
+ use crate::types::ExtractionResult;
178
+ use async_trait::async_trait;
179
+ // Test double: `should_fail` selects the outcome of `validate()`.
180
+ struct MockValidator {
181
+ should_fail: bool,
182
+ }
183
+ // Plugin metadata for the mock: fixed name "mock-validator"; initialize/shutdown always return Ok.
184
+ impl Plugin for MockValidator {
185
+ fn name(&self) -> &str {
186
+ "mock-validator"
187
+ }
188
+
189
+ fn version(&self) -> String {
190
+ "1.0.0".to_string()
191
+ }
192
+
193
+ fn initialize(&self) -> Result<()> {
194
+ Ok(())
195
+ }
196
+
197
+ fn shutdown(&self) -> Result<()> {
198
+ Ok(())
199
+ }
200
+ }
201
+ // validate() returns an error built with `KreuzbergError::validation` when `should_fail` is set, Ok(()) otherwise.
202
+ #[async_trait]
203
+ impl Validator for MockValidator {
204
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
205
+ if self.should_fail {
206
+ Err(KreuzbergError::validation("Validation failed".to_string()))
207
+ } else {
208
+ Ok(())
209
+ }
210
+ }
211
+ }
212
+
213
+ #[test]
214
+ #[serial_test::serial]
215
+ fn test_register_validator() {
216
+ use std::sync::Arc;
217
+ // Registering the mock is expected to succeed.
218
+ let validator = Arc::new(MockValidator { should_fail: false });
219
+ let result = super::register_validator(validator);
220
+ assert!(result.is_ok());
221
+ // Cleanup: the result of unregistering is deliberately ignored.
222
+ let _ = super::unregister_validator("mock-validator");
223
+ }
224
+
225
+ #[test]
226
+ #[serial_test::serial]
227
+ fn test_unregister_validator() {
228
+ use std::sync::Arc;
229
+ // Register the mock first so there is an entry to remove.
230
+ let validator = Arc::new(MockValidator { should_fail: false });
231
+ super::register_validator(validator).unwrap();
232
+ // Unregistering the mock by name is expected to return Ok.
233
+ let result = super::unregister_validator("mock-validator");
234
+ assert!(result.is_ok());
235
+ }
236
+
237
+ #[test]
238
+ #[serial_test::serial]
239
+ fn test_unregister_nonexistent_validator() {
240
+ let result = super::unregister_validator("nonexistent-validator-xyz"); // no test in this module registers this name
241
+ assert!(result.is_ok());
242
+ }
243
+
244
+ #[test]
245
+ #[serial_test::serial]
246
+ fn test_list_validators() {
247
+ use std::sync::Arc;
248
+ // Clear the registry first so the length assertions below are exact.
249
+ super::clear_validators().unwrap();
250
+ // Both mocks report the same name, "mock-validator".
251
+ let validator1 = Arc::new(MockValidator { should_fail: false });
252
+ let validator2 = Arc::new(MockValidator { should_fail: false });
253
+
254
+ let list_before = super::list_validators().unwrap();
255
+ assert_eq!(list_before.len(), 0);
256
+
257
+ super::register_validator(validator1).unwrap();
258
+ super::register_validator(validator2).unwrap();
259
+ // Two registrations under one name are expected to produce a single listed entry.
260
+ let list = super::list_validators().unwrap();
261
+ assert_eq!(list.len(), 1);
262
+ assert!(list.contains(&"mock-validator".to_string()));
263
+ // Cleanup: remove the mock again.
264
+ super::unregister_validator("mock-validator").unwrap();
265
+ }
266
+
267
+ #[test]
268
+ #[serial_test::serial]
269
+ fn test_clear_validators() {
270
+ use std::sync::Arc;
271
+ // Clear the registry before registering the mocks.
272
+ super::clear_validators().unwrap();
273
+
274
+ let validator1 = Arc::new(MockValidator { should_fail: false });
275
+ let validator2 = Arc::new(MockValidator { should_fail: false });
276
+
277
+ super::register_validator(validator1).unwrap();
278
+ super::register_validator(validator2).unwrap();
279
+ // At least one validator must be listed before clearing.
280
+ let list_before = super::list_validators().unwrap();
281
+ assert!(!list_before.is_empty());
282
+
283
+ let result = super::clear_validators();
284
+ assert!(result.is_ok());
285
+ // After clearing, the registry must list nothing.
286
+ let list = super::list_validators().unwrap();
287
+ assert_eq!(list.len(), 0);
288
+ }
289
+
290
+ #[test]
291
+ #[serial_test::serial]
292
+ fn test_register_validator_with_invalid_name() {
293
+ use std::sync::Arc;
294
+ // Plugin whose name contains spaces.
295
+ struct InvalidNameValidator;
296
+ impl Plugin for InvalidNameValidator {
297
+ fn name(&self) -> &str {
298
+ "invalid name with spaces"
299
+ }
300
+ fn version(&self) -> String {
301
+ "1.0.0".to_string()
302
+ }
303
+ fn initialize(&self) -> Result<()> {
304
+ Ok(())
305
+ }
306
+ fn shutdown(&self) -> Result<()> {
307
+ Ok(())
308
+ }
309
+ }
310
+
311
+ #[async_trait]
312
+ impl Validator for InvalidNameValidator {
313
+ async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
314
+ Ok(())
315
+ }
316
+ }
317
+ // Registration must be rejected with a `KreuzbergError::Validation` error.
318
+ let validator = Arc::new(InvalidNameValidator);
319
+ let result = super::register_validator(validator);
320
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
321
+ }
322
+
323
+ #[test]
324
+ #[serial_test::serial]
325
+ fn test_register_validator_with_empty_name() {
326
+ use std::sync::Arc;
327
+ // Plugin whose name is the empty string.
328
+ struct EmptyNameValidator;
329
+ impl Plugin for EmptyNameValidator {
330
+ fn name(&self) -> &str {
331
+ ""
332
+ }
333
+ fn version(&self) -> String {
334
+ "1.0.0".to_string()
335
+ }
336
+ fn initialize(&self) -> Result<()> {
337
+ Ok(())
338
+ }
339
+ fn shutdown(&self) -> Result<()> {
340
+ Ok(())
341
+ }
342
+ }
343
+
344
+ #[async_trait]
345
+ impl Validator for EmptyNameValidator {
346
+ async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
347
+ Ok(())
348
+ }
349
+ }
350
+ // Registration must be rejected with a `KreuzbergError::Validation` error.
351
+ let validator = Arc::new(EmptyNameValidator);
352
+ let result = super::register_validator(validator);
353
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
354
+ }
355
+ }
@@ -0,0 +1,276 @@
1
+ //! Validator plugin trait.
2
+ //!
3
+ //! This module defines the trait for implementing custom validation logic.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::ExtractionConfig;
7
+ use crate::plugins::Plugin;
8
+ use crate::types::ExtractionResult;
9
+ use async_trait::async_trait;
10
+
11
+ /// Trait for validator plugins.
12
+ ///
13
+ /// Validators check extraction results for quality, completeness, or correctness.
14
+ /// Unlike post-processors, validator errors **fail fast** - if a validator returns
15
+ /// an error, the extraction fails immediately.
16
+ ///
17
+ /// # Use Cases
18
+ ///
19
+ /// - **Quality Gates**: Ensure extracted content meets minimum quality standards
20
+ /// - **Compliance**: Verify content meets regulatory requirements
21
+ /// - **Content Filtering**: Reject documents containing unwanted content
22
+ /// - **Format Validation**: Verify extracted content structure
23
+ /// - **Security Checks**: Scan for malicious content
24
+ ///
25
+ /// # Error Handling
26
+ ///
27
+ /// Validator errors are **fatal** - they cause the extraction to fail and bubble up
28
+ /// to the caller. Use validators for hard requirements that must be met.
29
+ ///
30
+ /// For non-fatal checks, use post-processors instead.
31
+ ///
32
+ /// # Thread Safety
33
+ ///
34
+ /// Validators must be thread-safe (`Send + Sync`).
35
+ ///
36
+ /// # Example
37
+ ///
38
+ /// ```rust
39
+ /// use kreuzberg::plugins::{Plugin, Validator};
40
+ /// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
41
+ /// use async_trait::async_trait;
42
+ ///
43
+ /// /// Validate that extracted content has minimum length
44
+ /// struct MinimumLengthValidator {
45
+ /// min_length: usize,
46
+ /// }
47
+ ///
48
+ /// impl Plugin for MinimumLengthValidator {
49
+ /// fn name(&self) -> &str { "min-length-validator" }
50
+ /// fn version(&self) -> String { "1.0.0".to_string() }
51
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
52
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
53
+ /// }
54
+ ///
55
+ /// #[async_trait]
56
+ /// impl Validator for MinimumLengthValidator {
57
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
58
+ /// -> Result<()> {
59
+ /// if result.content.len() < self.min_length {
60
+ /// return Err(KreuzbergError::validation(format!(
61
+ /// "Content too short: {} < {} characters",
62
+ /// result.content.len(),
63
+ /// self.min_length
64
+ /// )));
65
+ /// }
66
+ /// Ok(())
67
+ /// }
68
+ /// }
69
+ /// ```
70
+ #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
71
+ #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
72
+ pub trait Validator: Plugin {
73
+ /// Validate an extraction result.
74
+ ///
75
+ /// Check the extraction result and return `Ok(())` if valid, or an error
76
+ /// if validation fails.
77
+ ///
78
+ /// # Arguments
79
+ ///
80
+ /// * `result` - The extraction result to validate
81
+ /// * `config` - Extraction configuration
82
+ ///
83
+ /// # Returns
84
+ ///
85
+ /// - `Ok(())` if validation passes
86
+ /// - `Err(...)` if validation fails (extraction will fail)
87
+ ///
88
+ /// # Errors
89
+ ///
90
+ /// - `KreuzbergError::Validation` - Validation failed
91
+ /// - Any other error type appropriate for the failure
92
+ ///
93
+ /// # Example - Content Length Validation
94
+ ///
95
+ /// ```rust
96
+ /// # use kreuzberg::plugins::{Plugin, Validator};
97
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
98
+ /// # use async_trait::async_trait;
99
+ /// # struct ContentLengthValidator { min: usize, max: usize }
100
+ /// # impl Plugin for ContentLengthValidator {
101
+ /// # fn name(&self) -> &str { "length-validator" }
102
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
103
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
104
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
105
+ /// # }
106
+ /// # #[async_trait]
107
+ /// # impl Validator for ContentLengthValidator {
108
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
109
+ /// -> Result<()> {
110
+ /// let length = result.content.len();
111
+ ///
112
+ /// if length < self.min {
113
+ /// return Err(KreuzbergError::validation(format!(
114
+ /// "Content too short: {} < {} characters",
115
+ /// length, self.min
116
+ /// )));
117
+ /// }
118
+ ///
119
+ /// if length > self.max {
120
+ /// return Err(KreuzbergError::validation(format!(
121
+ /// "Content too long: {} > {} characters",
122
+ /// length, self.max
123
+ /// )));
124
+ /// }
125
+ ///
126
+ /// Ok(())
127
+ /// }
128
+ /// # }
129
+ /// ```
130
+ ///
131
+ /// # Example - Quality Score Validation
132
+ ///
133
+ /// ```rust
134
+ /// # use kreuzberg::plugins::{Plugin, Validator};
135
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
136
+ /// # use async_trait::async_trait;
137
+ /// # struct QualityValidator { min_score: f64 }
138
+ /// # impl Plugin for QualityValidator {
139
+ /// # fn name(&self) -> &str { "quality-validator" }
140
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
141
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
142
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
143
+ /// # }
144
+ /// # #[async_trait]
145
+ /// # impl Validator for QualityValidator {
146
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
147
+ /// -> Result<()> {
148
+ /// // Check if quality_score exists in metadata
149
+ /// let score = result.metadata
150
+ /// .additional
151
+ /// .get("quality_score")
152
+ /// .and_then(|v| v.as_f64())
153
+ /// .unwrap_or(0.0);
154
+ ///
155
+ /// if score < self.min_score {
156
+ /// return Err(KreuzbergError::validation(format!(
157
+ /// "Quality score too low: {} < {}",
158
+ /// score, self.min_score
159
+ /// )));
160
+ /// }
161
+ ///
162
+ /// Ok(())
163
+ /// }
164
+ /// # }
165
+ /// ```
166
+ ///
167
+ /// # Example - Security Validation
168
+ ///
169
+ /// ```rust
170
+ /// # use kreuzberg::plugins::{Plugin, Validator};
171
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
172
+ /// # use async_trait::async_trait;
173
+ /// # struct SecurityValidator { blocked_patterns: Vec<String> }
174
+ /// # impl Plugin for SecurityValidator {
175
+ /// # fn name(&self) -> &str { "security-validator" }
176
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
177
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
178
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
179
+ /// # }
180
+ /// # #[async_trait]
181
+ /// # impl Validator for SecurityValidator {
182
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
183
+ /// -> Result<()> {
184
+ /// // Check for blocked patterns
185
+ /// for pattern in &self.blocked_patterns {
186
+ /// if result.content.contains(pattern) {
187
+ /// return Err(KreuzbergError::validation(format!(
188
+ /// "Content contains blocked pattern: {}",
189
+ /// pattern
190
+ /// )));
191
+ /// }
192
+ /// }
193
+ ///
194
+ /// Ok(())
195
+ /// }
196
+ /// # }
197
+ /// ```
198
+ async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> Result<()>;
199
+
200
+ /// Optional: Check if this validator should run for a given result.
201
+ ///
202
+ /// Allows conditional validation based on MIME type, metadata, or content.
203
+ /// Defaults to `true` (always run).
204
+ ///
205
+ /// # Arguments
206
+ ///
207
+ /// * `result` - The extraction result to check
208
+ /// * `config` - Extraction configuration
209
+ ///
210
+ /// # Returns
211
+ ///
212
+ /// `true` if the validator should run, `false` to skip.
213
+ ///
214
+ /// # Example
215
+ ///
216
+ /// ```rust
217
+ /// # use kreuzberg::plugins::{Plugin, Validator};
218
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
219
+ /// # use async_trait::async_trait;
220
+ /// # struct PdfValidator;
221
+ /// # impl Plugin for PdfValidator {
222
+ /// # fn name(&self) -> &str { "pdf-validator" }
223
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
224
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
225
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
226
+ /// # }
227
+ /// # #[async_trait]
228
+ /// # impl Validator for PdfValidator {
229
+ /// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
230
+ /// /// Only validate PDF documents
231
+ /// fn should_validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> bool {
232
+ /// result.mime_type == "application/pdf"
233
+ /// }
234
+ /// # }
235
+ /// ```
236
+ fn should_validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
237
+ true
238
+ }
239
+
240
+ /// Optional: Get the validation priority.
241
+ ///
242
+ /// Higher priority validators run first. Useful for ordering validation checks
243
+ /// (e.g., run cheap validations before expensive ones).
244
+ ///
245
+ /// Default priority is 50.
246
+ ///
247
+ /// # Returns
248
+ ///
249
+ /// Priority value (higher = runs earlier).
250
+ ///
251
+ /// # Example
252
+ ///
253
+ /// ```rust
254
+ /// # use kreuzberg::plugins::{Plugin, Validator};
255
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
256
+ /// # use async_trait::async_trait;
257
+ /// # struct FastValidator;
258
+ /// # impl Plugin for FastValidator {
259
+ /// # fn name(&self) -> &str { "fast-validator" }
260
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
261
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
262
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
263
+ /// # }
264
+ /// # #[async_trait]
265
+ /// # impl Validator for FastValidator {
266
+ /// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
267
+ /// /// Run this validator first (it's fast)
268
+ /// fn priority(&self) -> i32 {
269
+ /// 100
270
+ /// }
271
+ /// # }
272
+ /// ```
273
+ fn priority(&self) -> i32 {
274
+ 50
275
+ }
276
+ }
@@ -0,0 +1,40 @@
1
+ //! Asian language stopwords.
2
+ //!
3
+ //! Includes: Chinese (zh), Japanese (ja), Korean (ko), Thai (th),
4
+ //! Vietnamese (vi), Hindi (hi), Bengali (bn), Gujarati (gu),
5
+ //! Kannada (kn), Malayalam (ml), Marathi (mr), Tamil (ta),
6
+ //! Telugu (te), Nepali (ne), Sinhala (si), Urdu (ur).
7
+
8
+ use ahash::{AHashMap, AHashSet};
9
+
10
/// Macro to generate embedded stopwords for Asian languages.
///
/// For each language-code literal, the matching `<lang>_stopwords.json` file is
/// embedded at compile time with `include_str!`, parsed as a JSON array of
/// strings, collected into an `AHashSet<String>`, and inserted into `$map`
/// under the language code.
///
/// # Panics
///
/// Panics if an embedded file does not parse as a JSON array of strings.
macro_rules! embed_stopwords {
    ($map:expr, $($lang:literal),* $(,)?) => {
        $(
            {
                const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json"));
                // The data is compiled into the binary, so a parse failure is a
                // packaging bug rather than a recoverable runtime condition.
                let words: Vec<String> = serde_json::from_str(JSON).unwrap_or_else(|e| {
                    panic!(
                        "Failed to parse embedded stopwords for language '{}': {}. \
                         This indicates corrupted or malformed JSON in the embedded stopwords data. \
                         Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
                        $lang, e
                    )
                });
                $map.insert($lang.to_string(), words.into_iter().collect::<AHashSet<String>>());
            }
        )*
    };
}
34
+
35
+ /// Load Asian language stopwords into the provided map.
36
+ pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
37
+ embed_stopwords!(
38
+ map, "zh", "ja", "ko", "th", "vi", "hi", "bn", "gu", "kn", "ml", "mr", "ta", "te", "ne", "si", "ur"
39
+ );
40
+ }