kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,355 @@
1
+ //! Validator registry management.
2
+ //!
3
+ //! This module provides functions for managing the global validator registry.
4
+
5
+ use super::r#trait::Validator;
6
+ use std::sync::Arc;
7
+
8
+ /// Register a validator with the global registry.
9
+ ///
10
+ /// The validator will be registered with its default priority and will be called
11
+ /// during extraction validation. The validator's `name()` method is used as the
12
+ /// registration name.
13
+ ///
14
+ /// # Arguments
15
+ ///
16
+ /// * `validator` - The validator implementation wrapped in Arc
17
+ ///
18
+ /// # Returns
19
+ ///
20
+ /// - `Ok(())` if registration succeeded
21
+ /// - `Err(...)` if validation failed or initialization failed
22
+ ///
23
+ /// # Errors
24
+ ///
25
+ /// - `KreuzbergError::Validation` - Invalid validator name (empty or contains whitespace)
26
+ /// - Any error from the validator's `initialize()` method
27
+ ///
28
+ /// # Example
29
+ ///
30
+ /// ```rust
31
+ /// use kreuzberg::plugins::{Plugin, Validator, register_validator};
32
+ /// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
33
+ /// use async_trait::async_trait;
34
+ /// use std::sync::Arc;
35
+ ///
36
+ /// struct MinLengthValidator { min_length: usize }
37
+ ///
38
+ /// impl Plugin for MinLengthValidator {
39
+ /// fn name(&self) -> &str { "min-length" }
40
+ /// fn version(&self) -> String { "1.0.0".to_string() }
41
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
42
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
43
+ /// }
44
+ ///
45
+ /// #[async_trait]
46
+ /// impl Validator for MinLengthValidator {
47
+ /// async fn validate(&self, result: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
48
+ /// if result.content.len() < self.min_length {
49
+ /// return Err(KreuzbergError::validation(
50
+ /// format!("Content too short: {} < {}", result.content.len(), self.min_length)
51
+ /// ));
52
+ /// }
53
+ /// Ok(())
54
+ /// }
55
+ /// }
56
+ ///
57
+ /// # tokio_test::block_on(async {
58
+ /// let validator = Arc::new(MinLengthValidator { min_length: 10 });
59
+ /// register_validator(validator)?;
60
+ /// # Ok::<(), KreuzbergError>(())
61
+ /// # });
62
+ /// ```
63
+ pub fn register_validator(validator: Arc<dyn Validator>) -> crate::Result<()> {
64
+ use crate::plugins::registry::get_validator_registry;
65
+
66
+ let registry = get_validator_registry();
67
+ let mut registry = registry
68
+ .write()
69
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
70
+
71
+ registry.register(validator)
72
+ }
73
+
74
+ /// Unregister a validator by name.
75
+ ///
76
+ /// Removes the validator from the global registry and calls its `shutdown()` method.
77
+ ///
78
+ /// # Arguments
79
+ ///
80
+ /// * `name` - Name of the validator to unregister
81
+ ///
82
+ /// # Returns
83
+ ///
84
+ /// - `Ok(())` if the validator was unregistered or didn't exist
85
+ /// - `Err(...)` if the shutdown method failed
86
+ ///
87
+ /// # Example
88
+ ///
89
+ /// ```rust
90
+ /// use kreuzberg::plugins::unregister_validator;
91
+ ///
92
+ /// # tokio_test::block_on(async {
93
+ /// unregister_validator("min-length")?;
94
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
95
+ /// # });
96
+ /// ```
97
+ pub fn unregister_validator(name: &str) -> crate::Result<()> {
98
+ use crate::plugins::registry::get_validator_registry;
99
+
100
+ let registry = get_validator_registry();
101
+ let mut registry = registry
102
+ .write()
103
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
104
+
105
+ registry.remove(name)
106
+ }
107
+
108
+ /// List all registered validators.
109
+ ///
110
+ /// Returns the names of all validators currently registered in the global registry.
111
+ ///
112
+ /// # Returns
113
+ ///
114
+ /// A vector of validator names.
115
+ ///
116
+ /// # Example
117
+ ///
118
+ /// ```rust
119
+ /// use kreuzberg::plugins::list_validators;
120
+ ///
121
+ /// # tokio_test::block_on(async {
122
+ /// let validators = list_validators()?;
123
+ /// for name in validators {
124
+ /// println!("Registered validator: {}", name);
125
+ /// }
126
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
127
+ /// # });
128
+ /// ```
129
+ pub fn list_validators() -> crate::Result<Vec<String>> {
130
+ use crate::plugins::registry::get_validator_registry;
131
+
132
+ let registry = get_validator_registry();
133
+ let registry = registry
134
+ .read()
135
+ .expect("~keep Failed to acquire read lock on validator registry"); // ~keep
136
+
137
+ Ok(registry.list())
138
+ }
139
+
140
+ /// Clear all validators from the global registry.
141
+ ///
142
+ /// Removes all validators and calls their `shutdown()` methods.
143
+ ///
144
+ /// # Returns
145
+ ///
146
+ /// - `Ok(())` if all validators were cleared successfully
147
+ /// - `Err(...)` if any shutdown method failed
148
+ ///
149
+ /// # Example
150
+ ///
151
+ /// ```rust
152
+ /// use kreuzberg::plugins::clear_validators;
153
+ ///
154
+ /// # tokio_test::block_on(async {
155
+ /// clear_validators()?;
156
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
157
+ /// # });
158
+ /// ```
159
+ pub fn clear_validators() -> crate::Result<()> {
160
+ use crate::plugins::registry::get_validator_registry;
161
+
162
+ let registry = get_validator_registry();
163
+ let mut registry = registry
164
+ .write()
165
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
166
+
167
+ registry.shutdown_all()
168
+ }
169
+
170
#[cfg(test)]
mod tests {
    use super::*;
    use crate::KreuzbergError;
    use crate::Result;
    use crate::core::config::ExtractionConfig;
    use crate::plugins::Plugin;
    use crate::types::ExtractionResult;
    use async_trait::async_trait;
    use std::sync::Arc;

    /// Name under which every `MockValidator` registers itself.
    const MOCK_NAME: &str = "mock-validator";

    /// Minimal validator whose outcome is controlled by a flag.
    struct MockValidator {
        should_fail: bool,
    }

    impl Plugin for MockValidator {
        fn name(&self) -> &str {
            MOCK_NAME
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for MockValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            if !self.should_fail {
                return Ok(());
            }
            Err(KreuzbergError::validation("Validation failed".to_string()))
        }
    }

    /// Validator that reports an arbitrary fixed name; used to exercise the
    /// name checks performed during registration.
    struct FixedNameValidator(&'static str);

    impl Plugin for FixedNameValidator {
        fn name(&self) -> &str {
            self.0
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for FixedNameValidator {
        async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
            Ok(())
        }
    }

    /// Build a shared mock validator that always passes validation.
    fn passing_mock() -> Arc<MockValidator> {
        Arc::new(MockValidator { should_fail: false })
    }

    // The tests share one global registry, hence `serial` on every test.

    #[test]
    #[serial_test::serial]
    fn test_register_validator() {
        let outcome = register_validator(passing_mock());
        assert!(outcome.is_ok());

        // Best-effort cleanup so later tests start from a known state.
        let _ = unregister_validator(MOCK_NAME);
    }

    #[test]
    #[serial_test::serial]
    fn test_unregister_validator() {
        register_validator(passing_mock()).unwrap();

        let outcome = unregister_validator(MOCK_NAME);
        assert!(outcome.is_ok());
    }

    #[test]
    #[serial_test::serial]
    fn test_unregister_nonexistent_validator() {
        // Removing an unknown name must be treated as success.
        let outcome = unregister_validator("nonexistent-validator-xyz");
        assert!(outcome.is_ok());
    }

    #[test]
    #[serial_test::serial]
    fn test_list_validators() {
        clear_validators().unwrap();

        let first = passing_mock();
        let second = passing_mock();

        let names_before = list_validators().unwrap();
        assert_eq!(names_before.len(), 0);

        register_validator(first).unwrap();
        register_validator(second).unwrap();

        // Both mocks report the same name, so exactly one entry is expected.
        let names = list_validators().unwrap();
        assert_eq!(names.len(), 1);
        assert!(names.contains(&MOCK_NAME.to_string()));

        unregister_validator(MOCK_NAME).unwrap();
    }

    #[test]
    #[serial_test::serial]
    fn test_clear_validators() {
        clear_validators().unwrap();

        register_validator(passing_mock()).unwrap();
        register_validator(passing_mock()).unwrap();

        let names_before = list_validators().unwrap();
        assert!(!names_before.is_empty());

        let outcome = clear_validators();
        assert!(outcome.is_ok());

        let names_after = list_validators().unwrap();
        assert_eq!(names_after.len(), 0);
    }

    #[test]
    #[serial_test::serial]
    fn test_register_validator_with_invalid_name() {
        let validator = Arc::new(FixedNameValidator("invalid name with spaces"));
        let outcome = register_validator(validator);
        assert!(matches!(outcome, Err(KreuzbergError::Validation { .. })));
    }

    #[test]
    #[serial_test::serial]
    fn test_register_validator_with_empty_name() {
        let validator = Arc::new(FixedNameValidator(""));
        let outcome = register_validator(validator);
        assert!(matches!(outcome, Err(KreuzbergError::Validation { .. })));
    }
}
@@ -0,0 +1,276 @@
1
+ //! Validator plugin trait.
2
+ //!
3
+ //! This module defines the trait for implementing custom validation logic.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::ExtractionConfig;
7
+ use crate::plugins::Plugin;
8
+ use crate::types::ExtractionResult;
9
+ use async_trait::async_trait;
10
+
11
+ /// Trait for validator plugins.
12
+ ///
13
+ /// Validators check extraction results for quality, completeness, or correctness.
14
+ /// Unlike post-processors, validator errors **fail fast** - if a validator returns
15
+ /// an error, the extraction fails immediately.
16
+ ///
17
+ /// # Use Cases
18
+ ///
19
+ /// - **Quality Gates**: Ensure extracted content meets minimum quality standards
20
+ /// - **Compliance**: Verify content meets regulatory requirements
21
+ /// - **Content Filtering**: Reject documents containing unwanted content
22
+ /// - **Format Validation**: Verify extracted content structure
23
+ /// - **Security Checks**: Scan for malicious content
24
+ ///
25
+ /// # Error Handling
26
+ ///
27
+ /// Validator errors are **fatal** - they cause the extraction to fail and bubble up
28
+ /// to the caller. Use validators for hard requirements that must be met.
29
+ ///
30
+ /// For non-fatal checks, use post-processors instead.
31
+ ///
32
+ /// # Thread Safety
33
+ ///
34
+ /// Validators must be thread-safe (`Send + Sync`). On `wasm32` targets the trait is expanded with `async_trait(?Send)`, so the future returned by `validate` is not required to be `Send` there.
35
+ ///
36
+ /// # Example
37
+ ///
38
+ /// ```rust
39
+ /// use kreuzberg::plugins::{Plugin, Validator};
40
+ /// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
41
+ /// use async_trait::async_trait;
42
+ ///
43
+ /// /// Validate that extracted content has minimum length
44
+ /// struct MinimumLengthValidator {
45
+ /// min_length: usize,
46
+ /// }
47
+ ///
48
+ /// impl Plugin for MinimumLengthValidator {
49
+ /// fn name(&self) -> &str { "min-length-validator" }
50
+ /// fn version(&self) -> String { "1.0.0".to_string() }
51
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
52
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
53
+ /// }
54
+ ///
55
+ /// #[async_trait]
56
+ /// impl Validator for MinimumLengthValidator {
57
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
58
+ /// -> Result<()> {
59
+ /// if result.content.len() < self.min_length { // NOTE(review): if `content` is a `String`, `len()` is a byte count, not a character count
60
+ /// return Err(KreuzbergError::validation(format!(
61
+ /// "Content too short: {} < {} characters",
62
+ /// result.content.len(),
63
+ /// self.min_length
64
+ /// )));
65
+ /// }
66
+ /// Ok(())
67
+ /// }
68
+ /// }
69
+ /// ```
70
+ #[cfg_attr(not(target_arch = "wasm32"), async_trait)] // non-wasm targets: default async_trait expansion
71
+ #[cfg_attr(target_arch = "wasm32", async_trait(?Send))] // wasm32: drop the Send bound on the generated futures
72
+ pub trait Validator: Plugin { // supertrait bound: every Validator must also implement Plugin
73
+ /// Validate an extraction result.
74
+ ///
75
+ /// Check the extraction result and return `Ok(())` if valid, or an error
76
+ /// if validation fails.
77
+ ///
78
+ /// # Arguments
79
+ ///
80
+ /// * `result` - The extraction result to validate
81
+ /// * `config` - Extraction configuration
82
+ ///
83
+ /// # Returns
84
+ ///
85
+ /// - `Ok(())` if validation passes
86
+ /// - `Err(...)` if validation fails (extraction will fail)
87
+ ///
88
+ /// # Errors
89
+ ///
90
+ /// - `KreuzbergError::Validation` - Validation failed
91
+ /// - Any other error type appropriate for the failure
92
+ ///
93
+ /// # Example - Content Length Validation
94
+ ///
95
+ /// ```rust
96
+ /// # use kreuzberg::plugins::{Plugin, Validator};
97
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
98
+ /// # use async_trait::async_trait;
99
+ /// # struct ContentLengthValidator { min: usize, max: usize }
100
+ /// # impl Plugin for ContentLengthValidator {
101
+ /// # fn name(&self) -> &str { "length-validator" }
102
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
103
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
104
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
105
+ /// # }
106
+ /// # #[async_trait]
107
+ /// # impl Validator for ContentLengthValidator {
108
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
109
+ /// -> Result<()> {
110
+ /// let length = result.content.len();
111
+ ///
112
+ /// if length < self.min {
113
+ /// return Err(KreuzbergError::validation(format!(
114
+ /// "Content too short: {} < {} characters",
115
+ /// length, self.min
116
+ /// )));
117
+ /// }
118
+ ///
119
+ /// if length > self.max {
120
+ /// return Err(KreuzbergError::validation(format!(
121
+ /// "Content too long: {} > {} characters",
122
+ /// length, self.max
123
+ /// )));
124
+ /// }
125
+ ///
126
+ /// Ok(())
127
+ /// }
128
+ /// # }
129
+ /// ```
130
+ ///
131
+ /// # Example - Quality Score Validation
132
+ ///
133
+ /// ```rust
134
+ /// # use kreuzberg::plugins::{Plugin, Validator};
135
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
136
+ /// # use async_trait::async_trait;
137
+ /// # struct QualityValidator { min_score: f64 }
138
+ /// # impl Plugin for QualityValidator {
139
+ /// # fn name(&self) -> &str { "quality-validator" }
140
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
141
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
142
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
143
+ /// # }
144
+ /// # #[async_trait]
145
+ /// # impl Validator for QualityValidator {
146
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
147
+ /// -> Result<()> {
148
+ /// // Check if quality_score exists in metadata
149
+ /// let score = result.metadata
150
+ /// .additional
151
+ /// .get("quality_score")
152
+ /// .and_then(|v| v.as_f64())
153
+ /// .unwrap_or(0.0);
154
+ ///
155
+ /// if score < self.min_score {
156
+ /// return Err(KreuzbergError::validation(format!(
157
+ /// "Quality score too low: {} < {}",
158
+ /// score, self.min_score
159
+ /// )));
160
+ /// }
161
+ ///
162
+ /// Ok(())
163
+ /// }
164
+ /// # }
165
+ /// ```
166
+ ///
167
+ /// # Example - Security Validation
168
+ ///
169
+ /// ```rust
170
+ /// # use kreuzberg::plugins::{Plugin, Validator};
171
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
172
+ /// # use async_trait::async_trait;
173
+ /// # struct SecurityValidator { blocked_patterns: Vec<String> }
174
+ /// # impl Plugin for SecurityValidator {
175
+ /// # fn name(&self) -> &str { "security-validator" }
176
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
177
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
178
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
179
+ /// # }
180
+ /// # #[async_trait]
181
+ /// # impl Validator for SecurityValidator {
182
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
183
+ /// -> Result<()> {
184
+ /// // Check for blocked patterns
185
+ /// for pattern in &self.blocked_patterns {
186
+ /// if result.content.contains(pattern) {
187
+ /// return Err(KreuzbergError::validation(format!(
188
+ /// "Content contains blocked pattern: {}",
189
+ /// pattern
190
+ /// )));
191
+ /// }
192
+ /// }
193
+ ///
194
+ /// Ok(())
195
+ /// }
196
+ /// # }
197
+ /// ```
198
+ async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> Result<()>; // required method: no default implementation
199
+
200
+ /// Optional: Check if this validator should run for a given result.
201
+ ///
202
+ /// Allows conditional validation based on MIME type, metadata, or content.
203
+ /// Defaults to `true` (always run).
204
+ ///
205
+ /// # Arguments
206
+ ///
207
+ /// * `result` - The extraction result to check
208
+ /// * `config` - Extraction configuration
209
+ ///
210
+ /// # Returns
211
+ ///
212
+ /// `true` if the validator should run, `false` to skip.
213
+ ///
214
+ /// # Example
215
+ ///
216
+ /// ```rust
217
+ /// # use kreuzberg::plugins::{Plugin, Validator};
218
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
219
+ /// # use async_trait::async_trait;
220
+ /// # struct PdfValidator;
221
+ /// # impl Plugin for PdfValidator {
222
+ /// # fn name(&self) -> &str { "pdf-validator" }
223
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
224
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
225
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
226
+ /// # }
227
+ /// # #[async_trait]
228
+ /// # impl Validator for PdfValidator {
229
+ /// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
230
+ /// /// Only validate PDF documents
231
+ /// fn should_validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> bool {
232
+ /// result.mime_type == "application/pdf"
233
+ /// }
234
+ /// # }
235
+ /// ```
236
+ fn should_validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> bool { // default ignores both arguments
237
+ true
238
+ }
239
+
240
+ /// Optional: Get the validation priority.
241
+ ///
242
+ /// Higher priority validators run first. Useful for ordering validation checks
243
+ /// (e.g., run cheap validations before expensive ones).
244
+ ///
245
+ /// Default priority is 50.
246
+ ///
247
+ /// # Returns
248
+ ///
249
+ /// Priority value (higher = runs earlier).
250
+ ///
251
+ /// # Example
252
+ ///
253
+ /// ```rust
254
+ /// # use kreuzberg::plugins::{Plugin, Validator};
255
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
256
+ /// # use async_trait::async_trait;
257
+ /// # struct FastValidator;
258
+ /// # impl Plugin for FastValidator {
259
+ /// # fn name(&self) -> &str { "fast-validator" }
260
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
261
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
262
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
263
+ /// # }
264
+ /// # #[async_trait]
265
+ /// # impl Validator for FastValidator {
266
+ /// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
267
+ /// /// Run this validator first (it's fast)
268
+ /// fn priority(&self) -> i32 {
269
+ /// 100
270
+ /// }
271
+ /// # }
272
+ /// ```
273
+ fn priority(&self) -> i32 {
274
+ 50 // default priority, as stated in the doc comment above
275
+ }
276
+ }
@@ -0,0 +1,40 @@
1
+ //! Asian language stopwords.
2
+ //!
3
+ //! Includes: Chinese (zh), Japanese (ja), Korean (ko), Thai (th),
4
+ //! Vietnamese (vi), Hindi (hi), Bengali (bn), Gujarati (gu),
5
+ //! Kannada (kn), Malayalam (ml), Marathi (mr), Tamil (ta),
6
+ //! Telugu (te), Nepali (ne), Sinhala (si), Urdu (ur).
7
+
8
+ use ahash::{AHashMap, AHashSet};
9
+
10
+ /// Embeds `<lang>_stopwords.json` for each listed language via `include_str!`, parses it as a JSON array of strings, and inserts the resulting set into `$map` under the language code; panics if an embedded file fails to parse.
11
+ macro_rules! embed_stopwords {
12
+ ($map:expr, $($lang:literal),* $(,)?) => { // target map, then comma-separated language-code literals (trailing comma allowed)
13
+ $(
14
+ {
15
+ const JSON: &str = include_str!(concat!("../../../stopwords/", $lang, "_stopwords.json")); // embedded at compile time; path is relative to this source file
16
+ match serde_json::from_str::<Vec<String>>(JSON) { // each file is expected to be a JSON array of strings
17
+ Ok(words) => {
18
+ let set: AHashSet<String> = words.into_iter().collect(); // collecting into a set drops duplicate words
19
+ $map.insert($lang.to_string(), set);
20
+ }
21
+ Err(e) => { // reached only when the embedded JSON does not parse as Vec<String>
22
+ panic!(
23
+ "Failed to parse embedded stopwords for language '{}': {}. \
24
+ This indicates corrupted or malformed JSON in the embedded stopwords data. \
25
+ Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
26
+ $lang, e
27
+ );
28
+ }
29
+ }
30
+ }
31
+ )*
32
+ };
33
+ }
34
+
35
+ /// Load the embedded Asian-language stopword sets into `map`, one entry per language code (e.g. "zh", "ja"); panics if any embedded stopwords file fails to parse.
36
+ pub(in crate::stopwords) fn load_stopwords(map: &mut AHashMap<String, AHashSet<String>>) {
37
+ embed_stopwords!( // expands to one parse-and-insert block per language code below
38
+ map, "zh", "ja", "ko", "th", "vi", "hi", "bn", "gu", "kn", "ml", "mr", "ta", "te", "ne", "si", "ur"
39
+ );
40
+ }