kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,975 @@
1
+ //! Pipeline orchestration tests.
2
+
3
+ use super::*;
4
+ use crate::core::config::OutputFormat;
5
+ use crate::types::Metadata;
6
+ use lazy_static::lazy_static;
7
+
8
+ const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
9
+ #[cfg(feature = "quality")]
10
+ const QUALITY_VALIDATION_MARKER: &str = "quality_validation_test";
11
+ const POSTPROCESSOR_VALIDATION_MARKER: &str = "postprocessor_validation_test";
12
+ const ORDER_VALIDATION_MARKER: &str = "order_validation_test";
13
+
14
+ lazy_static! {
15
+ static ref REGISTRY_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
16
+ }
17
+
18
+ #[tokio::test]
19
+ async fn test_run_pipeline_basic() {
20
+ let mut result = ExtractionResult {
21
+ content: "test".to_string(),
22
+ mime_type: "text/plain".to_string(),
23
+ metadata: Metadata::default(),
24
+ tables: vec![],
25
+ detected_languages: None,
26
+ chunks: None,
27
+ images: None,
28
+ djot_content: None,
29
+ pages: None,
30
+ elements: None,
31
+ };
32
+ result.metadata.additional.insert(
33
+ VALIDATION_MARKER_KEY.to_string(),
34
+ serde_json::json!(ORDER_VALIDATION_MARKER),
35
+ );
36
+ let config = ExtractionConfig::default();
37
+
38
+ let processed = run_pipeline(result, &config).await.unwrap();
39
+ assert_eq!(processed.content, "test");
40
+ }
41
+
42
/// With `enable_quality_processing` switched on, the processed metadata must
/// contain a `quality_score` entry.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_pipeline_with_quality_processing() {
    let quality_config = ExtractionConfig {
        enable_quality_processing: true,
        ..ExtractionConfig::default()
    };
    let input = ExtractionResult {
        content: String::from("This is a test document with some meaningful content."),
        mime_type: String::from("text/plain"),
        metadata: Metadata::default(),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let output = run_pipeline(input, &quality_config).await.unwrap();
    let extras = &output.metadata.additional;
    assert!(extras.contains_key("quality_score"));
}
65
+
66
+ #[tokio::test]
67
+ async fn test_pipeline_without_quality_processing() {
68
+ let result = ExtractionResult {
69
+ content: "test".to_string(),
70
+ mime_type: "text/plain".to_string(),
71
+ metadata: Metadata::default(),
72
+ tables: vec![],
73
+ detected_languages: None,
74
+ chunks: None,
75
+ images: None,
76
+ djot_content: None,
77
+ pages: None,
78
+ elements: None,
79
+ };
80
+ let config = ExtractionConfig {
81
+ enable_quality_processing: false,
82
+ ..Default::default()
83
+ };
84
+
85
+ let processed = run_pipeline(result, &config).await.unwrap();
86
+ assert!(!processed.metadata.additional.contains_key("quality_score"));
87
+ }
88
+
89
/// Feeds a long repeated sentence through the pipeline with chunking
/// configured (500 chars, 50 overlap) and expects a `chunk_count` metadata
/// entry greater than one.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_pipeline_with_chunking() {
    let chunking_config = crate::ChunkingConfig {
        max_chars: 500,
        max_overlap: 50,
        embedding: None,
        preset: None,
    };
    let config = ExtractionConfig {
        chunking: Some(chunking_config),
        ..ExtractionConfig::default()
    };

    let long_text = "This is a long text that should be chunked. ".repeat(100);
    let input = ExtractionResult {
        content: long_text,
        mime_type: String::from("text/plain"),
        metadata: Metadata::default(),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let output = run_pipeline(input, &config).await.unwrap();
    let extras = &output.metadata.additional;
    assert!(extras.contains_key("chunk_count"));

    let reported_chunks = extras.get("chunk_count").unwrap().as_u64().unwrap();
    assert!(reported_chunks > 1);
}
119
+
120
+ #[tokio::test]
121
+ async fn test_pipeline_without_chunking() {
122
+ let result = ExtractionResult {
123
+ content: "test".to_string(),
124
+ mime_type: "text/plain".to_string(),
125
+ metadata: Metadata::default(),
126
+ tables: vec![],
127
+ detected_languages: None,
128
+ chunks: None,
129
+ images: None,
130
+ djot_content: None,
131
+ pages: None,
132
+ elements: None,
133
+ };
134
+ let config = ExtractionConfig {
135
+ chunking: None,
136
+ ..Default::default()
137
+ };
138
+
139
+ let processed = run_pipeline(result, &config).await.unwrap();
140
+ assert!(!processed.metadata.additional.contains_key("chunk_count"));
141
+ }
142
+
143
+ #[tokio::test]
144
+ async fn test_pipeline_preserves_metadata() {
145
+ use std::collections::HashMap;
146
+ let mut additional = HashMap::new();
147
+ additional.insert("source".to_string(), serde_json::json!("test"));
148
+ additional.insert("page".to_string(), serde_json::json!(1));
149
+
150
+ let result = ExtractionResult {
151
+ content: "test".to_string(),
152
+ mime_type: "text/plain".to_string(),
153
+ metadata: Metadata {
154
+ additional,
155
+ ..Default::default()
156
+ },
157
+ pages: None,
158
+ tables: vec![],
159
+ detected_languages: None,
160
+ chunks: None,
161
+ images: None,
162
+ djot_content: None,
163
+ elements: None,
164
+ };
165
+ let config = ExtractionConfig::default();
166
+
167
+ let processed = run_pipeline(result, &config).await.unwrap();
168
+ assert_eq!(
169
+ processed.metadata.additional.get("source").unwrap(),
170
+ &serde_json::json!("test")
171
+ );
172
+ assert_eq!(
173
+ processed.metadata.additional.get("page").unwrap(),
174
+ &serde_json::json!(1)
175
+ );
176
+ }
177
+
178
+ #[tokio::test]
179
+ async fn test_pipeline_preserves_tables() {
180
+ use crate::types::Table;
181
+
182
+ let table = Table {
183
+ cells: vec![vec!["A".to_string(), "B".to_string()]],
184
+ markdown: "| A | B |".to_string(),
185
+ page_number: 0,
186
+ };
187
+
188
+ let result = ExtractionResult {
189
+ content: "test".to_string(),
190
+ mime_type: "text/plain".to_string(),
191
+ metadata: Metadata::default(),
192
+ tables: vec![table],
193
+ detected_languages: None,
194
+ chunks: None,
195
+ images: None,
196
+ djot_content: None,
197
+ pages: None,
198
+ elements: None,
199
+ };
200
+ let config = ExtractionConfig::default();
201
+
202
+ let processed = run_pipeline(result, &config).await.unwrap();
203
+ assert_eq!(processed.tables.len(), 1);
204
+ assert_eq!(processed.tables[0].cells.len(), 1);
205
+ }
206
+
207
+ #[tokio::test]
208
+ async fn test_pipeline_empty_content() {
209
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
210
+
211
+ {
212
+ let registry = crate::plugins::registry::get_post_processor_registry();
213
+ registry.write().unwrap().shutdown_all().unwrap();
214
+ }
215
+ {
216
+ let registry = crate::plugins::registry::get_validator_registry();
217
+ registry.write().unwrap().shutdown_all().unwrap();
218
+ }
219
+
220
+ let result = ExtractionResult {
221
+ content: String::new(),
222
+ mime_type: "text/plain".to_string(),
223
+ metadata: Metadata::default(),
224
+ tables: vec![],
225
+ detected_languages: None,
226
+ chunks: None,
227
+ images: None,
228
+ djot_content: None,
229
+ pages: None,
230
+ elements: None,
231
+ };
232
+ let config = ExtractionConfig::default();
233
+
234
+ drop(_guard);
235
+
236
+ let processed = run_pipeline(result, &config).await.unwrap();
237
+ assert_eq!(processed.content, "");
238
+ }
239
+
240
/// Enables quality processing and chunking together and checks that the
/// pipeline records the corresponding metadata entries.
///
/// The test itself only requires the `chunking` feature. The `quality_score`
/// assertion is additionally gated on the `quality` feature, matching
/// `test_pipeline_with_quality_processing`, so a build with `chunking` but
/// without `quality` does not assert on a score it is not expected to produce.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_pipeline_with_all_features() {
    let result = ExtractionResult {
        content: "This is a comprehensive test document. ".repeat(50),
        mime_type: "text/plain".to_string(),
        metadata: Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };
    let config = ExtractionConfig {
        enable_quality_processing: true,
        chunking: Some(crate::ChunkingConfig {
            max_chars: 500,
            max_overlap: 50,
            embedding: None,
            preset: None,
        }),
        ..Default::default()
    };

    let processed = run_pipeline(result, &config).await.unwrap();

    #[cfg(feature = "quality")]
    assert!(processed.metadata.additional.contains_key("quality_score"));

    assert!(processed.metadata.additional.contains_key("chunk_count"));
}
270
+
271
/// Registers the keyword processor on freshly reset registries, runs the
/// pipeline with a keyword configuration (YAKE when available, otherwise
/// RAKE), and checks the shape of the resulting `keywords` metadata: a
/// non-empty array whose first element is an object with `text`, `score` and
/// `algorithm` fields.
///
/// Ignored by default because it needs test isolation (see the `ignore`
/// reason).
#[tokio::test]
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
#[ignore = "Requires test isolation - run with --test-threads=1 or individually with --include-ignored"]
// The registry guard is intentionally held for the whole test, including
// across the pipeline `.await`.
#[allow(clippy::await_holding_lock)]
async fn test_pipeline_with_keyword_extraction() {
    // The guard protects no data (`Mutex<()>`), so recover from poisoning
    // instead of panicking: a failed assertion in one guarded test must not
    // make every other guarded test fail on `lock()`.
    let _guard = REGISTRY_TEST_GUARD
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    crate::plugins::registry::get_validator_registry()
        .write()
        .unwrap()
        .shutdown_all()
        .unwrap();
    crate::plugins::registry::get_post_processor_registry()
        .write()
        .unwrap()
        .shutdown_all()
        .unwrap();

    // The registration result is deliberately ignored.
    let _ = crate::keywords::register_keyword_processor();

    let result = ExtractionResult {
        content: r#"
Machine learning is a branch of artificial intelligence that focuses on
building systems that can learn from data. Deep learning is a subset of
machine learning that uses neural networks with multiple layers.
Natural language processing enables computers to understand human language.
"#
        .to_string(),
        mime_type: "text/plain".to_string(),
        metadata: Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    // Exactly one of these bindings is compiled: YAKE wins when both keyword
    // features are enabled.
    #[cfg(feature = "keywords-yake")]
    let keyword_config = crate::keywords::KeywordConfig::yake();

    #[cfg(all(feature = "keywords-rake", not(feature = "keywords-yake")))]
    let keyword_config = crate::keywords::KeywordConfig::rake();

    let config = ExtractionConfig {
        keywords: Some(keyword_config),
        ..Default::default()
    };

    let processed = run_pipeline(result, &config).await.unwrap();

    assert!(processed.metadata.additional.contains_key("keywords"));

    let keywords_value = processed.metadata.additional.get("keywords").unwrap();
    assert!(keywords_value.is_array());

    let keywords = keywords_value.as_array().unwrap();
    assert!(!keywords.is_empty(), "Should have extracted keywords");

    let first_keyword = &keywords[0];
    assert!(first_keyword.is_object());
    assert!(first_keyword.get("text").is_some());
    assert!(first_keyword.get("score").is_some());
    assert!(first_keyword.get("algorithm").is_some());
}
336
+
337
/// With `keywords` set to `None`, the pipeline must not add a `keywords`
/// entry to the metadata.
#[tokio::test]
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
async fn test_pipeline_without_keyword_config() {
    // Acquire and immediately release the registry guard. Presumably this
    // waits for any test that is currently resetting the registries - confirm
    // against the guard's other users. Poisoning is recovered because the
    // mutex guards no data and an unrelated failed test should not fail this
    // one.
    drop(
        REGISTRY_TEST_GUARD
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner()),
    );

    let result = ExtractionResult {
        content: "Machine learning and artificial intelligence.".to_string(),
        mime_type: "text/plain".to_string(),
        metadata: Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let config = ExtractionConfig {
        keywords: None,
        ..Default::default()
    };

    let processed = run_pipeline(result, &config).await.unwrap();

    assert!(!processed.metadata.additional.contains_key("keywords"));
}
365
+
366
/// Runs the pipeline with a keyword configuration on the two-word input
/// `"Short text"` after resetting the validator and post-processor
/// registries, and expects no `keywords` entry in the metadata.
#[tokio::test]
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
async fn test_pipeline_keyword_extraction_short_content() {
    {
        // The guard protects no data (`Mutex<()>`), so a poisoned lock left by
        // another failing test is safe to recover. The scope releases the
        // guard before the `.await` below so the std mutex is never held
        // across an await point.
        let _registry_lock = REGISTRY_TEST_GUARD
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());

        crate::plugins::registry::get_validator_registry()
            .write()
            .unwrap()
            .shutdown_all()
            .unwrap();
        crate::plugins::registry::get_post_processor_registry()
            .write()
            .unwrap()
            .shutdown_all()
            .unwrap();
    }

    let result = ExtractionResult {
        content: "Short text".to_string(),
        mime_type: "text/plain".to_string(),
        metadata: Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    // Exactly one of these bindings is compiled: YAKE wins when both keyword
    // features are enabled.
    #[cfg(feature = "keywords-yake")]
    let keyword_config = crate::keywords::KeywordConfig::yake();

    #[cfg(all(feature = "keywords-rake", not(feature = "keywords-yake")))]
    let keyword_config = crate::keywords::KeywordConfig::rake();

    let config = ExtractionConfig {
        keywords: Some(keyword_config),
        ..Default::default()
    };

    let processed = run_pipeline(result, &config).await.unwrap();

    assert!(!processed.metadata.additional.contains_key("keywords"));
}
411
+
412
+ #[tokio::test]
413
+ async fn test_postprocessor_runs_before_validator() {
414
+ use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
415
+ use async_trait::async_trait;
416
+ use std::sync::Arc;
417
+
418
+ struct TestPostProcessor;
419
+ impl Plugin for TestPostProcessor {
420
+ fn name(&self) -> &str {
421
+ "test-processor"
422
+ }
423
+ fn version(&self) -> String {
424
+ "1.0.0".to_string()
425
+ }
426
+ fn initialize(&self) -> Result<()> {
427
+ Ok(())
428
+ }
429
+ fn shutdown(&self) -> Result<()> {
430
+ Ok(())
431
+ }
432
+ }
433
+
434
+ #[async_trait]
435
+ impl PostProcessor for TestPostProcessor {
436
+ async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
437
+ result
438
+ .metadata
439
+ .additional
440
+ .insert("processed".to_string(), serde_json::json!(true));
441
+ Ok(())
442
+ }
443
+
444
+ fn processing_stage(&self) -> ProcessingStage {
445
+ ProcessingStage::Middle
446
+ }
447
+ }
448
+
449
+ struct TestValidator;
450
+ impl Plugin for TestValidator {
451
+ fn name(&self) -> &str {
452
+ "test-validator"
453
+ }
454
+ fn version(&self) -> String {
455
+ "1.0.0".to_string()
456
+ }
457
+ fn initialize(&self) -> Result<()> {
458
+ Ok(())
459
+ }
460
+ fn shutdown(&self) -> Result<()> {
461
+ Ok(())
462
+ }
463
+ }
464
+
465
+ #[async_trait]
466
+ impl Validator for TestValidator {
467
+ async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
468
+ let should_validate = result
469
+ .metadata
470
+ .additional
471
+ .get(VALIDATION_MARKER_KEY)
472
+ .and_then(|v| v.as_str())
473
+ == Some(POSTPROCESSOR_VALIDATION_MARKER);
474
+
475
+ if !should_validate {
476
+ return Ok(());
477
+ }
478
+
479
+ let processed = result
480
+ .metadata
481
+ .additional
482
+ .get("processed")
483
+ .and_then(|v| v.as_bool())
484
+ .unwrap_or(false);
485
+
486
+ if !processed {
487
+ return Err(crate::KreuzbergError::Validation {
488
+ message: "Post-processor did not run before validator".to_string(),
489
+ source: None,
490
+ });
491
+ }
492
+ Ok(())
493
+ }
494
+ }
495
+
496
+ let pp_registry = crate::plugins::registry::get_post_processor_registry();
497
+ let val_registry = crate::plugins::registry::get_validator_registry();
498
+
499
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
500
+ clear_processor_cache().unwrap();
501
+ pp_registry.write().unwrap().shutdown_all().unwrap();
502
+ val_registry.write().unwrap().shutdown_all().unwrap();
503
+ clear_processor_cache().unwrap();
504
+
505
+ {
506
+ let mut registry = pp_registry.write().unwrap();
507
+ registry.register(Arc::new(TestPostProcessor), 0).unwrap();
508
+ }
509
+
510
+ {
511
+ let mut registry = val_registry.write().unwrap();
512
+ registry.register(Arc::new(TestValidator)).unwrap();
513
+ }
514
+
515
+ // Clear the cache after registering new processors so it rebuilds with the test processors
516
+ clear_processor_cache().unwrap();
517
+
518
+ let mut result = ExtractionResult {
519
+ content: "test".to_string(),
520
+ mime_type: "text/plain".to_string(),
521
+ metadata: Metadata::default(),
522
+ tables: vec![],
523
+ detected_languages: None,
524
+ chunks: None,
525
+ images: None,
526
+ djot_content: None,
527
+ pages: None,
528
+ elements: None,
529
+ };
530
+ result.metadata.additional.insert(
531
+ VALIDATION_MARKER_KEY.to_string(),
532
+ serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
533
+ );
534
+
535
+ let config = ExtractionConfig {
536
+ postprocessor: Some(crate::core::config::PostProcessorConfig {
537
+ enabled: true,
538
+ enabled_set: None,
539
+ disabled_set: None,
540
+ enabled_processors: None,
541
+ disabled_processors: None,
542
+ }),
543
+ ..Default::default()
544
+ };
545
+ drop(_guard);
546
+
547
+ let processed = run_pipeline(result, &config).await;
548
+
549
+ pp_registry.write().unwrap().shutdown_all().unwrap();
550
+ val_registry.write().unwrap().shutdown_all().unwrap();
551
+
552
+ assert!(processed.is_ok(), "Validator should have seen post-processor metadata");
553
+ let processed = processed.unwrap();
554
+ assert_eq!(
555
+ processed.metadata.additional.get("processed"),
556
+ Some(&serde_json::json!(true)),
557
+ "Post-processor metadata should be present"
558
+ );
559
+ }
560
+
561
/// Verifies that quality processing has populated `"quality_score"` in
/// `metadata.additional` before validators are invoked.
///
/// `QualityValidator` only acts on results tagged with
/// `QUALITY_VALIDATION_MARKER`; for those it fails when `"quality_score"`
/// is missing.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_processing_runs_before_validator() {
    use crate::plugins::{Plugin, Validator};
    use async_trait::async_trait;
    use std::sync::Arc;

    let registry_lock = REGISTRY_TEST_GUARD.lock().unwrap();

    struct QualityValidator;
    impl Plugin for QualityValidator {
        fn name(&self) -> &str {
            "quality-validator"
        }
        fn version(&self) -> String {
            String::from("1.0.0")
        }
        fn initialize(&self) -> Result<()> {
            Ok(())
        }
        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for QualityValidator {
        async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            let extras = &result.metadata.additional;

            // Ignore results that were not tagged by this test.
            let marker = extras.get(VALIDATION_MARKER_KEY).and_then(|v| v.as_str());
            if marker != Some(QUALITY_VALIDATION_MARKER) {
                return Ok(());
            }

            if extras.contains_key("quality_score") {
                Ok(())
            } else {
                Err(crate::KreuzbergError::Validation {
                    message: "Quality processing did not run before validator".to_string(),
                    source: None,
                })
            }
        }
    }

    let validators = crate::plugins::registry::get_validator_registry();
    validators
        .write()
        .unwrap()
        .register(Arc::new(QualityValidator))
        .unwrap();

    let mut input = ExtractionResult {
        content: String::from("This is meaningful test content for quality scoring."),
        mime_type: String::from("text/plain"),
        metadata: Metadata::default(),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };
    // Tag the input so QualityValidator actually checks it.
    input.metadata.additional.insert(
        VALIDATION_MARKER_KEY.to_string(),
        serde_json::json!(QUALITY_VALIDATION_MARKER),
    );

    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };

    // Release the guard before awaiting the pipeline.
    drop(registry_lock);

    let outcome = run_pipeline(input, &config).await;

    // Unregister before asserting so a failure does not leave the validator behind.
    validators.write().unwrap().remove("quality-validator").unwrap();

    assert!(outcome.is_ok(), "Validator should have seen quality_score");
}
648
+
649
+ #[tokio::test]
650
+ async fn test_multiple_postprocessors_run_before_validator() {
651
+ use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
652
+ use async_trait::async_trait;
653
+ use std::sync::Arc;
654
+
655
+ struct EarlyProcessor;
656
+ impl Plugin for EarlyProcessor {
657
+ fn name(&self) -> &str {
658
+ "early-proc"
659
+ }
660
+ fn version(&self) -> String {
661
+ "1.0.0".to_string()
662
+ }
663
+ fn initialize(&self) -> Result<()> {
664
+ Ok(())
665
+ }
666
+ fn shutdown(&self) -> Result<()> {
667
+ Ok(())
668
+ }
669
+ }
670
+
671
+ #[async_trait]
672
+ impl PostProcessor for EarlyProcessor {
673
+ async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
674
+ let mut order = result
675
+ .metadata
676
+ .additional
677
+ .get("execution_order")
678
+ .and_then(|v| v.as_array())
679
+ .cloned()
680
+ .unwrap_or_default();
681
+ order.push(serde_json::json!("early"));
682
+ result
683
+ .metadata
684
+ .additional
685
+ .insert("execution_order".to_string(), serde_json::json!(order));
686
+ Ok(())
687
+ }
688
+
689
+ fn processing_stage(&self) -> ProcessingStage {
690
+ ProcessingStage::Early
691
+ }
692
+ }
693
+
694
+ struct LateProcessor;
695
+ impl Plugin for LateProcessor {
696
+ fn name(&self) -> &str {
697
+ "late-proc"
698
+ }
699
+ fn version(&self) -> String {
700
+ "1.0.0".to_string()
701
+ }
702
+ fn initialize(&self) -> Result<()> {
703
+ Ok(())
704
+ }
705
+ fn shutdown(&self) -> Result<()> {
706
+ Ok(())
707
+ }
708
+ }
709
+
710
+ #[async_trait]
711
+ impl PostProcessor for LateProcessor {
712
+ async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
713
+ let mut order = result
714
+ .metadata
715
+ .additional
716
+ .get("execution_order")
717
+ .and_then(|v| v.as_array())
718
+ .cloned()
719
+ .unwrap_or_default();
720
+ order.push(serde_json::json!("late"));
721
+ result
722
+ .metadata
723
+ .additional
724
+ .insert("execution_order".to_string(), serde_json::json!(order));
725
+ Ok(())
726
+ }
727
+
728
+ fn processing_stage(&self) -> ProcessingStage {
729
+ ProcessingStage::Late
730
+ }
731
+ }
732
+
733
+ struct OrderValidator;
734
+ impl Plugin for OrderValidator {
735
+ fn name(&self) -> &str {
736
+ "order-validator"
737
+ }
738
+ fn version(&self) -> String {
739
+ "1.0.0".to_string()
740
+ }
741
+ fn initialize(&self) -> Result<()> {
742
+ Ok(())
743
+ }
744
+ fn shutdown(&self) -> Result<()> {
745
+ Ok(())
746
+ }
747
+ }
748
+
749
+ #[async_trait]
750
+ impl Validator for OrderValidator {
751
+ async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
752
+ let should_validate = result
753
+ .metadata
754
+ .additional
755
+ .get(VALIDATION_MARKER_KEY)
756
+ .and_then(|v| v.as_str())
757
+ == Some(ORDER_VALIDATION_MARKER);
758
+
759
+ if !should_validate {
760
+ return Ok(());
761
+ }
762
+
763
+ let order = result
764
+ .metadata
765
+ .additional
766
+ .get("execution_order")
767
+ .and_then(|v| v.as_array())
768
+ .ok_or_else(|| crate::KreuzbergError::Validation {
769
+ message: "No execution order found".to_string(),
770
+ source: None,
771
+ })?;
772
+
773
+ if order.len() != 2 {
774
+ return Err(crate::KreuzbergError::Validation {
775
+ message: format!("Expected 2 processors to run, got {}", order.len()),
776
+ source: None,
777
+ });
778
+ }
779
+
780
+ if order[0] != "early" || order[1] != "late" {
781
+ return Err(crate::KreuzbergError::Validation {
782
+ message: format!("Wrong execution order: {:?}", order),
783
+ source: None,
784
+ });
785
+ }
786
+
787
+ Ok(())
788
+ }
789
+ }
790
+
791
+ let pp_registry = crate::plugins::registry::get_post_processor_registry();
792
+ let val_registry = crate::plugins::registry::get_validator_registry();
793
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
794
+
795
+ pp_registry.write().unwrap().shutdown_all().unwrap();
796
+ val_registry.write().unwrap().shutdown_all().unwrap();
797
+ clear_processor_cache().unwrap();
798
+
799
+ {
800
+ let mut registry = pp_registry.write().unwrap();
801
+ registry.register(Arc::new(EarlyProcessor), 0).unwrap();
802
+ registry.register(Arc::new(LateProcessor), 0).unwrap();
803
+ }
804
+
805
+ {
806
+ let mut registry = val_registry.write().unwrap();
807
+ registry.register(Arc::new(OrderValidator)).unwrap();
808
+ }
809
+
810
+ // Clear the cache after registering new processors so it rebuilds with the test processors
811
+ clear_processor_cache().unwrap();
812
+
813
+ let result = ExtractionResult {
814
+ content: "test".to_string(),
815
+ mime_type: "text/plain".to_string(),
816
+ metadata: Metadata::default(),
817
+ tables: vec![],
818
+ detected_languages: None,
819
+ chunks: None,
820
+ images: None,
821
+ djot_content: None,
822
+ pages: None,
823
+ elements: None,
824
+ };
825
+
826
+ let config = ExtractionConfig::default();
827
+ drop(_guard);
828
+
829
+ let processed = run_pipeline(result, &config).await;
830
+
831
+ pp_registry.write().unwrap().shutdown_all().unwrap();
832
+ val_registry.write().unwrap().shutdown_all().unwrap();
833
+ clear_processor_cache().unwrap();
834
+
835
+ assert!(processed.is_ok(), "All processors should run before validator");
836
+ }
837
+
838
+ #[tokio::test]
839
+ async fn test_run_pipeline_with_output_format_plain() {
840
+ let result = ExtractionResult {
841
+ content: "test content".to_string(),
842
+ mime_type: "text/plain".to_string(),
843
+ metadata: Metadata::default(),
844
+ tables: vec![],
845
+ detected_languages: None,
846
+ chunks: None,
847
+ images: None,
848
+ pages: None,
849
+ djot_content: None,
850
+ elements: None,
851
+ };
852
+
853
+ let config = crate::core::config::ExtractionConfig {
854
+ output_format: OutputFormat::Plain,
855
+ ..Default::default()
856
+ };
857
+
858
+ let processed = run_pipeline(result, &config).await.unwrap();
859
+ assert_eq!(processed.content, "test content");
860
+ }
861
+
862
+ #[tokio::test]
863
+ async fn test_run_pipeline_with_output_format_djot() {
864
+ use crate::types::{BlockType, DjotContent, FormattedBlock, InlineElement, InlineType};
865
+
866
+ let result = ExtractionResult {
867
+ content: "test content".to_string(),
868
+ mime_type: "text/djot".to_string(),
869
+ metadata: Metadata::default(),
870
+ tables: vec![],
871
+ detected_languages: None,
872
+ chunks: None,
873
+ images: None,
874
+ pages: None,
875
+ elements: None,
876
+ djot_content: Some(DjotContent {
877
+ plain_text: "test content".to_string(),
878
+ blocks: vec![FormattedBlock {
879
+ block_type: BlockType::Paragraph,
880
+ level: None,
881
+ inline_content: vec![InlineElement {
882
+ element_type: InlineType::Text,
883
+ content: "test content".to_string(),
884
+ attributes: None,
885
+ metadata: None,
886
+ }],
887
+ attributes: None,
888
+ language: None,
889
+ code: None,
890
+ children: vec![],
891
+ }],
892
+ metadata: Metadata::default(),
893
+ tables: vec![],
894
+ images: vec![],
895
+ links: vec![],
896
+ footnotes: vec![],
897
+ attributes: std::collections::HashMap::new(),
898
+ }),
899
+ };
900
+
901
+ let config = crate::core::config::ExtractionConfig {
902
+ output_format: OutputFormat::Djot,
903
+ ..Default::default()
904
+ };
905
+
906
+ let processed = run_pipeline(result, &config).await.unwrap();
907
+ // The content should still be present
908
+ assert!(!processed.content.is_empty());
909
+ }
910
+
911
+ #[tokio::test]
912
+ async fn test_run_pipeline_with_output_format_html() {
913
+ let result = ExtractionResult {
914
+ content: "test content".to_string(),
915
+ mime_type: "text/plain".to_string(),
916
+ metadata: Metadata::default(),
917
+ tables: vec![],
918
+ detected_languages: None,
919
+ chunks: None,
920
+ images: None,
921
+ pages: None,
922
+ djot_content: None,
923
+ elements: None,
924
+ };
925
+
926
+ let config = crate::core::config::ExtractionConfig {
927
+ output_format: OutputFormat::Html,
928
+ ..Default::default()
929
+ };
930
+
931
+ let processed = run_pipeline(result, &config).await.unwrap();
932
+ // For non-djot documents, HTML wraps content in <pre> tags
933
+ assert!(processed.content.contains("<pre>"));
934
+ assert!(processed.content.contains("test content"));
935
+ assert!(processed.content.contains("</pre>"));
936
+ }
937
+
938
+ #[tokio::test]
939
+ async fn test_run_pipeline_applies_output_format_last() {
940
+ // This test verifies that output format is applied after all other processing
941
+ use crate::types::DjotContent;
942
+
943
+ let result = ExtractionResult {
944
+ content: "test".to_string(),
945
+ mime_type: "text/plain".to_string(),
946
+ metadata: Metadata::default(),
947
+ tables: vec![],
948
+ detected_languages: None,
949
+ chunks: None,
950
+ images: None,
951
+ pages: None,
952
+ elements: None,
953
+ djot_content: Some(DjotContent {
954
+ plain_text: "test".to_string(),
955
+ blocks: vec![],
956
+ metadata: Metadata::default(),
957
+ tables: vec![],
958
+ images: vec![],
959
+ links: vec![],
960
+ footnotes: vec![],
961
+ attributes: std::collections::HashMap::new(),
962
+ }),
963
+ };
964
+
965
+ let config = crate::core::config::ExtractionConfig {
966
+ output_format: OutputFormat::Djot,
967
+ // Disable other processing to ensure pipeline runs cleanly
968
+ enable_quality_processing: false,
969
+ ..Default::default()
970
+ };
971
+
972
+ let processed = run_pipeline(result, &config).await.unwrap();
973
+ // The result should have gone through the pipeline successfully
974
+ assert!(processed.djot_content.is_some());
975
+ }