kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,975 @@
1
+ //! Pipeline orchestration tests.
2
+
3
+ use super::*;
4
+ use crate::core::config::OutputFormat;
5
+ use crate::types::Metadata;
6
+ use lazy_static::lazy_static;
7
+
8
+ const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
9
+ #[cfg(feature = "quality")]
10
+ const QUALITY_VALIDATION_MARKER: &str = "quality_validation_test";
11
+ const POSTPROCESSOR_VALIDATION_MARKER: &str = "postprocessor_validation_test";
12
+ const ORDER_VALIDATION_MARKER: &str = "order_validation_test";
13
+
14
+ lazy_static! {
15
+ static ref REGISTRY_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
16
+ }
17
+
18
+ #[tokio::test]
19
+ async fn test_run_pipeline_basic() {
20
+ let mut result = ExtractionResult {
21
+ content: "test".to_string(),
22
+ mime_type: "text/plain".to_string(),
23
+ metadata: Metadata::default(),
24
+ tables: vec![],
25
+ detected_languages: None,
26
+ chunks: None,
27
+ images: None,
28
+ djot_content: None,
29
+ pages: None,
30
+ elements: None,
31
+ };
32
+ result.metadata.additional.insert(
33
+ VALIDATION_MARKER_KEY.to_string(),
34
+ serde_json::json!(ORDER_VALIDATION_MARKER),
35
+ );
36
+ let config = ExtractionConfig::default();
37
+
38
+ let processed = run_pipeline(result, &config).await.unwrap();
39
+ assert_eq!(processed.content, "test");
40
+ }
41
+
42
/// With `enable_quality_processing` switched on, the pipeline output must carry
/// a `quality_score` entry in its additional metadata.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_pipeline_with_quality_processing() {
    let quality_config = ExtractionConfig {
        enable_quality_processing: true,
        ..ExtractionConfig::default()
    };
    let input = ExtractionResult {
        content: String::from("This is a test document with some meaningful content."),
        mime_type: String::from("text/plain"),
        metadata: Metadata::default(),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let output = run_pipeline(input, &quality_config).await.unwrap();

    let has_score = output.metadata.additional.contains_key("quality_score");
    assert!(has_score);
}
65
+
66
+ #[tokio::test]
67
+ async fn test_pipeline_without_quality_processing() {
68
+ let result = ExtractionResult {
69
+ content: "test".to_string(),
70
+ mime_type: "text/plain".to_string(),
71
+ metadata: Metadata::default(),
72
+ tables: vec![],
73
+ detected_languages: None,
74
+ chunks: None,
75
+ images: None,
76
+ djot_content: None,
77
+ pages: None,
78
+ elements: None,
79
+ };
80
+ let config = ExtractionConfig {
81
+ enable_quality_processing: false,
82
+ ..Default::default()
83
+ };
84
+
85
+ let processed = run_pipeline(result, &config).await.unwrap();
86
+ assert!(!processed.metadata.additional.contains_key("quality_score"));
87
+ }
88
+
89
/// With a chunking configuration of 500 characters and 50 characters overlap,
/// a long repeated text must produce a `chunk_count` metadata entry greater
/// than one.
#[tokio::test]
#[cfg(feature = "chunking")]
async fn test_pipeline_with_chunking() {
    let chunking = crate::ChunkingConfig {
        max_chars: 500,
        max_overlap: 50,
        embedding: None,
        preset: None,
    };
    let chunking_config = ExtractionConfig {
        chunking: Some(chunking),
        ..ExtractionConfig::default()
    };
    let input = ExtractionResult {
        content: "This is a long text that should be chunked. ".repeat(100),
        mime_type: String::from("text/plain"),
        metadata: Metadata::default(),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let output = run_pipeline(input, &chunking_config).await.unwrap();
    let extras = &output.metadata.additional;

    assert!(extras.contains_key("chunk_count"));
    let reported_chunks = extras.get("chunk_count").unwrap().as_u64().unwrap();
    assert!(reported_chunks > 1);
}
119
+
120
+ #[tokio::test]
121
+ async fn test_pipeline_without_chunking() {
122
+ let result = ExtractionResult {
123
+ content: "test".to_string(),
124
+ mime_type: "text/plain".to_string(),
125
+ metadata: Metadata::default(),
126
+ tables: vec![],
127
+ detected_languages: None,
128
+ chunks: None,
129
+ images: None,
130
+ djot_content: None,
131
+ pages: None,
132
+ elements: None,
133
+ };
134
+ let config = ExtractionConfig {
135
+ chunking: None,
136
+ ..Default::default()
137
+ };
138
+
139
+ let processed = run_pipeline(result, &config).await.unwrap();
140
+ assert!(!processed.metadata.additional.contains_key("chunk_count"));
141
+ }
142
+
143
+ #[tokio::test]
144
+ async fn test_pipeline_preserves_metadata() {
145
+ use std::collections::HashMap;
146
+ let mut additional = HashMap::new();
147
+ additional.insert("source".to_string(), serde_json::json!("test"));
148
+ additional.insert("page".to_string(), serde_json::json!(1));
149
+
150
+ let result = ExtractionResult {
151
+ content: "test".to_string(),
152
+ mime_type: "text/plain".to_string(),
153
+ metadata: Metadata {
154
+ additional,
155
+ ..Default::default()
156
+ },
157
+ pages: None,
158
+ tables: vec![],
159
+ detected_languages: None,
160
+ chunks: None,
161
+ images: None,
162
+ djot_content: None,
163
+ elements: None,
164
+ };
165
+ let config = ExtractionConfig::default();
166
+
167
+ let processed = run_pipeline(result, &config).await.unwrap();
168
+ assert_eq!(
169
+ processed.metadata.additional.get("source").unwrap(),
170
+ &serde_json::json!("test")
171
+ );
172
+ assert_eq!(
173
+ processed.metadata.additional.get("page").unwrap(),
174
+ &serde_json::json!(1)
175
+ );
176
+ }
177
+
178
+ #[tokio::test]
179
+ async fn test_pipeline_preserves_tables() {
180
+ use crate::types::Table;
181
+
182
+ let table = Table {
183
+ cells: vec![vec!["A".to_string(), "B".to_string()]],
184
+ markdown: "| A | B |".to_string(),
185
+ page_number: 0,
186
+ };
187
+
188
+ let result = ExtractionResult {
189
+ content: "test".to_string(),
190
+ mime_type: "text/plain".to_string(),
191
+ metadata: Metadata::default(),
192
+ tables: vec![table],
193
+ detected_languages: None,
194
+ chunks: None,
195
+ images: None,
196
+ djot_content: None,
197
+ pages: None,
198
+ elements: None,
199
+ };
200
+ let config = ExtractionConfig::default();
201
+
202
+ let processed = run_pipeline(result, &config).await.unwrap();
203
+ assert_eq!(processed.tables.len(), 1);
204
+ assert_eq!(processed.tables[0].cells.len(), 1);
205
+ }
206
+
207
+ #[tokio::test]
208
+ async fn test_pipeline_empty_content() {
209
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
210
+
211
+ {
212
+ let registry = crate::plugins::registry::get_post_processor_registry();
213
+ registry.write().unwrap().shutdown_all().unwrap();
214
+ }
215
+ {
216
+ let registry = crate::plugins::registry::get_validator_registry();
217
+ registry.write().unwrap().shutdown_all().unwrap();
218
+ }
219
+
220
+ let result = ExtractionResult {
221
+ content: String::new(),
222
+ mime_type: "text/plain".to_string(),
223
+ metadata: Metadata::default(),
224
+ tables: vec![],
225
+ detected_languages: None,
226
+ chunks: None,
227
+ images: None,
228
+ djot_content: None,
229
+ pages: None,
230
+ elements: None,
231
+ };
232
+ let config = ExtractionConfig::default();
233
+
234
+ drop(_guard);
235
+
236
+ let processed = run_pipeline(result, &config).await.unwrap();
237
+ assert_eq!(processed.content, "");
238
+ }
239
+
240
/// With quality processing and chunking both enabled, the pipeline output must
/// carry both a `quality_score` and a `chunk_count` metadata entry.
///
/// The test asserts on `quality_score`, so it is gated on the `quality` feature
/// as well as `chunking`. This matches `test_pipeline_with_quality_processing`,
/// which gates the same assertion on `feature = "quality"`.
#[tokio::test]
#[cfg(all(feature = "chunking", feature = "quality"))]
async fn test_pipeline_with_all_features() {
    let result = ExtractionResult {
        content: "This is a comprehensive test document. ".repeat(50),
        mime_type: "text/plain".to_string(),
        metadata: Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };
    let config = ExtractionConfig {
        enable_quality_processing: true,
        chunking: Some(crate::ChunkingConfig {
            max_chars: 500,
            max_overlap: 50,
            embedding: None,
            preset: None,
        }),
        ..Default::default()
    };

    let processed = run_pipeline(result, &config).await.unwrap();
    assert!(processed.metadata.additional.contains_key("quality_score"));
    assert!(processed.metadata.additional.contains_key("chunk_count"));
}
270
+
271
/// With a keyword configuration set, the pipeline output must carry a non-empty
/// `keywords` array whose first entry is an object with `text`, `score` and
/// `algorithm` fields.
///
/// The validator and post-processor registries are reset and the keyword
/// processor is registered before the run. `REGISTRY_TEST_GUARD` is held for
/// the whole test.
#[tokio::test]
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
#[ignore = "Requires test isolation - run with --test-threads=1 or individually with --include-ignored"]
// The std mutex guard is deliberately held across the `.await` so the registry
// state set up below stays in place while the pipeline runs.
#[allow(clippy::await_holding_lock)]
async fn test_pipeline_with_keyword_extraction() {
    // The guard protects no data (`Mutex<()>`), so a poisoned lock is safe to
    // recover. This test holds the guard across its assertions, so without
    // recovery a failed assertion would poison the lock for every other test.
    let _guard = REGISTRY_TEST_GUARD
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    crate::plugins::registry::get_validator_registry()
        .write()
        .unwrap()
        .shutdown_all()
        .unwrap();
    crate::plugins::registry::get_post_processor_registry()
        .write()
        .unwrap()
        .shutdown_all()
        .unwrap();

    // NOTE(review): the registration result is discarded on purpose in the
    // original; presumably repeated registration may report an error — confirm.
    let _ = crate::keywords::register_keyword_processor();

    let result = ExtractionResult {
        content: r#"
Machine learning is a branch of artificial intelligence that focuses on
building systems that can learn from data. Deep learning is a subset of
machine learning that uses neural networks with multiple layers.
Natural language processing enables computers to understand human language.
"#
        .to_string(),
        mime_type: "text/plain".to_string(),
        metadata: Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    // YAKE is used when its feature is enabled; RAKE is used only when it is
    // the sole keyword feature.
    #[cfg(feature = "keywords-yake")]
    let keyword_config = crate::keywords::KeywordConfig::yake();

    #[cfg(all(feature = "keywords-rake", not(feature = "keywords-yake")))]
    let keyword_config = crate::keywords::KeywordConfig::rake();

    let config = ExtractionConfig {
        keywords: Some(keyword_config),
        ..Default::default()
    };

    let processed = run_pipeline(result, &config).await.unwrap();

    assert!(processed.metadata.additional.contains_key("keywords"));

    let keywords_value = processed.metadata.additional.get("keywords").unwrap();
    assert!(keywords_value.is_array());

    let keywords = keywords_value.as_array().unwrap();
    assert!(!keywords.is_empty(), "Should have extracted keywords");

    let first_keyword = &keywords[0];
    assert!(first_keyword.is_object());
    assert!(first_keyword.get("text").is_some());
    assert!(first_keyword.get("score").is_some());
    assert!(first_keyword.get("algorithm").is_some());
}
336
+
337
/// With `keywords` set to `None`, the pipeline must not add a `keywords` entry
/// to the additional metadata.
///
/// `REGISTRY_TEST_GUARD` is acquired and released immediately before the run;
/// it is not held while the pipeline executes.
#[tokio::test]
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
async fn test_pipeline_without_keyword_config() {
    // Take the guard and let it go straight away.
    drop(REGISTRY_TEST_GUARD.lock().unwrap());

    let no_keywords_config = ExtractionConfig {
        keywords: None,
        ..ExtractionConfig::default()
    };
    let input = ExtractionResult {
        content: String::from("Machine learning and artificial intelligence."),
        mime_type: String::from("text/plain"),
        metadata: Metadata::default(),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let output = run_pipeline(input, &no_keywords_config).await.unwrap();

    let has_keywords = output.metadata.additional.contains_key("keywords");
    assert!(!has_keywords);
}
365
+
366
/// With a keyword configuration set but only a two-word input, the pipeline
/// output must not contain a `keywords` metadata entry.
///
/// The validator and post-processor registries are shut down first, under
/// `REGISTRY_TEST_GUARD`. The guard is released before the pipeline runs.
// NOTE(review): this test does not register the keyword processor after the
// reset, so the assertion may hold only because no processor is registered —
// confirm that is the intended coverage.
#[tokio::test]
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
async fn test_pipeline_keyword_extraction_short_content() {
    {
        // The guard protects no data (`Mutex<()>`), so a poisoned lock is safe to
        // recover. Without this, one panicking test would fail every other test
        // that takes the guard.
        let _registry_lock = REGISTRY_TEST_GUARD
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());

        crate::plugins::registry::get_validator_registry()
            .write()
            .unwrap()
            .shutdown_all()
            .unwrap();
        crate::plugins::registry::get_post_processor_registry()
            .write()
            .unwrap()
            .shutdown_all()
            .unwrap();
    } // The guard is released here, before the `.await` below.

    let result = ExtractionResult {
        content: "Short text".to_string(),
        mime_type: "text/plain".to_string(),
        metadata: Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    // YAKE is used when its feature is enabled; RAKE is used only when it is
    // the sole keyword feature.
    #[cfg(feature = "keywords-yake")]
    let keyword_config = crate::keywords::KeywordConfig::yake();

    #[cfg(all(feature = "keywords-rake", not(feature = "keywords-yake")))]
    let keyword_config = crate::keywords::KeywordConfig::rake();

    let config = ExtractionConfig {
        keywords: Some(keyword_config),
        ..Default::default()
    };

    let processed = run_pipeline(result, &config).await.unwrap();

    assert!(!processed.metadata.additional.contains_key("keywords"));
}
411
+
412
+ #[tokio::test]
413
+ async fn test_postprocessor_runs_before_validator() {
414
+ use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
415
+ use async_trait::async_trait;
416
+ use std::sync::Arc;
417
+
418
+ struct TestPostProcessor;
419
+ impl Plugin for TestPostProcessor {
420
+ fn name(&self) -> &str {
421
+ "test-processor"
422
+ }
423
+ fn version(&self) -> String {
424
+ "1.0.0".to_string()
425
+ }
426
+ fn initialize(&self) -> Result<()> {
427
+ Ok(())
428
+ }
429
+ fn shutdown(&self) -> Result<()> {
430
+ Ok(())
431
+ }
432
+ }
433
+
434
+ #[async_trait]
435
+ impl PostProcessor for TestPostProcessor {
436
+ async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
437
+ result
438
+ .metadata
439
+ .additional
440
+ .insert("processed".to_string(), serde_json::json!(true));
441
+ Ok(())
442
+ }
443
+
444
+ fn processing_stage(&self) -> ProcessingStage {
445
+ ProcessingStage::Middle
446
+ }
447
+ }
448
+
449
+ struct TestValidator;
450
+ impl Plugin for TestValidator {
451
+ fn name(&self) -> &str {
452
+ "test-validator"
453
+ }
454
+ fn version(&self) -> String {
455
+ "1.0.0".to_string()
456
+ }
457
+ fn initialize(&self) -> Result<()> {
458
+ Ok(())
459
+ }
460
+ fn shutdown(&self) -> Result<()> {
461
+ Ok(())
462
+ }
463
+ }
464
+
465
+ #[async_trait]
466
+ impl Validator for TestValidator {
467
+ async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
468
+ let should_validate = result
469
+ .metadata
470
+ .additional
471
+ .get(VALIDATION_MARKER_KEY)
472
+ .and_then(|v| v.as_str())
473
+ == Some(POSTPROCESSOR_VALIDATION_MARKER);
474
+
475
+ if !should_validate {
476
+ return Ok(());
477
+ }
478
+
479
+ let processed = result
480
+ .metadata
481
+ .additional
482
+ .get("processed")
483
+ .and_then(|v| v.as_bool())
484
+ .unwrap_or(false);
485
+
486
+ if !processed {
487
+ return Err(crate::KreuzbergError::Validation {
488
+ message: "Post-processor did not run before validator".to_string(),
489
+ source: None,
490
+ });
491
+ }
492
+ Ok(())
493
+ }
494
+ }
495
+
496
+ let pp_registry = crate::plugins::registry::get_post_processor_registry();
497
+ let val_registry = crate::plugins::registry::get_validator_registry();
498
+
499
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
500
+ clear_processor_cache().unwrap();
501
+ pp_registry.write().unwrap().shutdown_all().unwrap();
502
+ val_registry.write().unwrap().shutdown_all().unwrap();
503
+ clear_processor_cache().unwrap();
504
+
505
+ {
506
+ let mut registry = pp_registry.write().unwrap();
507
+ registry.register(Arc::new(TestPostProcessor), 0).unwrap();
508
+ }
509
+
510
+ {
511
+ let mut registry = val_registry.write().unwrap();
512
+ registry.register(Arc::new(TestValidator)).unwrap();
513
+ }
514
+
515
+ // Clear the cache after registering new processors so it rebuilds with the test processors
516
+ clear_processor_cache().unwrap();
517
+
518
+ let mut result = ExtractionResult {
519
+ content: "test".to_string(),
520
+ mime_type: "text/plain".to_string(),
521
+ metadata: Metadata::default(),
522
+ tables: vec![],
523
+ detected_languages: None,
524
+ chunks: None,
525
+ images: None,
526
+ djot_content: None,
527
+ pages: None,
528
+ elements: None,
529
+ };
530
+ result.metadata.additional.insert(
531
+ VALIDATION_MARKER_KEY.to_string(),
532
+ serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
533
+ );
534
+
535
+ let config = ExtractionConfig {
536
+ postprocessor: Some(crate::core::config::PostProcessorConfig {
537
+ enabled: true,
538
+ enabled_set: None,
539
+ disabled_set: None,
540
+ enabled_processors: None,
541
+ disabled_processors: None,
542
+ }),
543
+ ..Default::default()
544
+ };
545
+ drop(_guard);
546
+
547
+ let processed = run_pipeline(result, &config).await;
548
+
549
+ pp_registry.write().unwrap().shutdown_all().unwrap();
550
+ val_registry.write().unwrap().shutdown_all().unwrap();
551
+
552
+ assert!(processed.is_ok(), "Validator should have seen post-processor metadata");
553
+ let processed = processed.unwrap();
554
+ assert_eq!(
555
+ processed.metadata.additional.get("processed"),
556
+ Some(&serde_json::json!(true)),
557
+ "Post-processor metadata should be present"
558
+ );
559
+ }
560
+
561
/// Verifies that, with `enable_quality_processing: true`, a `"quality_score"`
/// entry is already present in `metadata.additional` when validators run.
///
/// `QualityValidator` only checks results tagged with
/// `QUALITY_VALIDATION_MARKER` under `VALIDATION_MARKER_KEY`; any other result
/// is accepted unconditionally.
#[tokio::test]
#[cfg(feature = "quality")]
async fn test_quality_processing_runs_before_validator() {
    use crate::plugins::{Plugin, Validator};
    use async_trait::async_trait;
    use std::sync::Arc;

    // Validator that rejects marked results lacking a `"quality_score"` key.
    struct QualityValidator;

    impl Plugin for QualityValidator {
        fn name(&self) -> &str {
            "quality-validator"
        }
        fn version(&self) -> String {
            "1.0.0".to_string()
        }
        fn initialize(&self) -> Result<()> {
            Ok(())
        }
        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for QualityValidator {
        async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            let additional = &result.metadata.additional;

            // Ignore results that were not tagged by this test.
            let marker = additional.get(VALIDATION_MARKER_KEY).and_then(|v| v.as_str());
            if marker != Some(QUALITY_VALIDATION_MARKER) {
                return Ok(());
            }

            if additional.contains_key("quality_score") {
                Ok(())
            } else {
                Err(crate::KreuzbergError::Validation {
                    message: "Quality processing did not run before validator".to_string(),
                    source: None,
                })
            }
        }
    }

    let registry_lock = REGISTRY_TEST_GUARD.lock().unwrap();

    let validators = crate::plugins::registry::get_validator_registry();
    validators
        .write()
        .unwrap()
        .register(Arc::new(QualityValidator))
        .unwrap();

    let mut input = ExtractionResult {
        content: "This is meaningful test content for quality scoring.".to_string(),
        mime_type: "text/plain".to_string(),
        metadata: Metadata::default(),
        tables: vec![],
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };
    // Tag the input so `QualityValidator` actually performs its check.
    input.metadata.additional.insert(
        VALIDATION_MARKER_KEY.to_string(),
        serde_json::json!(QUALITY_VALIDATION_MARKER),
    );

    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };

    // The guard is released before the pipeline is awaited.
    drop(registry_lock);

    let outcome = run_pipeline(input, &config).await;

    // Unregister the validator before asserting, so this also runs on failure.
    validators.write().unwrap().remove("quality-validator").unwrap();

    assert!(outcome.is_ok(), "Validator should have seen quality_score");
}
648
+
649
+ #[tokio::test]
650
+ async fn test_multiple_postprocessors_run_before_validator() {
651
+ use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
652
+ use async_trait::async_trait;
653
+ use std::sync::Arc;
654
+
655
+ struct EarlyProcessor;
656
+ impl Plugin for EarlyProcessor {
657
+ fn name(&self) -> &str {
658
+ "early-proc"
659
+ }
660
+ fn version(&self) -> String {
661
+ "1.0.0".to_string()
662
+ }
663
+ fn initialize(&self) -> Result<()> {
664
+ Ok(())
665
+ }
666
+ fn shutdown(&self) -> Result<()> {
667
+ Ok(())
668
+ }
669
+ }
670
+
671
+ #[async_trait]
672
+ impl PostProcessor for EarlyProcessor {
673
+ async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
674
+ let mut order = result
675
+ .metadata
676
+ .additional
677
+ .get("execution_order")
678
+ .and_then(|v| v.as_array())
679
+ .cloned()
680
+ .unwrap_or_default();
681
+ order.push(serde_json::json!("early"));
682
+ result
683
+ .metadata
684
+ .additional
685
+ .insert("execution_order".to_string(), serde_json::json!(order));
686
+ Ok(())
687
+ }
688
+
689
+ fn processing_stage(&self) -> ProcessingStage {
690
+ ProcessingStage::Early
691
+ }
692
+ }
693
+
694
+ struct LateProcessor;
695
+ impl Plugin for LateProcessor {
696
+ fn name(&self) -> &str {
697
+ "late-proc"
698
+ }
699
+ fn version(&self) -> String {
700
+ "1.0.0".to_string()
701
+ }
702
+ fn initialize(&self) -> Result<()> {
703
+ Ok(())
704
+ }
705
+ fn shutdown(&self) -> Result<()> {
706
+ Ok(())
707
+ }
708
+ }
709
+
710
+ #[async_trait]
711
+ impl PostProcessor for LateProcessor {
712
+ async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
713
+ let mut order = result
714
+ .metadata
715
+ .additional
716
+ .get("execution_order")
717
+ .and_then(|v| v.as_array())
718
+ .cloned()
719
+ .unwrap_or_default();
720
+ order.push(serde_json::json!("late"));
721
+ result
722
+ .metadata
723
+ .additional
724
+ .insert("execution_order".to_string(), serde_json::json!(order));
725
+ Ok(())
726
+ }
727
+
728
+ fn processing_stage(&self) -> ProcessingStage {
729
+ ProcessingStage::Late
730
+ }
731
+ }
732
+
733
+ struct OrderValidator;
734
+ impl Plugin for OrderValidator {
735
+ fn name(&self) -> &str {
736
+ "order-validator"
737
+ }
738
+ fn version(&self) -> String {
739
+ "1.0.0".to_string()
740
+ }
741
+ fn initialize(&self) -> Result<()> {
742
+ Ok(())
743
+ }
744
+ fn shutdown(&self) -> Result<()> {
745
+ Ok(())
746
+ }
747
+ }
748
+
749
+ #[async_trait]
750
+ impl Validator for OrderValidator {
751
+ async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
752
+ let should_validate = result
753
+ .metadata
754
+ .additional
755
+ .get(VALIDATION_MARKER_KEY)
756
+ .and_then(|v| v.as_str())
757
+ == Some(ORDER_VALIDATION_MARKER);
758
+
759
+ if !should_validate {
760
+ return Ok(());
761
+ }
762
+
763
+ let order = result
764
+ .metadata
765
+ .additional
766
+ .get("execution_order")
767
+ .and_then(|v| v.as_array())
768
+ .ok_or_else(|| crate::KreuzbergError::Validation {
769
+ message: "No execution order found".to_string(),
770
+ source: None,
771
+ })?;
772
+
773
+ if order.len() != 2 {
774
+ return Err(crate::KreuzbergError::Validation {
775
+ message: format!("Expected 2 processors to run, got {}", order.len()),
776
+ source: None,
777
+ });
778
+ }
779
+
780
+ if order[0] != "early" || order[1] != "late" {
781
+ return Err(crate::KreuzbergError::Validation {
782
+ message: format!("Wrong execution order: {:?}", order),
783
+ source: None,
784
+ });
785
+ }
786
+
787
+ Ok(())
788
+ }
789
+ }
790
+
791
+ let pp_registry = crate::plugins::registry::get_post_processor_registry();
792
+ let val_registry = crate::plugins::registry::get_validator_registry();
793
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
794
+
795
+ pp_registry.write().unwrap().shutdown_all().unwrap();
796
+ val_registry.write().unwrap().shutdown_all().unwrap();
797
+ clear_processor_cache().unwrap();
798
+
799
+ {
800
+ let mut registry = pp_registry.write().unwrap();
801
+ registry.register(Arc::new(EarlyProcessor), 0).unwrap();
802
+ registry.register(Arc::new(LateProcessor), 0).unwrap();
803
+ }
804
+
805
+ {
806
+ let mut registry = val_registry.write().unwrap();
807
+ registry.register(Arc::new(OrderValidator)).unwrap();
808
+ }
809
+
810
+ // Clear the cache after registering new processors so it rebuilds with the test processors
811
+ clear_processor_cache().unwrap();
812
+
813
+ let result = ExtractionResult {
814
+ content: "test".to_string(),
815
+ mime_type: "text/plain".to_string(),
816
+ metadata: Metadata::default(),
817
+ tables: vec![],
818
+ detected_languages: None,
819
+ chunks: None,
820
+ images: None,
821
+ djot_content: None,
822
+ pages: None,
823
+ elements: None,
824
+ };
825
+
826
+ let config = ExtractionConfig::default();
827
+ drop(_guard);
828
+
829
+ let processed = run_pipeline(result, &config).await;
830
+
831
+ pp_registry.write().unwrap().shutdown_all().unwrap();
832
+ val_registry.write().unwrap().shutdown_all().unwrap();
833
+ clear_processor_cache().unwrap();
834
+
835
+ assert!(processed.is_ok(), "All processors should run before validator");
836
+ }
837
+
838
+ #[tokio::test]
839
+ async fn test_run_pipeline_with_output_format_plain() {
840
+ let result = ExtractionResult {
841
+ content: "test content".to_string(),
842
+ mime_type: "text/plain".to_string(),
843
+ metadata: Metadata::default(),
844
+ tables: vec![],
845
+ detected_languages: None,
846
+ chunks: None,
847
+ images: None,
848
+ pages: None,
849
+ djot_content: None,
850
+ elements: None,
851
+ };
852
+
853
+ let config = crate::core::config::ExtractionConfig {
854
+ output_format: OutputFormat::Plain,
855
+ ..Default::default()
856
+ };
857
+
858
+ let processed = run_pipeline(result, &config).await.unwrap();
859
+ assert_eq!(processed.content, "test content");
860
+ }
861
+
862
+ #[tokio::test]
863
+ async fn test_run_pipeline_with_output_format_djot() {
864
+ use crate::types::{BlockType, DjotContent, FormattedBlock, InlineElement, InlineType};
865
+
866
+ let result = ExtractionResult {
867
+ content: "test content".to_string(),
868
+ mime_type: "text/djot".to_string(),
869
+ metadata: Metadata::default(),
870
+ tables: vec![],
871
+ detected_languages: None,
872
+ chunks: None,
873
+ images: None,
874
+ pages: None,
875
+ elements: None,
876
+ djot_content: Some(DjotContent {
877
+ plain_text: "test content".to_string(),
878
+ blocks: vec![FormattedBlock {
879
+ block_type: BlockType::Paragraph,
880
+ level: None,
881
+ inline_content: vec![InlineElement {
882
+ element_type: InlineType::Text,
883
+ content: "test content".to_string(),
884
+ attributes: None,
885
+ metadata: None,
886
+ }],
887
+ attributes: None,
888
+ language: None,
889
+ code: None,
890
+ children: vec![],
891
+ }],
892
+ metadata: Metadata::default(),
893
+ tables: vec![],
894
+ images: vec![],
895
+ links: vec![],
896
+ footnotes: vec![],
897
+ attributes: std::collections::HashMap::new(),
898
+ }),
899
+ };
900
+
901
+ let config = crate::core::config::ExtractionConfig {
902
+ output_format: OutputFormat::Djot,
903
+ ..Default::default()
904
+ };
905
+
906
+ let processed = run_pipeline(result, &config).await.unwrap();
907
+ // The content should still be present
908
+ assert!(!processed.content.is_empty());
909
+ }
910
+
911
+ #[tokio::test]
912
+ async fn test_run_pipeline_with_output_format_html() {
913
+ let result = ExtractionResult {
914
+ content: "test content".to_string(),
915
+ mime_type: "text/plain".to_string(),
916
+ metadata: Metadata::default(),
917
+ tables: vec![],
918
+ detected_languages: None,
919
+ chunks: None,
920
+ images: None,
921
+ pages: None,
922
+ djot_content: None,
923
+ elements: None,
924
+ };
925
+
926
+ let config = crate::core::config::ExtractionConfig {
927
+ output_format: OutputFormat::Html,
928
+ ..Default::default()
929
+ };
930
+
931
+ let processed = run_pipeline(result, &config).await.unwrap();
932
+ // For non-djot documents, HTML wraps content in <pre> tags
933
+ assert!(processed.content.contains("<pre>"));
934
+ assert!(processed.content.contains("test content"));
935
+ assert!(processed.content.contains("</pre>"));
936
+ }
937
+
938
+ #[tokio::test]
939
+ async fn test_run_pipeline_applies_output_format_last() {
940
+ // This test verifies that output format is applied after all other processing
941
+ use crate::types::DjotContent;
942
+
943
+ let result = ExtractionResult {
944
+ content: "test".to_string(),
945
+ mime_type: "text/plain".to_string(),
946
+ metadata: Metadata::default(),
947
+ tables: vec![],
948
+ detected_languages: None,
949
+ chunks: None,
950
+ images: None,
951
+ pages: None,
952
+ elements: None,
953
+ djot_content: Some(DjotContent {
954
+ plain_text: "test".to_string(),
955
+ blocks: vec![],
956
+ metadata: Metadata::default(),
957
+ tables: vec![],
958
+ images: vec![],
959
+ links: vec![],
960
+ footnotes: vec![],
961
+ attributes: std::collections::HashMap::new(),
962
+ }),
963
+ };
964
+
965
+ let config = crate::core::config::ExtractionConfig {
966
+ output_format: OutputFormat::Djot,
967
+ // Disable other processing to ensure pipeline runs cleanly
968
+ enable_quality_processing: false,
969
+ ..Default::default()
970
+ };
971
+
972
+ let processed = run_pipeline(result, &config).await.unwrap();
973
+ // The result should have gone through the pipeline successfully
974
+ assert!(processed.djot_content.is_some());
975
+ }