kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -34,7 +34,7 @@ mod html_table_tests {
34
34
  </table>
35
35
  "#;
36
36
 
37
- let result = convert_html_to_markdown(html, None);
37
+ let result = convert_html_to_markdown(html, None, None);
38
38
  assert!(result.is_ok(), "HTML to markdown conversion should succeed");
39
39
 
40
40
  let markdown = result.unwrap();
@@ -76,7 +76,7 @@ mod html_table_tests {
76
76
  </table>
77
77
  "#;
78
78
 
79
- let result = convert_html_to_markdown(html, None);
79
+ let result = convert_html_to_markdown(html, None, None);
80
80
  assert!(result.is_ok(), "Should convert to markdown");
81
81
 
82
82
  let markdown = result.unwrap();
@@ -140,7 +140,7 @@ mod html_table_tests {
140
140
  </table>
141
141
  "#;
142
142
 
143
- let result = convert_html_to_markdown(html, None);
143
+ let result = convert_html_to_markdown(html, None, None);
144
144
  assert!(result.is_ok(), "Should convert complex table");
145
145
 
146
146
  let markdown = result.unwrap();
@@ -191,7 +191,7 @@ mod html_table_tests {
191
191
  </table>
192
192
  "#;
193
193
 
194
- let result = convert_html_to_markdown(html, None);
194
+ let result = convert_html_to_markdown(html, None, None);
195
195
  assert!(result.is_ok(), "Should handle merged cell table");
196
196
 
197
197
  let markdown = result.unwrap();
@@ -245,7 +245,7 @@ mod html_table_tests {
245
245
  </table>
246
246
  "#;
247
247
 
248
- let result = convert_html_to_markdown(html, None);
248
+ let result = convert_html_to_markdown(html, None, None);
249
249
  assert!(result.is_ok(), "Should handle multiple tables");
250
250
 
251
251
  let markdown = result.unwrap();
@@ -300,7 +300,7 @@ mod html_table_tests {
300
300
  </table>
301
301
  "#;
302
302
 
303
- let result = convert_html_to_markdown(html, None);
303
+ let result = convert_html_to_markdown(html, None, None);
304
304
  assert!(result.is_ok(), "Should handle mixed header cells");
305
305
 
306
306
  let markdown = result.unwrap();
@@ -346,7 +346,7 @@ mod html_table_tests {
346
346
  </table>
347
347
  "#;
348
348
 
349
- let result = convert_html_to_markdown(html, None);
349
+ let result = convert_html_to_markdown(html, None, None);
350
350
  assert!(result.is_ok(), "Should handle table with caption");
351
351
 
352
352
  let markdown = result.unwrap();
@@ -382,7 +382,7 @@ mod html_table_tests {
382
382
  fn test_simple_flat_table() {
383
383
  let html = r#"<table><tr><td>A</td><td>B</td></tr><tr><td>C</td><td>D</td></tr></table>"#;
384
384
 
385
- let result = convert_html_to_markdown(html, None);
385
+ let result = convert_html_to_markdown(html, None, None);
386
386
  assert!(result.is_ok(), "Should handle flat table");
387
387
 
388
388
  let markdown = result.unwrap();
@@ -418,7 +418,7 @@ mod html_table_tests {
418
418
  </table>
419
419
  "#;
420
420
 
421
- let result = convert_html_to_markdown(html, None);
421
+ let result = convert_html_to_markdown(html, None, None);
422
422
  assert!(result.is_ok(), "Should handle empty cells");
423
423
 
424
424
  let markdown = result.unwrap();
@@ -456,7 +456,7 @@ mod html_table_tests {
456
456
  </table>
457
457
  "#;
458
458
 
459
- let result = convert_html_to_markdown(html, None);
459
+ let result = convert_html_to_markdown(html, None, None);
460
460
  assert!(result.is_ok(), "Should handle numeric table");
461
461
 
462
462
  let markdown = result.unwrap();
@@ -499,7 +499,7 @@ mod html_table_tests {
499
499
  </table>
500
500
  "#;
501
501
 
502
- let result = convert_html_to_markdown(html, None);
502
+ let result = convert_html_to_markdown(html, None, None);
503
503
  assert!(result.is_ok(), "Should handle unicode characters");
504
504
 
505
505
  let markdown = result.unwrap();
@@ -32,6 +32,7 @@ fn test_ocr_language_english() {
32
32
  backend: "tesseract".to_string(),
33
33
  language: "eng".to_string(),
34
34
  tesseract_config: None,
35
+ output_format: None,
35
36
  }),
36
37
  force_ocr: false,
37
38
  ..Default::default()
@@ -57,6 +58,7 @@ fn test_ocr_language_german() {
57
58
  backend: "tesseract".to_string(),
58
59
  language: "deu".to_string(),
59
60
  tesseract_config: None,
61
+ output_format: None,
60
62
  }),
61
63
  force_ocr: false,
62
64
  ..Default::default()
@@ -95,6 +97,7 @@ fn test_ocr_language_multiple() {
95
97
  backend: "tesseract".to_string(),
96
98
  language: "eng+kor".to_string(),
97
99
  tesseract_config: None,
100
+ output_format: None,
98
101
  }),
99
102
  force_ocr: false,
100
103
  ..Default::default()
@@ -136,6 +139,7 @@ fn test_ocr_psm_auto() {
136
139
  psm: 3,
137
140
  ..Default::default()
138
141
  }),
142
+ output_format: None,
139
143
  }),
140
144
  force_ocr: false,
141
145
  ..Default::default()
@@ -164,6 +168,7 @@ fn test_ocr_psm_single_block() {
164
168
  psm: 6,
165
169
  ..Default::default()
166
170
  }),
171
+ output_format: None,
167
172
  }),
168
173
  force_ocr: false,
169
174
  ..Default::default()
@@ -192,6 +197,7 @@ fn test_ocr_psm_single_line() {
192
197
  psm: 7,
193
198
  ..Default::default()
194
199
  }),
200
+ output_format: None,
195
201
  }),
196
202
  force_ocr: false,
197
203
  ..Default::default()
@@ -218,6 +224,7 @@ fn test_force_ocr_on_text_pdf() {
218
224
  backend: "tesseract".to_string(),
219
225
  language: "eng".to_string(),
220
226
  tesseract_config: None,
227
+ output_format: None,
221
228
  }),
222
229
  force_ocr: true,
223
230
  ..Default::default()
@@ -248,6 +255,7 @@ fn test_force_ocr_disabled() {
248
255
  backend: "tesseract".to_string(),
249
256
  language: "eng".to_string(),
250
257
  tesseract_config: None,
258
+ output_format: None,
251
259
  }),
252
260
  force_ocr: false,
253
261
  ..Default::default()
@@ -283,6 +291,7 @@ fn test_table_detection_enabled() {
283
291
  table_row_threshold_ratio: 0.5,
284
292
  ..Default::default()
285
293
  }),
294
+ output_format: None,
286
295
  }),
287
296
  force_ocr: false,
288
297
  ..Default::default()
@@ -311,6 +320,7 @@ fn test_table_detection_disabled() {
311
320
  enable_table_detection: false,
312
321
  ..Default::default()
313
322
  }),
323
+ output_format: None,
314
324
  }),
315
325
  force_ocr: false,
316
326
  ..Default::default()
@@ -339,6 +349,7 @@ fn test_language_model_ngram_configuration() {
339
349
  language_model_ngram_on: true,
340
350
  ..Default::default()
341
351
  }),
352
+ output_format: None,
342
353
  }),
343
354
  force_ocr: false,
344
355
  ..Default::default()
@@ -368,6 +379,7 @@ fn test_dictionary_correction_enabled() {
368
379
  tessedit_enable_dict_correction: true,
369
380
  ..Default::default()
370
381
  }),
382
+ output_format: None,
371
383
  }),
372
384
  force_ocr: false,
373
385
  ..Default::default()
@@ -397,6 +409,7 @@ fn test_character_whitelist() {
397
409
  tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ".to_string(),
398
410
  ..Default::default()
399
411
  }),
412
+ output_format: None,
400
413
  }),
401
414
  force_ocr: false,
402
415
  ..Default::default()
@@ -425,6 +438,7 @@ fn test_ocr_cache_enabled() {
425
438
  use_cache: true,
426
439
  ..Default::default()
427
440
  }),
441
+ output_format: None,
428
442
  }),
429
443
  force_ocr: false,
430
444
  use_cache: true,
@@ -464,6 +478,7 @@ fn test_ocr_cache_disabled() {
464
478
  use_cache: false,
465
479
  ..Default::default()
466
480
  }),
481
+ output_format: None,
467
482
  }),
468
483
  force_ocr: false,
469
484
  use_cache: false,
@@ -498,6 +513,7 @@ fn test_complex_configuration_combination() {
498
513
  use_cache: true,
499
514
  ..Default::default()
500
515
  }),
516
+ output_format: None,
501
517
  }),
502
518
  force_ocr: false,
503
519
  use_cache: true,
@@ -34,6 +34,7 @@ fn test_ocr_invalid_language_code() {
34
34
  backend: "tesseract".to_string(),
35
35
  language: "invalid_lang_99999".to_string(),
36
36
  tesseract_config: None,
37
+ output_format: None,
37
38
  }),
38
39
  force_ocr: false,
39
40
  ..Default::default()
@@ -74,6 +75,7 @@ fn test_ocr_invalid_psm_mode() {
74
75
  psm: 999,
75
76
  ..Default::default()
76
77
  }),
78
+ output_format: None,
77
79
  }),
78
80
  force_ocr: false,
79
81
  ..Default::default()
@@ -112,6 +114,7 @@ fn test_ocr_invalid_backend_name() {
112
114
  backend: "nonexistent_ocr_backend_xyz".to_string(),
113
115
  language: "eng".to_string(),
114
116
  tesseract_config: None,
117
+ output_format: None,
115
118
  }),
116
119
  force_ocr: false,
117
120
  ..Default::default()
@@ -147,6 +150,7 @@ fn test_ocr_corrupted_image_data() {
147
150
  backend: "tesseract".to_string(),
148
151
  language: "eng".to_string(),
149
152
  tesseract_config: None,
153
+ output_format: None,
150
154
  }),
151
155
  force_ocr: true,
152
156
  ..Default::default()
@@ -177,6 +181,7 @@ fn test_ocr_empty_image() {
177
181
  backend: "tesseract".to_string(),
178
182
  language: "eng".to_string(),
179
183
  tesseract_config: None,
184
+ output_format: None,
180
185
  }),
181
186
  force_ocr: true,
182
187
  ..Default::default()
@@ -207,6 +212,7 @@ fn test_ocr_non_image_data() {
207
212
  backend: "tesseract".to_string(),
208
213
  language: "eng".to_string(),
209
214
  tesseract_config: None,
215
+ output_format: None,
210
216
  }),
211
217
  force_ocr: true,
212
218
  ..Default::default()
@@ -245,6 +251,7 @@ fn test_ocr_extreme_table_threshold() {
245
251
  table_row_threshold_ratio: 10.0,
246
252
  ..Default::default()
247
253
  }),
254
+ output_format: None,
248
255
  }),
249
256
  force_ocr: false,
250
257
  ..Default::default()
@@ -281,6 +288,7 @@ fn test_ocr_negative_psm() {
281
288
  psm: -5,
282
289
  ..Default::default()
283
290
  }),
291
+ output_format: None,
284
292
  }),
285
293
  force_ocr: false,
286
294
  ..Default::default()
@@ -313,6 +321,7 @@ fn test_ocr_empty_whitelist() {
313
321
  tessedit_char_whitelist: "".to_string(),
314
322
  ..Default::default()
315
323
  }),
324
+ output_format: None,
316
325
  }),
317
326
  force_ocr: false,
318
327
  ..Default::default()
@@ -349,6 +358,7 @@ fn test_ocr_conflicting_whitelist_blacklist() {
349
358
  tessedit_char_blacklist: "abc".to_string(),
350
359
  ..Default::default()
351
360
  }),
361
+ output_format: None,
352
362
  }),
353
363
  force_ocr: false,
354
364
  ..Default::default()
@@ -381,6 +391,7 @@ fn test_ocr_empty_language() {
381
391
  backend: "tesseract".to_string(),
382
392
  language: "".to_string(),
383
393
  tesseract_config: None,
394
+ output_format: None,
384
395
  }),
385
396
  force_ocr: false,
386
397
  ..Default::default()
@@ -413,6 +424,7 @@ fn test_ocr_malformed_multi_language() {
413
424
  backend: "tesseract".to_string(),
414
425
  language: "eng++deu++fra".to_string(),
415
426
  tesseract_config: None,
427
+ output_format: None,
416
428
  }),
417
429
  force_ocr: false,
418
430
  ..Default::default()
@@ -446,6 +458,7 @@ fn test_ocr_cache_disabled_then_enabled() {
446
458
  use_cache: false,
447
459
  ..Default::default()
448
460
  }),
461
+ output_format: None,
449
462
  }),
450
463
  force_ocr: false,
451
464
  use_cache: false,
@@ -466,6 +479,7 @@ fn test_ocr_cache_disabled_then_enabled() {
466
479
  use_cache: true,
467
480
  ..Default::default()
468
481
  }),
482
+ output_format: None,
469
483
  }),
470
484
  force_ocr: false,
471
485
  use_cache: true,
@@ -497,6 +511,7 @@ fn test_ocr_concurrent_same_file() {
497
511
  backend: "tesseract".to_string(),
498
512
  language: "eng".to_string(),
499
513
  tesseract_config: None,
514
+ output_format: None,
500
515
  }),
501
516
  force_ocr: false,
502
517
  use_cache: true,
@@ -563,6 +578,7 @@ fn test_ocr_concurrent_different_files() {
563
578
  backend: "tesseract".to_string(),
564
579
  language: "eng".to_string(),
565
580
  tesseract_config: None,
581
+ output_format: None,
566
582
  }),
567
583
  force_ocr: false,
568
584
  use_cache: true,
@@ -632,6 +648,7 @@ fn test_ocr_with_preprocessing_extreme_dpi() {
632
648
  }),
633
649
  ..Default::default()
634
650
  }),
651
+ output_format: None,
635
652
  }),
636
653
  force_ocr: false,
637
654
  ..Default::default()
@@ -677,6 +694,7 @@ fn test_ocr_with_invalid_binarization_method() {
677
694
  }),
678
695
  ..Default::default()
679
696
  }),
697
+ output_format: None,
680
698
  }),
681
699
  force_ocr: false,
682
700
  ..Default::default()
@@ -167,6 +167,7 @@ fn test_ocr_quality_simple_text_high_accuracy() {
167
167
  backend: "tesseract".to_string(),
168
168
  language: "eng".to_string(),
169
169
  tesseract_config: None,
170
+ output_format: None,
170
171
  }),
171
172
  force_ocr: true,
172
173
  ..Default::default()
@@ -241,6 +242,7 @@ fn test_ocr_quality_numeric_accuracy() {
241
242
  backend: "tesseract".to_string(),
242
243
  language: "eng".to_string(),
243
244
  tesseract_config: None,
245
+ output_format: None,
244
246
  }),
245
247
  force_ocr: true,
246
248
  ..Default::default()
@@ -306,6 +308,7 @@ fn test_ocr_quality_layout_preservation() {
306
308
  backend: "tesseract".to_string(),
307
309
  language: "eng".to_string(),
308
310
  tesseract_config: None,
311
+ output_format: None,
309
312
  }),
310
313
  force_ocr: true,
311
314
  ..Default::default()
@@ -365,6 +368,7 @@ fn test_ocr_quality_technical_document() {
365
368
  backend: "tesseract".to_string(),
366
369
  language: "eng".to_string(),
367
370
  tesseract_config: None,
371
+ output_format: None,
368
372
  }),
369
373
  force_ocr: true,
370
374
  ..Default::default()
@@ -409,6 +413,7 @@ fn test_ocr_consistency_across_runs() {
409
413
  backend: "tesseract".to_string(),
410
414
  language: "eng".to_string(),
411
415
  tesseract_config: None,
416
+ output_format: None,
412
417
  }),
413
418
  force_ocr: true,
414
419
  use_cache: false,
@@ -474,6 +479,7 @@ fn test_ocr_consistency_with_different_psm() {
474
479
  psm: 3,
475
480
  ..Default::default()
476
481
  }),
482
+ output_format: None,
477
483
  }),
478
484
  force_ocr: true,
479
485
  ..Default::default()
@@ -487,6 +493,7 @@ fn test_ocr_consistency_with_different_psm() {
487
493
  psm: 6,
488
494
  ..Default::default()
489
495
  }),
496
+ output_format: None,
490
497
  }),
491
498
  force_ocr: true,
492
499
  ..Default::default()
@@ -557,6 +564,7 @@ fn test_ocr_quality_multi_page_consistency() {
557
564
  backend: "tesseract".to_string(),
558
565
  language: "eng".to_string(),
559
566
  tesseract_config: None,
567
+ output_format: None,
560
568
  }),
561
569
  force_ocr: true,
562
570
  ..Default::default()
@@ -608,6 +616,7 @@ fn test_ocr_quality_with_tables() {
608
616
  table_min_confidence: 0.5,
609
617
  ..Default::default()
610
618
  }),
619
+ output_format: None,
611
620
  }),
612
621
  force_ocr: true,
613
622
  ..Default::default()
@@ -198,6 +198,7 @@ fn test_tesseract_api_thread_safety() {
198
198
  backend: "tesseract".to_string(),
199
199
  language: "eng".to_string(),
200
200
  tesseract_config: None,
201
+ output_format: None,
201
202
  }),
202
203
  force_ocr: false,
203
204
  use_cache: false,
@@ -141,6 +141,8 @@ async fn test_pipeline_empty_no_processors() {
141
141
  chunks: None,
142
142
  images: None,
143
143
  pages: None,
144
+ djot_content: None,
145
+ elements: None,
144
146
  };
145
147
  let config = ExtractionConfig::default();
146
148
 
@@ -186,6 +188,8 @@ async fn test_pipeline_single_processor_per_stage() {
186
188
  chunks: None,
187
189
  images: None,
188
190
  pages: None,
191
+ djot_content: None,
192
+ elements: None,
189
193
  };
190
194
  let config = ExtractionConfig::default();
191
195
 
@@ -231,6 +235,8 @@ async fn test_pipeline_multiple_processors_per_stage() {
231
235
  chunks: None,
232
236
  images: None,
233
237
  pages: None,
238
+ djot_content: None,
239
+ elements: None,
234
240
  };
235
241
  let config = ExtractionConfig::default();
236
242
 
@@ -267,6 +273,8 @@ async fn test_pipeline_all_stages_enabled() {
267
273
  chunks: None,
268
274
  images: None,
269
275
  pages: None,
276
+ djot_content: None,
277
+ elements: None,
270
278
  };
271
279
  let config = ExtractionConfig::default();
272
280
 
@@ -301,6 +309,8 @@ async fn test_pipeline_postprocessing_disabled() {
301
309
  chunks: None,
302
310
  images: None,
303
311
  pages: None,
312
+ djot_content: None,
313
+ elements: None,
304
314
  };
305
315
  let config = ExtractionConfig {
306
316
  postprocessor: Some(PostProcessorConfig {
@@ -350,6 +360,8 @@ async fn test_pipeline_early_stage_runs_first() {
350
360
  chunks: None,
351
361
  images: None,
352
362
  pages: None,
363
+ djot_content: None,
364
+ elements: None,
353
365
  };
354
366
  let config = ExtractionConfig::default();
355
367
 
@@ -390,6 +402,8 @@ async fn test_pipeline_middle_stage_runs_second() {
390
402
  chunks: None,
391
403
  images: None,
392
404
  pages: None,
405
+ djot_content: None,
406
+ elements: None,
393
407
  };
394
408
  let config = ExtractionConfig::default();
395
409
 
@@ -426,6 +440,8 @@ async fn test_pipeline_late_stage_runs_last() {
426
440
  chunks: None,
427
441
  images: None,
428
442
  pages: None,
443
+ djot_content: None,
444
+ elements: None,
429
445
  };
430
446
  let config = ExtractionConfig::default();
431
447
 
@@ -462,6 +478,8 @@ async fn test_pipeline_within_stage_priority_order() {
462
478
  chunks: None,
463
479
  images: None,
464
480
  pages: None,
481
+ djot_content: None,
482
+ elements: None,
465
483
  };
466
484
  let config = ExtractionConfig::default();
467
485
 
@@ -527,6 +545,8 @@ async fn test_pipeline_cross_stage_data_flow() {
527
545
  chunks: None,
528
546
  images: None,
529
547
  pages: None,
548
+ djot_content: None,
549
+ elements: None,
530
550
  };
531
551
  let config = ExtractionConfig::default();
532
552
 
@@ -584,6 +604,8 @@ async fn test_pipeline_early_stage_error_recorded() {
584
604
  chunks: None,
585
605
  images: None,
586
606
  pages: None,
607
+ djot_content: None,
608
+ elements: None,
587
609
  };
588
610
  let config = ExtractionConfig::default();
589
611
 
@@ -626,6 +648,8 @@ async fn test_pipeline_middle_stage_error_propagation() {
626
648
  chunks: None,
627
649
  images: None,
628
650
  pages: None,
651
+ djot_content: None,
652
+ elements: None,
629
653
  };
630
654
  let config = ExtractionConfig::default();
631
655
 
@@ -698,6 +722,8 @@ async fn test_pipeline_late_stage_error_doesnt_affect_earlier_stages() {
698
722
  chunks: None,
699
723
  images: None,
700
724
  pages: None,
725
+ djot_content: None,
726
+ elements: None,
701
727
  };
702
728
  let config = ExtractionConfig::default();
703
729
 
@@ -786,6 +812,8 @@ async fn test_pipeline_processor_error_doesnt_stop_other_processors() {
786
812
  chunks: None,
787
813
  images: None,
788
814
  pages: None,
815
+ djot_content: None,
816
+ elements: None,
789
817
  };
790
818
  let config = ExtractionConfig::default();
791
819
 
@@ -864,6 +892,8 @@ async fn test_pipeline_multiple_processor_errors() {
864
892
  chunks: None,
865
893
  images: None,
866
894
  pages: None,
895
+ djot_content: None,
896
+ elements: None,
867
897
  };
868
898
  let config = ExtractionConfig::default();
869
899
 
@@ -906,6 +936,8 @@ async fn test_pipeline_error_context_preservation() {
906
936
  chunks: None,
907
937
  images: None,
908
938
  pages: None,
939
+ djot_content: None,
940
+ elements: None,
909
941
  };
910
942
  let config = ExtractionConfig::default();
911
943
 
@@ -978,6 +1010,8 @@ async fn test_pipeline_metadata_added_in_early_visible_in_middle() {
978
1010
  chunks: None,
979
1011
  images: None,
980
1012
  pages: None,
1013
+ djot_content: None,
1014
+ elements: None,
981
1015
  };
982
1016
  let config = ExtractionConfig::default();
983
1017
 
@@ -1049,6 +1083,8 @@ async fn test_pipeline_content_modified_in_middle_visible_in_late() {
1049
1083
  chunks: None,
1050
1084
  images: None,
1051
1085
  pages: None,
1086
+ djot_content: None,
1087
+ elements: None,
1052
1088
  };
1053
1089
  let config = ExtractionConfig::default();
1054
1090
 
@@ -1118,6 +1154,8 @@ async fn test_pipeline_multiple_processors_modifying_same_metadata() {
1118
1154
  chunks: None,
1119
1155
  images: None,
1120
1156
  pages: None,
1157
+ djot_content: None,
1158
+ elements: None,
1121
1159
  };
1122
1160
  let config = ExtractionConfig::default();
1123
1161
 
@@ -1206,6 +1244,8 @@ async fn test_pipeline_processors_reading_previous_output() {
1206
1244
  chunks: None,
1207
1245
  images: None,
1208
1246
  pages: None,
1247
+ djot_content: None,
1248
+ elements: None,
1209
1249
  };
1210
1250
  let config = ExtractionConfig::default();
1211
1251
 
@@ -1261,6 +1301,8 @@ async fn test_pipeline_large_content_modification() {
1261
1301
  chunks: None,
1262
1302
  images: None,
1263
1303
  pages: None,
1304
+ djot_content: None,
1305
+ elements: None,
1264
1306
  };
1265
1307
  let config = ExtractionConfig::default();
1266
1308
 
@@ -1297,6 +1339,8 @@ async fn test_pipeline_enabled_processors_whitelist() {
1297
1339
  chunks: None,
1298
1340
  images: None,
1299
1341
  pages: None,
1342
+ djot_content: None,
1343
+ elements: None,
1300
1344
  };
1301
1345
  let config = ExtractionConfig {
1302
1346
  postprocessor: Some(PostProcessorConfig {
@@ -1344,6 +1388,8 @@ async fn test_pipeline_disabled_processors_blacklist() {
1344
1388
  chunks: None,
1345
1389
  images: None,
1346
1390
  pages: None,
1391
+ djot_content: None,
1392
+ elements: None,
1347
1393
  };
1348
1394
  let config = ExtractionConfig {
1349
1395
  postprocessor: Some(PostProcessorConfig {
@@ -1391,6 +1437,8 @@ async fn test_pipeline_no_filtering_runs_all() {
1391
1437
  chunks: None,
1392
1438
  images: None,
1393
1439
  pages: None,
1440
+ djot_content: None,
1441
+ elements: None,
1394
1442
  };
1395
1443
  let config = ExtractionConfig::default();
1396
1444
 
@@ -1429,6 +1477,8 @@ async fn test_pipeline_empty_whitelist_runs_none() {
1429
1477
  chunks: None,
1430
1478
  images: None,
1431
1479
  pages: None,
1480
+ djot_content: None,
1481
+ elements: None,
1432
1482
  };
1433
1483
  let config = ExtractionConfig {
1434
1484
  postprocessor: Some(PostProcessorConfig {