kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,86 @@
1
+ //! EPUB ZIP archive and XML parsing utilities.
2
+ //!
3
+ //! Provides low-level parsing functionality for EPUB container structure,
4
+ //! including ZIP archive operations and container.xml parsing.
5
+
6
+ use crate::Result;
7
+ use roxmltree;
8
+ use std::io::Cursor;
9
+ use zip::ZipArchive;
10
+
11
+ /// Parse container.xml to find the OPF file path
12
+ pub(super) fn parse_container_xml(xml: &str) -> Result<String> {
13
+ match roxmltree::Document::parse(xml) {
14
+ Ok(doc) => {
15
+ for node in doc.descendants() {
16
+ if node.tag_name().name() == "rootfile"
17
+ && let Some(full_path) = node.attribute("full-path")
18
+ {
19
+ return Ok(full_path.to_string());
20
+ }
21
+ }
22
+ Err(crate::KreuzbergError::Parsing {
23
+ message: "No rootfile found in container.xml".to_string(),
24
+ source: None,
25
+ })
26
+ }
27
+ Err(e) => Err(crate::KreuzbergError::Parsing {
28
+ message: format!("Failed to parse container.xml: {}", e),
29
+ source: None,
30
+ }),
31
+ }
32
+ }
33
+
34
+ /// Read a file from the ZIP archive
35
+ pub(super) fn read_file_from_zip(archive: &mut ZipArchive<Cursor<Vec<u8>>>, path: &str) -> Result<String> {
36
+ match archive.by_name(path) {
37
+ Ok(mut file) => {
38
+ let mut content = String::new();
39
+ match std::io::Read::read_to_string(&mut file, &mut content) {
40
+ Ok(_) => Ok(content),
41
+ Err(e) => Err(crate::KreuzbergError::Parsing {
42
+ message: format!("Failed to read file from EPUB: {}", e),
43
+ source: None,
44
+ }),
45
+ }
46
+ }
47
+ Err(e) => Err(crate::KreuzbergError::Parsing {
48
+ message: format!("File not found in EPUB: {} ({})", path, e),
49
+ source: None,
50
+ }),
51
+ }
52
+ }
53
+
54
+ /// Resolve a relative path within the manifest directory
55
+ pub(super) fn resolve_path(base_dir: &str, relative_path: &str) -> String {
56
+ if relative_path.starts_with('/') {
57
+ relative_path.trim_start_matches('/').to_string()
58
+ } else if base_dir.is_empty() || base_dir == "." {
59
+ relative_path.to_string()
60
+ } else {
61
+ format!("{}/{}", base_dir.trim_end_matches('/'), relative_path)
62
+ }
63
+ }
64
+
65
#[cfg(test)]
mod tests {
    use super::resolve_path;

    /// A plain relative path is appended to the manifest directory.
    #[test]
    fn test_resolve_path_with_base_dir() {
        assert_eq!(resolve_path("OEBPS", "chapter.xhtml"), "OEBPS/chapter.xhtml");
    }

    /// A leading slash discards the manifest directory.
    #[test]
    fn test_resolve_path_absolute() {
        assert_eq!(resolve_path("OEBPS", "/chapter.xhtml"), "chapter.xhtml");
    }

    /// An empty manifest directory leaves the path untouched.
    #[test]
    fn test_resolve_path_empty_base() {
        assert_eq!(resolve_path("", "chapter.xhtml"), "chapter.xhtml");
    }
}
@@ -140,6 +140,8 @@ impl DocumentExtractor for ExcelExtractor {
140
140
  detected_languages: None,
141
141
  chunks: None,
142
142
  images: None,
143
+ djot_content: None,
144
+ elements: None,
143
145
  })
144
146
  }
145
147
 
@@ -184,6 +186,8 @@ impl DocumentExtractor for ExcelExtractor {
184
186
  detected_languages: None,
185
187
  chunks: None,
186
188
  images: None,
189
+ djot_content: None,
190
+ elements: None,
187
191
  })
188
192
  }
189
193
 
@@ -440,7 +440,9 @@ impl DocumentExtractor for FictionBookExtractor {
440
440
  detected_languages: None,
441
441
  chunks: None,
442
442
  images: None,
443
+ djot_content: None,
443
444
  pages: None,
445
+ elements: None,
444
446
  })
445
447
  }
446
448
 
@@ -0,0 +1,466 @@
1
+ //! Shared frontmatter and metadata utilities for markup extractors.
2
+ //!
3
+ //! This module provides common functionality for extractors that process
4
+ //! documents with YAML frontmatter (Markdown, Djot, etc.).
5
+ //!
6
+ //! This is a core module used by the Djot extractor (always available) and
7
+ //! the enhanced Markdown extractor (requires `office` feature).
8
+
9
+ use crate::types::Metadata;
10
+
11
+ use serde_yaml_ng::Value as YamlValue;
12
+
13
+ /// Extract YAML frontmatter from document content.
14
+ ///
15
+ /// Frontmatter is expected to be delimited by `---` or `...` at the start of the document.
16
+ /// This implementation properly handles edge cases:
17
+ /// - `---` appearing within YAML strings or arrays
18
+ /// - Both `---` and `...` as end delimiters (YAML spec compliant)
19
+ /// - Multiline YAML values containing dashes
20
+ ///
21
+ /// Returns a tuple of (parsed YAML value, remaining content after frontmatter).
22
+ ///
23
+ /// # Examples
24
+ ///
25
+ /// ```rust,ignore
26
+ /// let content = "---\ntitle: Test\n---\n\n# Content";
27
+ /// let (yaml, remaining) = extract_frontmatter(content);
28
+ /// assert!(yaml.is_some());
29
+ /// assert!(remaining.contains("# Content"));
30
+ /// ```
31
+ pub fn extract_frontmatter(content: &str) -> (Option<YamlValue>, String) {
32
+ // Frontmatter must start at the beginning of the document
33
+ if !content.starts_with("---") {
34
+ return (None, content.to_string());
35
+ }
36
+
37
+ // Skip opening delimiter
38
+ let rest = &content[3..];
39
+
40
+ // Find the closing delimiter
41
+ // We need to find "---" or "..." on its own line (not embedded in YAML content)
42
+ // The delimiter must be preceded by a newline and followed by newline or EOF
43
+ let mut end_pos = None;
44
+ let mut search_start = 0;
45
+
46
+ while let Some(pos) = rest[search_start..].find('\n') {
47
+ let absolute_pos = search_start + pos;
48
+ let after_newline = absolute_pos + 1;
49
+
50
+ if after_newline >= rest.len() {
51
+ break;
52
+ }
53
+
54
+ // Check if we have "---" or "..." at the start of a line
55
+ let remaining = &rest[after_newline..];
56
+ if remaining.starts_with("---") || remaining.starts_with("...") {
57
+ // Verify it's on its own line (followed by newline or EOF)
58
+ let delimiter_end = after_newline + 3;
59
+ if delimiter_end >= rest.len() || rest.as_bytes()[delimiter_end] == b'\n' {
60
+ end_pos = Some(absolute_pos);
61
+ break;
62
+ }
63
+ }
64
+
65
+ search_start = after_newline;
66
+ }
67
+
68
+ if let Some(end) = end_pos {
69
+ let frontmatter_str = &rest[..end];
70
+ // Skip past the closing delimiter and any following newline
71
+ let after_delimiter = end + 1; // Skip the newline before delimiter
72
+ let remaining_start = if after_delimiter + 3 < rest.len() {
73
+ // Skip "---" or "..."
74
+ let after_delim = after_delimiter + 3;
75
+ // Skip trailing newline after delimiter if present
76
+ if after_delim < rest.len() && rest.as_bytes()[after_delim] == b'\n' {
77
+ after_delim + 1
78
+ } else {
79
+ after_delim
80
+ }
81
+ } else {
82
+ rest.len()
83
+ };
84
+
85
+ let remaining = if remaining_start < rest.len() {
86
+ &rest[remaining_start..]
87
+ } else {
88
+ ""
89
+ };
90
+
91
+ // Try to parse the frontmatter as YAML
92
+ match serde_yaml_ng::from_str::<YamlValue>(frontmatter_str) {
93
+ Ok(value) => (Some(value), remaining.to_string()),
94
+ Err(_) => (None, content.to_string()),
95
+ }
96
+ } else {
97
+ // No closing delimiter found
98
+ (None, content.to_string())
99
+ }
100
+ }
101
+
102
+ /// Extract metadata from YAML frontmatter.
103
+ ///
104
+ /// Extracts the following YAML fields into Kreuzberg metadata:
105
+ /// - **Standard fields**: title, author, date, description (as subject)
106
+ /// - **Extended fields**: abstract, subject, category, tags, language, version
107
+ /// - **Array fields** (keywords, tags): converted to comma-separated strings
108
+ ///
109
+ /// # Arguments
110
+ ///
111
+ /// * `yaml` - The parsed YAML value from frontmatter
112
+ ///
113
+ /// # Returns
114
+ ///
115
+ /// A `Metadata` struct populated with extracted fields
116
+ ///
117
+ /// # Examples
118
+ ///
119
+ /// ```rust,ignore
120
+ /// let yaml = serde_yaml_ng::from_str("title: Test\nauthor: John").unwrap();
121
+ /// let metadata = extract_metadata_from_yaml(&yaml);
122
+ /// assert_eq!(metadata.additional.get("title"), Some(&"Test".into()));
123
+ /// ```
124
+ pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
125
+ let mut metadata = Metadata::default();
126
+
127
+ // Title
128
+ if let Some(title) = yaml.get("title").and_then(|v| v.as_str()) {
129
+ metadata.additional.insert("title".to_string(), title.into());
130
+ }
131
+
132
+ // Author
133
+ if let Some(author) = yaml.get("author").and_then(|v| v.as_str()) {
134
+ metadata.additional.insert("author".to_string(), author.into());
135
+ }
136
+
137
+ // Date (map to created_at)
138
+ if let Some(date) = yaml.get("date").and_then(|v| v.as_str()) {
139
+ metadata.created_at = Some(date.to_string());
140
+ }
141
+
142
+ // Keywords (support both string and array)
143
+ if let Some(keywords) = yaml.get("keywords") {
144
+ match keywords {
145
+ YamlValue::String(s) => {
146
+ metadata.additional.insert("keywords".to_string(), s.clone().into());
147
+ }
148
+ YamlValue::Sequence(seq) => {
149
+ let keywords_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
150
+ metadata.additional.insert("keywords".to_string(), keywords_str.into());
151
+ }
152
+ _ => {}
153
+ }
154
+ }
155
+
156
+ // Description (map to subject)
157
+ if let Some(description) = yaml.get("description").and_then(|v| v.as_str()) {
158
+ metadata.subject = Some(description.to_string());
159
+ }
160
+
161
+ // Abstract
162
+ if let Some(abstract_text) = yaml.get("abstract").and_then(|v| v.as_str()) {
163
+ metadata.additional.insert("abstract".to_string(), abstract_text.into());
164
+ }
165
+
166
+ // Subject (overrides description if both present)
167
+ if let Some(subject) = yaml.get("subject").and_then(|v| v.as_str()) {
168
+ metadata.subject = Some(subject.to_string());
169
+ }
170
+
171
+ // Category
172
+ if let Some(category) = yaml.get("category").and_then(|v| v.as_str()) {
173
+ metadata.additional.insert("category".to_string(), category.into());
174
+ }
175
+
176
+ // Tags (support both string and array)
177
+ if let Some(tags) = yaml.get("tags") {
178
+ match tags {
179
+ YamlValue::String(s) => {
180
+ metadata.additional.insert("tags".to_string(), s.clone().into());
181
+ }
182
+ YamlValue::Sequence(seq) => {
183
+ let tags_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
184
+ metadata.additional.insert("tags".to_string(), tags_str.into());
185
+ }
186
+ _ => {}
187
+ }
188
+ }
189
+
190
+ // Language
191
+ if let Some(language) = yaml.get("language").and_then(|v| v.as_str()) {
192
+ metadata.additional.insert("language".to_string(), language.into());
193
+ }
194
+
195
+ // Version
196
+ if let Some(version) = yaml.get("version").and_then(|v| v.as_str()) {
197
+ metadata.additional.insert("version".to_string(), version.into());
198
+ }
199
+
200
+ metadata
201
+ }
202
+
203
/// Find the first level-1 heading in `content` and return its text.
///
/// A heading line is one that begins with `# ` (hash followed by a space);
/// the text after that prefix is trimmed of surrounding whitespace.
///
/// # Arguments
///
/// * `content` - The document content to search
///
/// # Returns
///
/// `Some(title)` for the first matching line, `None` when no line matches
///
/// # Examples
///
/// ```rust,ignore
/// let content = "# My Document\n\nContent here";
/// assert_eq!(extract_title_from_content(content), Some("My Document".to_string()));
/// ```
pub fn extract_title_from_content(content: &str) -> Option<String> {
    content
        .lines()
        .find_map(|candidate| candidate.strip_prefix("# "))
        .map(|title| title.trim().to_string())
}
230
+
231
/// Render a grid of cell strings as a markdown table.
///
/// The first row is emitted as the header, followed by a `---` separator with
/// one column per header cell, then every remaining row. Each row is written
/// with exactly the cells it contains; cell text is inserted verbatim.
///
/// # Arguments
///
/// * `cells` - A 2D array where cells[0] is the header row
///
/// # Returns
///
/// The markdown table, or an empty string when `cells` is empty
///
/// # Examples
///
/// ```rust,ignore
/// let cells = vec![
///     vec!["Name".to_string(), "Age".to_string()],
///     vec!["Alice".to_string(), "30".to_string()],
/// ];
/// let markdown = cells_to_markdown(&cells);
/// assert!(markdown.contains("| Name | Age |"));
/// ```
pub fn cells_to_markdown(cells: &[Vec<String>]) -> String {
    /// Append one `| a | b |` line (with trailing newline) to `out`.
    fn push_row(out: &mut String, row: &[String]) {
        out.push('|');
        for value in row {
            out.push(' ');
            out.push_str(value);
            out.push_str(" |");
        }
        out.push('\n');
    }

    let Some((header, body)) = cells.split_first() else {
        return String::new();
    };

    let mut table = String::new();
    push_row(&mut table, header);

    // Separator width follows the header, not the data rows.
    table.push('|');
    table.push_str(&" --- |".repeat(header.len()));
    table.push('\n');

    for row in body {
        push_row(&mut table, row);
    }

    table
}
290
+
291
#[cfg(test)]
mod tests {
    use super::*;

    /// A `---`-delimited block is returned as YAML and the body is kept.
    #[test]
    fn test_frontmatter_basic() {
        let input = "---\ntitle: Test\n---\n\n# Content";
        let (frontmatter, body) = extract_frontmatter(input);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Content"));

        let parsed = extract_metadata_from_yaml(&frontmatter.unwrap());
        let title = parsed.additional.get("title").and_then(|value| value.as_str());
        assert_eq!(title, Some("Test"));
    }

    /// A `---` at the end of a block-scalar line must not end the frontmatter.
    #[test]
    fn test_frontmatter_with_dashes_in_content() {
        let input = "---\ntitle: Test\ndescription: |\n This has ---\n in the middle\n---\n\n# Body";
        let (frontmatter, body) = extract_frontmatter(input);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Body"));
    }

    /// `...` is accepted as the closing delimiter.
    #[test]
    fn test_frontmatter_with_dots_terminator() {
        let input = "---\ntitle: Test\nauthor: John\n...\n\n# Content";
        let (frontmatter, body) = extract_frontmatter(input);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Content"));

        let parsed = extract_metadata_from_yaml(&frontmatter.unwrap());
        let title = parsed.additional.get("title").and_then(|value| value.as_str());
        assert_eq!(title, Some("Test"));
    }

    /// A `---` inside a quoted scalar is part of the value, not a delimiter.
    #[test]
    fn test_frontmatter_with_triple_dash_in_string() {
        let input = "---\ntitle: \"Before --- After\"\nauthor: John\n---\n\n# Content";
        let (frontmatter, body) = extract_frontmatter(input);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Content"));

        let parsed = extract_metadata_from_yaml(&frontmatter.unwrap());
        let title = parsed.additional.get("title").and_then(|value| value.as_str());
        assert_eq!(title, Some("Before --- After"));
    }

    /// An indented `---` line inside a block scalar does not end the frontmatter.
    #[test]
    fn test_frontmatter_multiline_string_with_dashes() {
        let input = "---\ntitle: Test\ndescription: |\n Line 1\n ---\n Line 2\n---\n\n# Body";
        let (frontmatter, body) = extract_frontmatter(input);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Body"));

        let parsed = extract_metadata_from_yaml(&frontmatter.unwrap());
        let title = parsed.additional.get("title").and_then(|value| value.as_str());
        assert_eq!(title, Some("Test"));
    }

    /// Content without a leading delimiter is returned unchanged.
    #[test]
    fn test_no_frontmatter() {
        let input = "# Title\n\nContent without frontmatter";
        let (frontmatter, body) = extract_frontmatter(input);

        assert!(frontmatter.is_none());
        assert_eq!(body, input);
    }

    /// An opening delimiter with no closing one is not treated as frontmatter.
    #[test]
    fn test_incomplete_frontmatter() {
        let input = "---\ntitle: Test\nauthor: John\n\n# Content";
        let (frontmatter, body) = extract_frontmatter(input);

        // No closing delimiter, so nothing is parsed and the input comes back as-is
        assert!(frontmatter.is_none());
        assert_eq!(body, input);
    }

    /// The first `# ` heading is returned as the title.
    #[test]
    fn test_extract_title_from_content() {
        let found = extract_title_from_content("# My Document\n\nContent here");
        assert_eq!(found, Some("My Document".to_string()));
    }

    /// No heading at all yields `None`.
    #[test]
    fn test_extract_title_from_content_no_heading() {
        let found = extract_title_from_content("Content without heading");
        assert_eq!(found, None);
    }

    /// Level-2 headings are not accepted as a title.
    #[test]
    fn test_extract_title_from_content_level_2() {
        let found = extract_title_from_content("## Subheading\n\nContent");
        assert_eq!(found, None);
    }

    /// Header, separator, and data rows all appear in the rendered table.
    #[test]
    fn test_cells_to_markdown() {
        let row = |first: &str, second: &str| vec![first.to_string(), second.to_string()];
        let table = vec![row("Name", "Age"), row("Alice", "30"), row("Bob", "25")];

        let rendered = cells_to_markdown(&table);
        for expected_line in ["| Name | Age |", "| Alice | 30 |", "| Bob | 25 |", "| --- | --- |"] {
            assert!(rendered.contains(expected_line));
        }
    }

    /// An empty cell grid renders as an empty string.
    #[test]
    fn test_cells_to_markdown_empty() {
        let no_rows: Vec<Vec<String>> = Vec::new();
        assert_eq!(cells_to_markdown(&no_rows), "");
    }

    /// A document carrying every supported key populates the metadata.
    #[test]
    fn test_metadata_from_yaml_all_fields() {
        let source = r#"
title: Test Document
author: John Doe
date: 2024-01-15
keywords:
- rust
- testing
description: A test document
abstract: This is an abstract
subject: Test Subject
category: Documentation
tags:
- tag1
- tag2
language: en
version: 1.0
"#;

        let document: YamlValue = serde_yaml_ng::from_str(source).unwrap();
        let parsed = extract_metadata_from_yaml(&document);

        let title = parsed.additional.get("title").and_then(|value| value.as_str());
        assert_eq!(title, Some("Test Document"));

        let author = parsed.additional.get("author").and_then(|value| value.as_str());
        assert_eq!(author, Some("John Doe"));

        assert_eq!(parsed.created_at.as_deref(), Some("2024-01-15"));
        assert!(parsed.additional.contains_key("keywords"));
        assert_eq!(parsed.subject.as_deref(), Some("Test Subject"));
        assert!(parsed.additional.contains_key("tags"));
    }

    /// `keywords` given as a plain string is stored verbatim.
    #[test]
    fn test_metadata_from_yaml_string_arrays() {
        let source = r#"
keywords: "single, keyword, string"
tags: "tag1, tag2"
"#;

        let document: YamlValue = serde_yaml_ng::from_str(source).unwrap();
        let parsed = extract_metadata_from_yaml(&document);

        let keywords = parsed.additional.get("keywords").and_then(|value| value.as_str());
        assert_eq!(keywords, Some("single, keyword, string"));
    }
}
@@ -1,7 +1,7 @@
1
1
  //! HTML document extractor.
2
2
 
3
3
  use crate::Result;
4
- use crate::core::config::ExtractionConfig;
4
+ use crate::core::config::{ExtractionConfig, OutputFormat};
5
5
  use crate::extractors::SyncExtractor;
6
6
  use crate::plugins::{DocumentExtractor, Plugin};
7
7
  use crate::text::utf8_validation;
@@ -202,16 +202,24 @@ impl SyncExtractor for HtmlExtractor {
202
202
  .map(|s| s.to_string())
203
203
  .unwrap_or_else(|_| String::from_utf8_lossy(content).to_string());
204
204
 
205
- let (markdown, html_metadata) =
206
- crate::extraction::html::convert_html_to_markdown_with_metadata(&html, config.html_options.clone())?;
205
+ let (content_text, html_metadata) = crate::extraction::html::convert_html_to_markdown_with_metadata(
206
+ &html,
207
+ config.html_options.clone(),
208
+ Some(config.output_format),
209
+ )?;
207
210
 
208
- let tables = extract_html_tables(&markdown)?;
211
+ let tables = extract_html_tables(&content_text)?;
209
212
 
210
- let content_without_frontmatter = markdown;
213
+ // Set mime_type based on actual output format
214
+ let result_mime_type = match config.output_format {
215
+ OutputFormat::Markdown => "text/markdown",
216
+ OutputFormat::Djot => "text/djot",
217
+ _ => mime_type, // Preserve original mime_type for other formats
218
+ };
211
219
 
212
220
  Ok(ExtractionResult {
213
- content: content_without_frontmatter,
214
- mime_type: mime_type.to_string(),
221
+ content: content_text,
222
+ mime_type: result_mime_type.to_string(),
215
223
  metadata: Metadata {
216
224
  format: html_metadata.map(|m| crate::types::FormatMetadata::Html(Box::new(m))),
217
225
  ..Default::default()
@@ -221,6 +229,8 @@ impl SyncExtractor for HtmlExtractor {
221
229
  detected_languages: None,
222
230
  chunks: None,
223
231
  images: None,
232
+ djot_content: None,
233
+ elements: None,
224
234
  })
225
235
  }
226
236
  }
@@ -275,7 +285,7 @@ mod tests {
275
285
 
276
286
  /// Helper function to convert HTML to markdown for testing
277
287
  fn html_to_markdown_for_test(html: &str) -> String {
278
- crate::extraction::html::convert_html_to_markdown(html, None).unwrap()
288
+ crate::extraction::html::convert_html_to_markdown(html, None, None).unwrap()
279
289
  }
280
290
 
281
291
  #[test]
@@ -416,4 +426,66 @@ mod tests {
416
426
  assert_eq!(table.cells[1], vec!["Alice", "30"]);
417
427
  assert_eq!(table.cells[2], vec!["Bob", "25"]);
418
428
  }
429
+
430
+ #[tokio::test]
431
+ async fn test_html_extractor_with_djot_output() {
432
+ let html = r#"
433
+ <html>
434
+ <body>
435
+ <h1>Test Page</h1>
436
+ <p>Content with <strong>emphasis</strong>.</p>
437
+ </body>
438
+ </html>
439
+ "#;
440
+
441
+ let extractor = HtmlExtractor::new();
442
+ let config = ExtractionConfig {
443
+ output_format: OutputFormat::Djot,
444
+ ..Default::default()
445
+ };
446
+
447
+ let result = extractor
448
+ .extract_bytes(html.as_bytes(), "text/html", &config)
449
+ .await
450
+ .unwrap();
451
+
452
+ assert_eq!(result.mime_type, "text/djot");
453
+ assert!(result.content.contains("# Test Page"));
454
+ assert!(result.content.contains("*emphasis*")); // Djot strong syntax
455
+ }
456
+
457
+ #[tokio::test]
458
+ async fn test_html_extractor_djot_double_conversion_prevention() {
459
+ let html = r#"
460
+ <html>
461
+ <body>
462
+ <h1>Test</h1>
463
+ <p>Content with <strong>bold</strong> text.</p>
464
+ </body>
465
+ </html>
466
+ "#;
467
+
468
+ let extractor = HtmlExtractor::new();
469
+ let config = ExtractionConfig {
470
+ output_format: OutputFormat::Djot,
471
+ ..Default::default()
472
+ };
473
+
474
+ let result = extractor
475
+ .extract_bytes(html.as_bytes(), "text/html", &config)
476
+ .await
477
+ .unwrap();
478
+
479
+ // Content should already be in djot format
480
+ assert_eq!(result.mime_type, "text/djot");
481
+ let original_content = result.content.clone();
482
+
483
+ // Simulate pipeline format application
484
+ let mut pipeline_result = result.clone();
485
+ crate::core::pipeline::apply_output_format(&mut pipeline_result, OutputFormat::Djot);
486
+
487
+ // Content should be identical - no re-conversion should occur
488
+ assert_eq!(pipeline_result.content, original_content);
489
+ assert_eq!(pipeline_result.mime_type, "text/djot");
490
+ }
419
491
  }