kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,150 @@
1
+ //! MIME type detection MCP tool.
2
+
3
+ use crate::{detect_mime_type, mcp::errors::map_kreuzberg_error_to_mcp, mcp::params::DetectMimeTypeParams};
4
+ use rmcp::{
5
+ ErrorData as McpError,
6
+ handler::server::wrapper::Parameters,
7
+ model::{CallToolResult, Content, RawContent},
8
+ tool,
9
+ };
10
+
11
+ /// MCP tool methods for MIME type detection.
12
+ pub(in crate::mcp) trait MimeTypeTool {
13
+ /// Detect the MIME type of a file.
14
+ ///
15
+ /// This tool identifies the file format, useful for determining which extractor to use.
16
+ #[tool(
17
+ description = "Detect the MIME type of a file. Returns the detected MIME type string.",
18
+ annotations(title = "Detect MIME Type", read_only_hint = true, idempotent_hint = true)
19
+ )]
20
+ fn detect_mime_type(
21
+ &self,
22
+ Parameters(params): Parameters<DetectMimeTypeParams>,
23
+ ) -> Result<CallToolResult, McpError> {
24
+ let mime_type = detect_mime_type(&params.path, params.use_content).map_err(map_kreuzberg_error_to_mcp)?;
25
+
26
+ Ok(CallToolResult::success(vec![Content::text(mime_type)]))
27
+ }
28
+ }
29
+
30
+ #[cfg(test)]
31
+ mod tests {
32
+ use super::*;
33
+ use crate::ExtractionConfig;
34
+ use std::path::PathBuf;
35
+
36
+ /// Get the path to a test document relative to workspace root.
37
+ fn get_test_path(relative_path: &str) -> String {
38
+ let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
39
+ .parent()
40
+ .unwrap()
41
+ .parent()
42
+ .unwrap()
43
+ .to_path_buf();
44
+
45
+ workspace_root
46
+ .join("test_documents")
47
+ .join(relative_path)
48
+ .to_string_lossy()
49
+ .to_string()
50
+ }
51
+
52
+ // Simple test struct for trait implementation
53
+ struct TestMcpServer;
54
+
55
+ impl MimeTypeTool for TestMcpServer {}
56
+
57
+ #[tokio::test]
58
+ async fn test_detect_mime_type_with_valid_file() {
59
+ let server = TestMcpServer;
60
+ let params = DetectMimeTypeParams {
61
+ path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
62
+ use_content: true,
63
+ };
64
+
65
+ let result = server.detect_mime_type(Parameters(params));
66
+
67
+ assert!(result.is_ok());
68
+ let call_result = result.unwrap();
69
+ if let Some(content) = call_result.content.first() {
70
+ match &content.raw {
71
+ RawContent::Text(text) => {
72
+ assert!(text.text.contains("application/pdf") || text.text.contains("pdf"));
73
+ }
74
+ _ => panic!("Expected text content"),
75
+ }
76
+ } else {
77
+ panic!("Expected content in result");
78
+ }
79
+ }
80
+
81
+ #[tokio::test]
82
+ async fn test_detect_mime_type_without_content_detection() {
83
+ let server = TestMcpServer;
84
+ let params = DetectMimeTypeParams {
85
+ path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
86
+ use_content: false,
87
+ };
88
+
89
+ let result = server.detect_mime_type(Parameters(params));
90
+
91
+ assert!(result.is_ok());
92
+ }
93
+
94
+ #[tokio::test]
95
+ async fn test_detect_mime_type_with_invalid_file() {
96
+ let server = TestMcpServer;
97
+ let params = DetectMimeTypeParams {
98
+ path: "/nonexistent/file.pdf".to_string(),
99
+ use_content: true,
100
+ };
101
+
102
+ let result = server.detect_mime_type(Parameters(params));
103
+
104
+ assert!(result.is_err());
105
+ let error = result.unwrap_err();
106
+ assert!(error.code.0 == -32602 || error.code.0 == -32603);
107
+ }
108
+
109
+ #[tokio::test]
110
+ async fn test_detect_mime_type_with_extension_only() {
111
+ let server = TestMcpServer;
112
+
113
+ let test_file = get_test_path("pdfs_with_tables/tiny.pdf");
114
+
115
+ if std::path::Path::new(&test_file).exists() {
116
+ let params = DetectMimeTypeParams {
117
+ path: test_file.to_string(),
118
+ use_content: false,
119
+ };
120
+
121
+ let result = server.detect_mime_type(Parameters(params));
122
+
123
+ assert!(result.is_ok());
124
+ let call_result = result.unwrap();
125
+ if let Some(content) = call_result.content.first()
126
+ && let RawContent::Text(text) = &content.raw
127
+ {
128
+ assert!(text.text.contains("pdf") || text.text.contains("PDF"));
129
+ }
130
+ }
131
+ }
132
+
133
+ #[tokio::test]
134
+ async fn test_detect_mime_type_with_content_analysis() {
135
+ let server = TestMcpServer;
136
+
137
+ let test_file = get_test_path("pdfs_with_tables/tiny.pdf");
138
+
139
+ if std::path::Path::new(&test_file).exists() {
140
+ let params = DetectMimeTypeParams {
141
+ path: test_file.to_string(),
142
+ use_content: true,
143
+ };
144
+
145
+ let result = server.detect_mime_type(Parameters(params));
146
+
147
+ assert!(result.is_ok());
148
+ }
149
+ }
150
+ }
@@ -0,0 +1,11 @@
1
+ //! MCP tool implementations.
2
+ //!
3
+ //! This module organizes MCP tools by category: extraction, cache management, and MIME detection.
4
+
5
+ mod cache;
6
+ mod extraction;
7
+ mod mime;
8
+
9
+ pub(in crate::mcp) use cache::CacheTool;
10
+ pub(in crate::mcp) use extraction::ExtractionTool;
11
+ pub(in crate::mcp) use mime::MimeTypeTool;
@@ -0,0 +1,96 @@
1
+ //! EasyOCR language support.
2
+ //!
3
+ //! EasyOCR supports 83 languages with broad multilingual coverage.
4
+
5
+ /// Get list of languages supported by EasyOCR.
6
+ ///
7
+ /// # Returns
8
+ ///
9
+ /// A vector of 83 language codes supported by EasyOCR.
10
+ pub(in crate::ocr) fn languages() -> Vec<String> {
11
+ vec![
12
+ "abq".to_string(),
13
+ "ady".to_string(),
14
+ "af".to_string(),
15
+ "ang".to_string(),
16
+ "ar".to_string(),
17
+ "as".to_string(),
18
+ "ava".to_string(),
19
+ "az".to_string(),
20
+ "be".to_string(),
21
+ "bg".to_string(),
22
+ "bh".to_string(),
23
+ "bho".to_string(),
24
+ "bn".to_string(),
25
+ "bs".to_string(),
26
+ "ch_sim".to_string(),
27
+ "ch_tra".to_string(),
28
+ "che".to_string(),
29
+ "cs".to_string(),
30
+ "cy".to_string(),
31
+ "da".to_string(),
32
+ "dar".to_string(),
33
+ "de".to_string(),
34
+ "en".to_string(),
35
+ "es".to_string(),
36
+ "et".to_string(),
37
+ "fa".to_string(),
38
+ "fr".to_string(),
39
+ "ga".to_string(),
40
+ "gom".to_string(),
41
+ "hi".to_string(),
42
+ "hr".to_string(),
43
+ "hu".to_string(),
44
+ "id".to_string(),
45
+ "inh".to_string(),
46
+ "is".to_string(),
47
+ "it".to_string(),
48
+ "ja".to_string(),
49
+ "kbd".to_string(),
50
+ "kn".to_string(),
51
+ "ko".to_string(),
52
+ "ku".to_string(),
53
+ "la".to_string(),
54
+ "lbe".to_string(),
55
+ "lez".to_string(),
56
+ "lt".to_string(),
57
+ "lv".to_string(),
58
+ "mah".to_string(),
59
+ "mai".to_string(),
60
+ "mi".to_string(),
61
+ "mn".to_string(),
62
+ "mr".to_string(),
63
+ "ms".to_string(),
64
+ "mt".to_string(),
65
+ "ne".to_string(),
66
+ "new".to_string(),
67
+ "nl".to_string(),
68
+ "no".to_string(),
69
+ "oc".to_string(),
70
+ "pi".to_string(),
71
+ "pl".to_string(),
72
+ "pt".to_string(),
73
+ "ro".to_string(),
74
+ "ru".to_string(),
75
+ "rs_cyrillic".to_string(),
76
+ "rs_latin".to_string(),
77
+ "sck".to_string(),
78
+ "sk".to_string(),
79
+ "sl".to_string(),
80
+ "sq".to_string(),
81
+ "sv".to_string(),
82
+ "sw".to_string(),
83
+ "ta".to_string(),
84
+ "tab".to_string(),
85
+ "te".to_string(),
86
+ "th".to_string(),
87
+ "tjk".to_string(),
88
+ "tl".to_string(),
89
+ "tr".to_string(),
90
+ "ug".to_string(),
91
+ "uk".to_string(),
92
+ "ur".to_string(),
93
+ "uz".to_string(),
94
+ "vi".to_string(),
95
+ ]
96
+ }
@@ -0,0 +1,7 @@
1
+ //! OCR backend language support modules.
2
+ //!
3
+ //! Each module provides language lists for a specific OCR backend.
4
+
5
+ pub(super) mod easyocr;
6
+ pub(super) mod paddleocr;
7
+ pub(super) mod tesseract;
@@ -0,0 +1,27 @@
1
+ //! PaddleOCR language support.
2
+ //!
3
+ //! PaddleOCR supports 14 optimized languages for production deployments.
4
+
5
+ /// Get list of languages supported by PaddleOCR.
6
+ ///
7
+ /// # Returns
8
+ ///
9
+ /// A vector of 14 language codes supported by PaddleOCR.
10
+ pub(in crate::ocr) fn languages() -> Vec<String> {
11
+ vec![
12
+ "ch".to_string(),
13
+ "en".to_string(),
14
+ "french".to_string(),
15
+ "german".to_string(),
16
+ "korean".to_string(),
17
+ "japan".to_string(),
18
+ "chinese_cht".to_string(),
19
+ "ta".to_string(),
20
+ "te".to_string(),
21
+ "ka".to_string(),
22
+ "latin".to_string(),
23
+ "arabic".to_string(),
24
+ "cyrillic".to_string(),
25
+ "devanagari".to_string(),
26
+ ]
27
+ }
@@ -0,0 +1,134 @@
1
+ //! Tesseract OCR language support.
2
+ //!
3
+ //! Tesseract supports 100+ languages via the Tesseract OCR engine.
4
+
5
+ /// Get list of languages supported by Tesseract OCR.
6
+ ///
7
+ /// # Returns
8
+ ///
9
+ /// A vector of 100+ language codes supported by Tesseract.
10
+ pub(in crate::ocr) fn languages() -> Vec<String> {
11
+ vec![
12
+ "afr".to_string(),
13
+ "amh".to_string(),
14
+ "ara".to_string(),
15
+ "asm".to_string(),
16
+ "aze".to_string(),
17
+ "aze_cyrl".to_string(),
18
+ "bel".to_string(),
19
+ "ben".to_string(),
20
+ "bod".to_string(),
21
+ "bos".to_string(),
22
+ "bre".to_string(),
23
+ "bul".to_string(),
24
+ "cat".to_string(),
25
+ "ceb".to_string(),
26
+ "ces".to_string(),
27
+ "chi_sim".to_string(),
28
+ "chi_tra".to_string(),
29
+ "chr".to_string(),
30
+ "cos".to_string(),
31
+ "cym".to_string(),
32
+ "dan".to_string(),
33
+ "deu".to_string(),
34
+ "div".to_string(),
35
+ "dzo".to_string(),
36
+ "ell".to_string(),
37
+ "eng".to_string(),
38
+ "enm".to_string(),
39
+ "epo".to_string(),
40
+ "equ".to_string(),
41
+ "est".to_string(),
42
+ "eus".to_string(),
43
+ "fao".to_string(),
44
+ "fas".to_string(),
45
+ "fil".to_string(),
46
+ "fin".to_string(),
47
+ "fra".to_string(),
48
+ "frk".to_string(),
49
+ "frm".to_string(),
50
+ "fry".to_string(),
51
+ "gla".to_string(),
52
+ "gle".to_string(),
53
+ "glg".to_string(),
54
+ "grc".to_string(),
55
+ "guj".to_string(),
56
+ "hat".to_string(),
57
+ "heb".to_string(),
58
+ "hin".to_string(),
59
+ "hrv".to_string(),
60
+ "hun".to_string(),
61
+ "hye".to_string(),
62
+ "iku".to_string(),
63
+ "ind".to_string(),
64
+ "isl".to_string(),
65
+ "ita".to_string(),
66
+ "ita_old".to_string(),
67
+ "jav".to_string(),
68
+ "jpn".to_string(),
69
+ "kan".to_string(),
70
+ "kat".to_string(),
71
+ "kat_old".to_string(),
72
+ "kaz".to_string(),
73
+ "khm".to_string(),
74
+ "kir".to_string(),
75
+ "kmr".to_string(),
76
+ "kor".to_string(),
77
+ "lao".to_string(),
78
+ "lat".to_string(),
79
+ "lav".to_string(),
80
+ "lit".to_string(),
81
+ "ltz".to_string(),
82
+ "mal".to_string(),
83
+ "mar".to_string(),
84
+ "mkd".to_string(),
85
+ "mlt".to_string(),
86
+ "mon".to_string(),
87
+ "mri".to_string(),
88
+ "msa".to_string(),
89
+ "mya".to_string(),
90
+ "nep".to_string(),
91
+ "nld".to_string(),
92
+ "nor".to_string(),
93
+ "oci".to_string(),
94
+ "ori".to_string(),
95
+ "osd".to_string(),
96
+ "pan".to_string(),
97
+ "pol".to_string(),
98
+ "por".to_string(),
99
+ "pus".to_string(),
100
+ "que".to_string(),
101
+ "ron".to_string(),
102
+ "rus".to_string(),
103
+ "san".to_string(),
104
+ "sin".to_string(),
105
+ "slk".to_string(),
106
+ "slv".to_string(),
107
+ "snd".to_string(),
108
+ "spa".to_string(),
109
+ "spa_old".to_string(),
110
+ "sqi".to_string(),
111
+ "srp".to_string(),
112
+ "srp_latn".to_string(),
113
+ "sun".to_string(),
114
+ "swa".to_string(),
115
+ "swe".to_string(),
116
+ "syr".to_string(),
117
+ "tam".to_string(),
118
+ "tat".to_string(),
119
+ "tel".to_string(),
120
+ "tgk".to_string(),
121
+ "tha".to_string(),
122
+ "tir".to_string(),
123
+ "ton".to_string(),
124
+ "tur".to_string(),
125
+ "uig".to_string(),
126
+ "ukr".to_string(),
127
+ "urd".to_string(),
128
+ "uzb".to_string(),
129
+ "uzb_cyrl".to_string(),
130
+ "vie".to_string(),
131
+ "yid".to_string(),
132
+ "yor".to_string(),
133
+ ]
134
+ }
@@ -1,7 +1,15 @@
1
1
  use super::error::OcrError;
2
- use html_to_markdown_rs::{ConversionOptions, convert};
2
+ use crate::core::config::OutputFormat as KreuzbergOutputFormat;
3
+ use html_to_markdown_rs::{ConversionOptions, OutputFormat as LibOutputFormat, convert};
3
4
 
4
- pub fn convert_hocr_to_markdown(hocr_html: &str, options: Option<ConversionOptions>) -> Result<String, OcrError> {
5
+ /// Convert hOCR to the specified output format (Markdown or Djot).
6
+ ///
7
+ /// Defaults to Markdown when `output_format` is `None`, for backward compatibility; `Plain` and `Html` are also rendered as Markdown.
8
+ pub fn convert_hocr_to_markdown(
9
+ hocr_html: &str,
10
+ options: Option<ConversionOptions>,
11
+ output_format: Option<KreuzbergOutputFormat>,
12
+ ) -> Result<String, OcrError> {
5
13
  let use_default = options.is_none();
6
14
  let mut opts = options.unwrap_or_default();
7
15
 
@@ -10,6 +18,14 @@ pub fn convert_hocr_to_markdown(hocr_html: &str, options: Option<ConversionOptio
10
18
  opts.extract_metadata = false;
11
19
  }
12
20
 
21
+ // Set output format: None defaults to Markdown; Plain and Html fall back to Markdown as well
22
+ let format = output_format.unwrap_or(KreuzbergOutputFormat::Markdown);
23
+ opts.output_format = match format {
24
+ KreuzbergOutputFormat::Markdown => LibOutputFormat::Markdown,
25
+ KreuzbergOutputFormat::Djot => LibOutputFormat::Djot,
26
+ KreuzbergOutputFormat::Plain | KreuzbergOutputFormat::Html => LibOutputFormat::Markdown,
27
+ };
28
+
13
29
  convert(hocr_html, Some(opts)).map_err(|e| OcrError::ProcessingFailed(format!("hOCR conversion failed: {}", e)))
14
30
  }
15
31
 
@@ -26,7 +42,7 @@ mod tests {
26
42
  </p>
27
43
  </div>"#;
28
44
 
29
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
45
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
30
46
  assert!(markdown.contains("Hello"));
31
47
  assert!(markdown.contains("World"));
32
48
  }
@@ -40,14 +56,14 @@ mod tests {
40
56
  </p>
41
57
  </div>"#;
42
58
 
43
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
59
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
44
60
  assert!(!markdown.is_empty());
45
61
  }
46
62
 
47
63
  #[test]
48
64
  fn test_empty_hocr() {
49
65
  let hocr = "";
50
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
66
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
51
67
  assert!(markdown.is_empty() || markdown.trim().is_empty());
52
68
  }
53
69
 
@@ -60,7 +76,7 @@ mod tests {
60
76
  </p>
61
77
  </div>"#;
62
78
 
63
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
79
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
64
80
  assert!(!markdown.is_empty());
65
81
  assert!(markdown.contains("Content"));
66
82
  }
@@ -78,7 +94,7 @@ mod tests {
78
94
  </p>
79
95
  </div>"#;
80
96
 
81
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
97
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
82
98
  assert!(markdown.contains("First"));
83
99
  assert!(markdown.contains("Second"));
84
100
  }
@@ -98,7 +114,7 @@ mod tests {
98
114
  </p>
99
115
  </div>"#;
100
116
 
101
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
117
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
102
118
  assert!(!markdown.is_empty());
103
119
  }
104
120
 
@@ -111,7 +127,7 @@ mod tests {
111
127
  </p>
112
128
  </div>"#;
113
129
 
114
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
130
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
115
131
  assert!(!markdown.is_empty());
116
132
  }
117
133
 
@@ -124,7 +140,7 @@ mod tests {
124
140
  </p>
125
141
  </div>"#;
126
142
 
127
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
143
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
128
144
  assert!(!markdown.is_empty());
129
145
  }
130
146
 
@@ -140,7 +156,7 @@ mod tests {
140
156
  </div>
141
157
  </div>"#;
142
158
 
143
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
159
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
144
160
  assert!(markdown.contains("Nested"));
145
161
  }
146
162
 
@@ -151,7 +167,7 @@ mod tests {
151
167
  <span class="ocrx_word">Unclosed
152
168
  </div>"#;
153
169
 
154
- let result = convert_hocr_to_markdown(hocr, None);
170
+ let result = convert_hocr_to_markdown(hocr, None, None);
155
171
  assert!(result.is_ok());
156
172
  }
157
173
 
@@ -163,7 +179,7 @@ mod tests {
163
179
  </p>
164
180
  </div>"#;
165
181
 
166
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
182
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
167
183
  assert!(!markdown.is_empty());
168
184
  }
169
185
 
@@ -179,7 +195,7 @@ mod tests {
179
195
  </ul>
180
196
  </div>"#;
181
197
 
182
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
198
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
183
199
  assert!(markdown.contains("Heading") || markdown.contains("heading") || !markdown.is_empty());
184
200
  }
185
201
 
@@ -193,7 +209,7 @@ mod tests {
193
209
  </p>
194
210
  </div>"#;
195
211
 
196
- let markdown = convert_hocr_to_markdown(hocr, None).unwrap();
212
+ let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
197
213
  assert!(markdown.contains("Ñoño") || !markdown.is_empty());
198
214
  }
199
215
 
@@ -208,9 +224,37 @@ mod tests {
208
224
  }
209
225
  hocr.push_str("</div>");
210
226
 
211
- let result = convert_hocr_to_markdown(&hocr, None);
227
+ let result = convert_hocr_to_markdown(&hocr, None, None);
212
228
  assert!(result.is_ok());
213
229
  let markdown = result.unwrap();
214
230
  assert!(!markdown.is_empty());
215
231
  }
232
+
233
+ #[test]
234
+ fn test_hocr_to_djot_conversion() {
235
+ let hocr = r#"<div class="ocr_page">
236
+ <p class="ocr_par">
237
+ <span class="ocrx_word">Hello</span>
238
+ <span class="ocrx_word">World</span>
239
+ </p>
240
+ </div>"#;
241
+
242
+ let result = convert_hocr_to_markdown(hocr, None, Some(KreuzbergOutputFormat::Djot)).unwrap();
243
+ assert!(result.contains("Hello"));
244
+ assert!(result.contains("World"));
245
+ }
246
+
247
+ #[test]
248
+ fn test_hocr_to_djot_with_formatting() {
249
+ let hocr = r#"<div class="ocr_page">
250
+ <p class="ocr_par">
251
+ <strong class="ocrx_word">Bold</strong>
252
+ <em class="ocrx_word">Italic</em>
253
+ </p>
254
+ </div>"#;
255
+
256
+ let result = convert_hocr_to_markdown(hocr, None, Some(KreuzbergOutputFormat::Djot)).unwrap();
257
+ // Djot uses * for strong and _ for emphasis; this test only checks that the output is non-empty
258
+ assert!(!result.is_empty());
259
+ }
216
260
  }