kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -1,1713 +0,0 @@
1
- use serde::{Deserialize, Serialize};
2
- use std::collections::{BTreeMap, HashMap};
3
- use std::sync::Arc;
4
-
5
- #[cfg(feature = "pdf")]
6
- use crate::pdf::metadata::PdfMetadata;
7
-
8
- // ============================================================================
9
- // ============================================================================
10
-
11
- /// Module providing transparent serde support for Arc<T>.
12
- ///
13
- /// Allows Arc-wrapped types to serialize/deserialize as if unwrapped,
14
- /// maintaining exact JSON format while preserving memory efficiency benefits.
15
- ///
16
- /// # Arc Sharing Semantics
17
- ///
18
- /// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
19
- /// When deserializing, each Arc is independently created with `Arc::new()`.
20
- /// This means that if two Arcs referenced the same data before serialization,
21
- /// they will be separate Arcs after deserialization.
22
- ///
23
- /// Example:
24
- /// ```ignore
25
- /// let shared = Arc::new(Table { /* ... */ });
26
- /// let tables = vec![Arc::clone(&shared), Arc::clone(&shared)];
27
- /// // Both in-memory Arcs point to the same Table
28
- ///
29
- /// let json = serde_json::to_string(&tables)?;
30
- /// let deserialized: Vec<Arc<Table>> = serde_json::from_str(&json)?;
31
- /// // deserialized[0] and deserialized[1] are now independent Arcs,
32
- /// // even though they contain identical data
33
- /// ```
34
- ///
35
- /// This design choice maintains:
36
- /// - Exact JSON format compatibility (no sharing metadata in JSON)
37
- /// - Predictable deserialization behavior
38
- /// - Zero additional serialization overhead
39
- ///
40
- /// If in-memory sharing is required, callers must implement custom sharing logic
41
- /// or use a different data structure (like a HashMap of deduplicated values).
42
- #[allow(dead_code)]
43
- mod serde_arc {
44
- use serde::{Deserialize, Deserializer, Serializer};
45
- use std::sync::Arc;
46
-
47
- /// Serialize an Arc<T> by serializing the inner value directly.
48
- ///
49
- /// This makes Arc<T> serialize identically to T, maintaining API compatibility.
50
- /// The outer Arc wrapper is transparent during serialization.
51
- pub fn serialize<S, T>(arc_value: &Arc<T>, serializer: S) -> Result<S::Ok, S::Error>
52
- where
53
- S: Serializer,
54
- T: serde::Serialize,
55
- {
56
- (**arc_value).serialize(serializer)
57
- }
58
-
59
- /// Deserialize a T and wrap it in Arc.
60
- ///
61
- /// This makes Arc<T> deserialize from the same format as T.
62
- /// Each Arc is independently created during deserialization;
63
- /// Arc sharing from before serialization is NOT preserved.
64
- pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Arc<T>, D::Error>
65
- where
66
- D: Deserializer<'de>,
67
- T: Deserialize<'de>,
68
- {
69
- T::deserialize(deserializer).map(Arc::new)
70
- }
71
- }
72
-
73
- /// Module for serializing Vec<Arc<T>> with transparent Arc handling.
74
- ///
75
- /// Serializes a Vec<Arc<T>> as Vec<T> for compatibility, while preserving
76
- /// the in-memory Arc wrapper (but not Arc sharing) for memory efficiency.
77
- ///
78
- /// # Arc Sharing Semantics
79
- ///
80
- /// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
81
- /// When deserializing, each element's Arc is independently created with `Arc::new()`.
82
- /// This is important for `PageContent` where tables/images may be shared across pages.
83
- ///
84
- /// Example with shared tables:
85
- /// ```ignore
86
- /// let shared_table = Arc::new(Table { /* ... */ });
87
- /// let page_contents = vec![
88
- /// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
89
- /// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
90
- /// ];
91
- /// // In-memory: both pages' tables point to the same Arc
92
- ///
93
- /// let json = serde_json::to_string(&page_contents)?;
94
- /// let deserialized = serde_json::from_str::<Vec<PageContent>>(&json)?;
95
- /// // After deserialization: each page has independent Arc instances,
96
- /// // even though the table data is identical
97
- /// ```
98
- ///
99
- /// Design rationale:
100
- /// - JSON has no mechanism to represent shared references
101
- /// - Preserving sharing would require complex metadata and deduplication
102
- /// - Current approach is simple, predictable, and maintains compatibility
103
- /// - In-memory sharing (via Arc) is an implementation detail for the Rust side
104
- ///
105
- /// If in-memory sharing is required after deserialization, implement custom
106
- /// deduplication logic using hashing or content comparison.
107
- mod serde_vec_arc {
108
- use serde::{Deserialize, Deserializer, Serializer};
109
- use std::sync::Arc;
110
-
111
- /// Serialize Vec<Arc<T>> by serializing each T directly.
112
- ///
113
- /// Each element is unwrapped from its Arc and serialized independently.
114
- /// No sharing metadata is included in the serialized output.
115
- pub fn serialize<S, T>(vec: &[Arc<T>], serializer: S) -> Result<S::Ok, S::Error>
116
- where
117
- S: Serializer,
118
- T: serde::Serialize,
119
- {
120
- use serde::ser::SerializeSeq;
121
- let mut seq = serializer.serialize_seq(Some(vec.len()))?;
122
- for arc_item in vec {
123
- seq.serialize_element(&**arc_item)?;
124
- }
125
- seq.end()
126
- }
127
-
128
- /// Deserialize Vec<T> and wrap each element in Arc.
129
- ///
130
- /// Each element is independently wrapped in a new Arc.
131
- /// Sharing relationships from before serialization are lost.
132
- pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Vec<Arc<T>>, D::Error>
133
- where
134
- D: Deserializer<'de>,
135
- T: Deserialize<'de>,
136
- {
137
- let vec: Vec<T> = Deserialize::deserialize(deserializer)?;
138
- Ok(vec.into_iter().map(Arc::new).collect())
139
- }
140
- }
141
-
142
- /// General extraction result used by the core extraction API.
143
- ///
144
- /// This is the main result type returned by all extraction functions.
145
- #[derive(Debug, Clone, Serialize, Deserialize)]
146
- pub struct ExtractionResult {
147
- pub content: String,
148
- pub mime_type: String,
149
- pub metadata: Metadata,
150
- pub tables: Vec<Table>,
151
- #[serde(skip_serializing_if = "Option::is_none")]
152
- pub detected_languages: Option<Vec<String>>,
153
-
154
- /// Text chunks when chunking is enabled.
155
- ///
156
- /// When chunking configuration is provided, the content is split into
157
- /// overlapping chunks for efficient processing. Each chunk contains the text,
158
- /// optional embeddings (if enabled), and metadata about its position.
159
- #[serde(skip_serializing_if = "Option::is_none")]
160
- pub chunks: Option<Vec<Chunk>>,
161
-
162
- /// Extracted images from the document.
163
- ///
164
- /// When image extraction is enabled via `ImageExtractionConfig`, this field
165
- /// contains all images found in the document with their raw data and metadata.
166
- /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
167
- #[serde(skip_serializing_if = "Option::is_none")]
168
- pub images: Option<Vec<ExtractedImage>>,
169
-
170
- /// Per-page content when page extraction is enabled.
171
- ///
172
- /// When page extraction is configured, the document is split into per-page content
173
- /// with tables and images mapped to their respective pages.
174
- #[serde(skip_serializing_if = "Option::is_none")]
175
- pub pages: Option<Vec<PageContent>>,
176
- }
177
-
178
- /// Format-specific metadata (discriminated union).
179
- ///
180
- /// Only one format type can exist per extraction result. This provides
181
- /// type-safe, clean metadata without nested optionals.
182
- #[derive(Debug, Clone, Serialize, Deserialize)]
183
- #[serde(tag = "format_type", rename_all = "snake_case")]
184
- pub enum FormatMetadata {
185
- #[cfg(feature = "pdf")]
186
- Pdf(PdfMetadata),
187
- Excel(ExcelMetadata),
188
- Email(EmailMetadata),
189
- Pptx(PptxMetadata),
190
- Archive(ArchiveMetadata),
191
- Image(ImageMetadata),
192
- Xml(XmlMetadata),
193
- Text(TextMetadata),
194
- Html(Box<HtmlMetadata>),
195
- Ocr(OcrMetadata),
196
- }
197
-
198
- /// Extraction result metadata.
199
- ///
200
- /// Contains common fields applicable to all formats, format-specific metadata
201
- /// via a discriminated union, and additional custom fields from postprocessors.
202
- #[derive(Debug, Clone, Serialize, Deserialize, Default)]
203
- pub struct Metadata {
204
- /// Document title
205
- #[serde(skip_serializing_if = "Option::is_none")]
206
- pub title: Option<String>,
207
-
208
- /// Document subject or description
209
- #[serde(skip_serializing_if = "Option::is_none")]
210
- pub subject: Option<String>,
211
-
212
- /// Primary author(s) - always Vec for consistency
213
- #[serde(skip_serializing_if = "Option::is_none")]
214
- pub authors: Option<Vec<String>>,
215
-
216
- /// Keywords/tags - always Vec for consistency
217
- #[serde(skip_serializing_if = "Option::is_none")]
218
- pub keywords: Option<Vec<String>>,
219
-
220
- /// Primary language (ISO 639 code)
221
- #[serde(skip_serializing_if = "Option::is_none")]
222
- pub language: Option<String>,
223
-
224
- /// Creation timestamp (ISO 8601 format)
225
- #[serde(skip_serializing_if = "Option::is_none")]
226
- pub created_at: Option<String>,
227
-
228
- /// Last modification timestamp (ISO 8601 format)
229
- #[serde(skip_serializing_if = "Option::is_none")]
230
- pub modified_at: Option<String>,
231
-
232
- /// User who created the document
233
- #[serde(skip_serializing_if = "Option::is_none")]
234
- pub created_by: Option<String>,
235
-
236
- /// User who last modified the document
237
- #[serde(skip_serializing_if = "Option::is_none")]
238
- pub modified_by: Option<String>,
239
-
240
- /// Page/slide/sheet structure with boundaries
241
- #[serde(skip_serializing_if = "Option::is_none")]
242
- pub pages: Option<PageStructure>,
243
-
244
- /// Format-specific metadata (discriminated union)
245
- ///
246
- /// Contains detailed metadata specific to the document format.
247
- /// Serializes with a `format_type` discriminator field.
248
- #[serde(flatten, skip_serializing_if = "Option::is_none")]
249
- pub format: Option<FormatMetadata>,
250
-
251
- /// Image preprocessing metadata (when OCR preprocessing was applied)
252
- #[serde(skip_serializing_if = "Option::is_none")]
253
- pub image_preprocessing: Option<ImagePreprocessingMetadata>,
254
-
255
- /// JSON schema (for structured data extraction)
256
- #[serde(skip_serializing_if = "Option::is_none")]
257
- pub json_schema: Option<serde_json::Value>,
258
-
259
- /// Error metadata (for batch operations)
260
- #[serde(skip_serializing_if = "Option::is_none")]
261
- pub error: Option<ErrorMetadata>,
262
-
263
- /// Additional custom fields from postprocessors.
264
- ///
265
- /// This flattened HashMap allows Python/TypeScript postprocessors to add
266
- /// arbitrary fields (entity extraction, keyword extraction, etc.).
267
- /// Fields are merged at the root level during serialization.
268
- #[serde(flatten)]
269
- pub additional: HashMap<String, serde_json::Value>,
270
- }
271
-
272
- /// Unified page structure for documents.
273
- ///
274
- /// Supports different page types (PDF pages, PPTX slides, Excel sheets)
275
- /// with byte offset boundaries for chunk-to-page mapping.
276
- #[derive(Debug, Clone, Serialize, Deserialize)]
277
- pub struct PageStructure {
278
- /// Total number of pages/slides/sheets
279
- pub total_count: usize,
280
-
281
- /// Type of paginated unit
282
- pub unit_type: PageUnitType,
283
-
284
- /// Byte offset boundaries for each page
285
- ///
286
- /// Maps byte ranges in the extracted content to page numbers.
287
- /// Used for chunk page range calculation.
288
- #[serde(skip_serializing_if = "Option::is_none")]
289
- pub boundaries: Option<Vec<PageBoundary>>,
290
-
291
- /// Detailed per-page metadata (optional, only when needed)
292
- #[serde(skip_serializing_if = "Option::is_none")]
293
- pub pages: Option<Vec<PageInfo>>,
294
- }
295
-
296
- /// Type of paginated unit in a document.
297
- ///
298
- /// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
299
- #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
300
- #[serde(rename_all = "snake_case")]
301
- pub enum PageUnitType {
302
- /// Standard document pages (PDF, DOCX, images)
303
- Page,
304
- /// Presentation slides (PPTX, ODP)
305
- Slide,
306
- /// Spreadsheet sheets (XLSX, ODS)
307
- Sheet,
308
- }
309
-
310
- /// Byte offset boundary for a page.
311
- ///
312
- /// Tracks where a specific page's content starts and ends in the main content string,
313
- /// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
314
- /// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
315
- #[derive(Debug, Clone, Serialize, Deserialize)]
316
- pub struct PageBoundary {
317
- /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
318
- pub byte_start: usize,
319
- /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
320
- pub byte_end: usize,
321
- /// Page number (1-indexed)
322
- pub page_number: usize,
323
- }
324
-
325
- /// Metadata for individual page/slide/sheet.
326
- ///
327
- /// Captures per-page information including dimensions, content counts,
328
- /// and visibility state (for presentations).
329
- #[derive(Debug, Clone, Serialize, Deserialize)]
330
- pub struct PageInfo {
331
- /// Page number (1-indexed)
332
- pub number: usize,
333
-
334
- /// Page title (usually for presentations)
335
- #[serde(skip_serializing_if = "Option::is_none")]
336
- pub title: Option<String>,
337
-
338
- /// Dimensions in points (PDF) or pixels (images): (width, height)
339
- #[serde(skip_serializing_if = "Option::is_none")]
340
- pub dimensions: Option<(f64, f64)>,
341
-
342
- /// Number of images on this page
343
- #[serde(skip_serializing_if = "Option::is_none")]
344
- pub image_count: Option<usize>,
345
-
346
- /// Number of tables on this page
347
- #[serde(skip_serializing_if = "Option::is_none")]
348
- pub table_count: Option<usize>,
349
-
350
- /// Whether this page is hidden (e.g., in presentations)
351
- #[serde(skip_serializing_if = "Option::is_none")]
352
- pub hidden: Option<bool>,
353
- }
354
-
355
- /// Content for a single page/slide.
356
- ///
357
- /// When page extraction is enabled, documents are split into per-page content
358
- /// with associated tables and images mapped to each page.
359
- ///
360
- /// # Performance
361
- ///
362
- /// Uses Arc-wrapped tables and images for memory efficiency:
363
- /// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
364
- /// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
365
- /// - Maintains exact JSON compatibility via custom Serialize/Deserialize
366
- ///
367
- /// This reduces memory overhead for documents with shared tables/images
368
- /// by avoiding redundant copies during serialization.
369
- #[derive(Debug, Clone, Serialize, Deserialize)]
370
- pub struct PageContent {
371
- /// Page number (1-indexed)
372
- pub page_number: usize,
373
-
374
- /// Text content for this page
375
- pub content: String,
376
-
377
- /// Tables found on this page (uses Arc for memory efficiency)
378
- ///
379
- /// Serializes as Vec<Table> for JSON compatibility while maintaining
380
- /// Arc semantics in-memory for zero-copy sharing.
381
- #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
382
- pub tables: Vec<Arc<Table>>,
383
-
384
- /// Images found on this page (uses Arc for memory efficiency)
385
- ///
386
- /// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
387
- /// Arc semantics in-memory for zero-copy sharing.
388
- #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
389
- pub images: Vec<Arc<ExtractedImage>>,
390
-
391
- /// Hierarchy information for the page (when hierarchy extraction is enabled)
392
- ///
393
- /// Contains text hierarchy levels (H1-H6) extracted from the page content.
394
- #[serde(skip_serializing_if = "Option::is_none")]
395
- pub hierarchy: Option<PageHierarchy>,
396
- }
397
-
398
- /// Page hierarchy structure containing heading levels and block information.
399
- ///
400
- /// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
401
- /// blocks with heading levels (H1-H6) for semantic document structure.
402
- #[derive(Debug, Clone, Serialize, Deserialize)]
403
- pub struct PageHierarchy {
404
- /// Number of hierarchy blocks on this page
405
- pub block_count: usize,
406
-
407
- /// Hierarchical blocks with heading levels
408
- #[serde(skip_serializing_if = "Vec::is_empty", default)]
409
- pub blocks: Vec<HierarchicalBlock>,
410
- }
411
-
412
- /// A text block with hierarchy level assignment.
413
- ///
414
- /// Represents a block of text with semantic heading information extracted from
415
- /// font size clustering and hierarchical analysis.
416
- #[derive(Debug, Clone, Serialize, Deserialize)]
417
- pub struct HierarchicalBlock {
418
- /// The text content of this block
419
- pub text: String,
420
-
421
- /// The font size of the text in this block
422
- pub font_size: f32,
423
-
424
- /// The hierarchy level of this block (H1-H6 or Body)
425
- ///
426
- /// Levels correspond to HTML heading tags:
427
- /// - "h1": Top-level heading
428
- /// - "h2": Secondary heading
429
- /// - "h3": Tertiary heading
430
- /// - "h4": Quaternary heading
431
- /// - "h5": Quinary heading
432
- /// - "h6": Senary heading
433
- /// - "body": Body text (no heading level)
434
- pub level: String,
435
-
436
- /// Bounding box information for the block
437
- ///
438
- /// Contains coordinates as (left, top, right, bottom) in PDF units.
439
- #[serde(skip_serializing_if = "Option::is_none")]
440
- pub bbox: Option<(f32, f32, f32, f32)>,
441
- }
442
-
443
- /// Excel/spreadsheet metadata.
444
- ///
445
- /// Contains information about sheets in Excel, LibreOffice Calc, and other
446
- /// spreadsheet formats (.xlsx, .xls, .ods, etc.).
447
- #[derive(Debug, Clone, Serialize, Deserialize)]
448
- pub struct ExcelMetadata {
449
- /// Total number of sheets in the workbook
450
- pub sheet_count: usize,
451
- /// Names of all sheets in order
452
- pub sheet_names: Vec<String>,
453
- }
454
-
455
- /// Email metadata extracted from .eml and .msg files.
456
- ///
457
- /// Includes sender/recipient information, message ID, and attachment list.
458
- #[derive(Debug, Clone, Serialize, Deserialize)]
459
- pub struct EmailMetadata {
460
- /// Sender's email address
461
- #[serde(skip_serializing_if = "Option::is_none")]
462
- pub from_email: Option<String>,
463
-
464
- /// Sender's display name
465
- #[serde(skip_serializing_if = "Option::is_none")]
466
- pub from_name: Option<String>,
467
-
468
- /// Primary recipients
469
- pub to_emails: Vec<String>,
470
- /// CC recipients
471
- pub cc_emails: Vec<String>,
472
- /// BCC recipients
473
- pub bcc_emails: Vec<String>,
474
-
475
- /// Message-ID header value
476
- #[serde(skip_serializing_if = "Option::is_none")]
477
- pub message_id: Option<String>,
478
-
479
- /// List of attachment filenames
480
- pub attachments: Vec<String>,
481
- }
482
-
483
- /// Archive (ZIP/TAR/7Z) metadata.
484
- ///
485
- /// Extracted from compressed archive files containing file lists and size information.
486
- #[derive(Debug, Clone, Serialize, Deserialize)]
487
- pub struct ArchiveMetadata {
488
- /// Archive format ("ZIP", "TAR", "7Z", etc.)
489
- pub format: String,
490
- /// Total number of files in the archive
491
- pub file_count: usize,
492
- /// List of file paths within the archive
493
- pub file_list: Vec<String>,
494
- /// Total uncompressed size in bytes
495
- pub total_size: usize,
496
-
497
- /// Compressed size in bytes (if available)
498
- #[serde(skip_serializing_if = "Option::is_none")]
499
- pub compressed_size: Option<usize>,
500
- }
501
-
502
- /// Image metadata extracted from image files.
503
- ///
504
- /// Includes dimensions, format, and EXIF data.
505
- #[derive(Debug, Clone, Serialize, Deserialize)]
506
- pub struct ImageMetadata {
507
- /// Image width in pixels
508
- pub width: u32,
509
- /// Image height in pixels
510
- pub height: u32,
511
- /// Image format (e.g., "PNG", "JPEG", "TIFF")
512
- pub format: String,
513
- /// EXIF metadata tags
514
- pub exif: HashMap<String, String>,
515
- }
516
-
517
- /// XML metadata extracted during XML parsing.
518
- ///
519
- /// Provides statistics about XML document structure.
520
- #[derive(Debug, Clone, Serialize, Deserialize)]
521
- pub struct XmlMetadata {
522
- /// Total number of XML elements processed
523
- pub element_count: usize,
524
- /// List of unique element tag names (sorted)
525
- pub unique_elements: Vec<String>,
526
- }
527
-
528
- /// Text/Markdown metadata.
529
- ///
530
- /// Extracted from plain text and Markdown files. Includes word counts and,
531
- /// for Markdown, structural elements like headers and links.
532
- #[derive(Debug, Clone, Serialize, Deserialize)]
533
- pub struct TextMetadata {
534
- /// Number of lines in the document
535
- pub line_count: usize,
536
- /// Number of words
537
- pub word_count: usize,
538
- /// Number of characters
539
- pub character_count: usize,
540
-
541
- /// Markdown headers (headings text only, for Markdown files)
542
- #[serde(skip_serializing_if = "Option::is_none")]
543
- pub headers: Option<Vec<String>>,
544
-
545
- /// Markdown links as (text, url) tuples (for Markdown files)
546
- #[serde(skip_serializing_if = "Option::is_none")]
547
- pub links: Option<Vec<(String, String)>>,
548
-
549
- /// Code blocks as (language, code) tuples (for Markdown files)
550
- #[serde(skip_serializing_if = "Option::is_none")]
551
- pub code_blocks: Option<Vec<(String, String)>>,
552
- }
553
-
554
- /// Text direction enumeration for HTML documents.
555
- #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
556
- #[serde(rename_all = "lowercase")]
557
- pub enum TextDirection {
558
- /// Left-to-right text direction
559
- #[serde(rename = "ltr")]
560
- LeftToRight,
561
- /// Right-to-left text direction
562
- #[serde(rename = "rtl")]
563
- RightToLeft,
564
- /// Automatic text direction detection
565
- #[serde(rename = "auto")]
566
- Auto,
567
- }
568
-
569
- /// Header/heading element metadata.
570
- #[derive(Debug, Clone, Serialize, Deserialize)]
571
- pub struct HeaderMetadata {
572
- /// Header level: 1 (h1) through 6 (h6)
573
- pub level: u8,
574
- /// Normalized text content of the header
575
- pub text: String,
576
- /// HTML id attribute if present
577
- #[serde(skip_serializing_if = "Option::is_none")]
578
- pub id: Option<String>,
579
- /// Document tree depth at the header element
580
- pub depth: usize,
581
- /// Byte offset in original HTML document
582
- pub html_offset: usize,
583
- }
584
-
585
- /// Link element metadata.
586
- #[derive(Debug, Clone, Serialize, Deserialize)]
587
- pub struct LinkMetadata {
588
- /// The href URL value
589
- pub href: String,
590
- /// Link text content (normalized)
591
- pub text: String,
592
- /// Optional title attribute
593
- #[serde(skip_serializing_if = "Option::is_none")]
594
- pub title: Option<String>,
595
- /// Link type classification
596
- pub link_type: LinkType,
597
- /// Rel attribute values
598
- pub rel: Vec<String>,
599
- /// Additional attributes as key-value pairs
600
- pub attributes: HashMap<String, String>,
601
- }
602
-
603
- /// Link type classification.
604
- #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
605
- #[serde(rename_all = "lowercase")]
606
- pub enum LinkType {
607
- /// Anchor link (#section)
608
- Anchor,
609
- /// Internal link (same domain)
610
- Internal,
611
- /// External link (different domain)
612
- External,
613
- /// Email link (mailto:)
614
- Email,
615
- /// Phone link (tel:)
616
- Phone,
617
- /// Other link type
618
- Other,
619
- }
620
-
621
- /// Image element metadata.
622
- #[derive(Debug, Clone, Serialize, Deserialize)]
623
- pub struct ImageMetadataType {
624
- /// Image source (URL, data URI, or SVG content)
625
- pub src: String,
626
- /// Alternative text from alt attribute
627
- #[serde(skip_serializing_if = "Option::is_none")]
628
- pub alt: Option<String>,
629
- /// Title attribute
630
- #[serde(skip_serializing_if = "Option::is_none")]
631
- pub title: Option<String>,
632
- /// Image dimensions as (width, height) if available
633
- pub dimensions: Option<(u32, u32)>,
634
- /// Image type classification
635
- pub image_type: ImageType,
636
- /// Additional attributes as key-value pairs
637
- pub attributes: HashMap<String, String>,
638
- }
639
-
640
- /// Image type classification.
641
- #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
642
- #[serde(rename_all = "lowercase")]
643
- pub enum ImageType {
644
- /// Data URI image
645
- #[serde(rename = "data-uri")]
646
- DataUri,
647
- /// Inline SVG
648
- #[serde(rename = "inline-svg")]
649
- InlineSvg,
650
- /// External image URL
651
- External,
652
- /// Relative path image
653
- Relative,
654
- }
655
-
656
- /// Structured data (Schema.org, microdata, RDFa) block.
657
- #[derive(Debug, Clone, Serialize, Deserialize)]
658
- pub struct StructuredData {
659
- /// Type of structured data
660
- pub data_type: StructuredDataType,
661
- /// Raw JSON string representation
662
- pub raw_json: String,
663
- /// Schema type if detectable (e.g., "Article", "Event", "Product")
664
- #[serde(skip_serializing_if = "Option::is_none")]
665
- pub schema_type: Option<String>,
666
- }
667
-
668
- /// Structured data type classification.
669
- #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
670
- #[serde(rename_all = "lowercase")]
671
- pub enum StructuredDataType {
672
- /// JSON-LD structured data
673
- #[serde(rename = "json-ld")]
674
- JsonLd,
675
- /// Microdata
676
- Microdata,
677
- /// RDFa
678
- #[serde(rename = "rdfa")]
679
- RDFa,
680
- }
681
-
682
- /// HTML metadata extracted from HTML documents.
683
- ///
684
- /// Includes document-level metadata, Open Graph data, Twitter Card metadata,
685
- /// and extracted structural elements (headers, links, images, structured data).
686
- #[derive(Debug, Clone, Serialize, Deserialize, Default)]
687
- pub struct HtmlMetadata {
688
- /// Document title from `<title>` tag
689
- #[serde(skip_serializing_if = "Option::is_none")]
690
- pub title: Option<String>,
691
-
692
- /// Document description from `<meta name="description">` tag
693
- #[serde(skip_serializing_if = "Option::is_none")]
694
- pub description: Option<String>,
695
-
696
- /// Document keywords from `<meta name="keywords">` tag, split on commas
697
- #[serde(default)]
698
- pub keywords: Vec<String>,
699
-
700
- /// Document author from `<meta name="author">` tag
701
- #[serde(skip_serializing_if = "Option::is_none")]
702
- pub author: Option<String>,
703
-
704
- /// Canonical URL from `<link rel="canonical">` tag
705
- #[serde(skip_serializing_if = "Option::is_none")]
706
- pub canonical_url: Option<String>,
707
-
708
- /// Base URL from `<base href="">` tag for resolving relative URLs
709
- #[serde(skip_serializing_if = "Option::is_none")]
710
- pub base_href: Option<String>,
711
-
712
- /// Document language from `lang` attribute
713
- #[serde(skip_serializing_if = "Option::is_none")]
714
- pub language: Option<String>,
715
-
716
- /// Document text direction from `dir` attribute
717
- #[serde(skip_serializing_if = "Option::is_none")]
718
- pub text_direction: Option<TextDirection>,
719
-
720
- /// Open Graph metadata (og:* properties) for social media
721
- /// Keys like "title", "description", "image", "url", etc.
722
- #[serde(default)]
723
- pub open_graph: BTreeMap<String, String>,
724
-
725
- /// Twitter Card metadata (twitter:* properties)
726
- /// Keys like "card", "site", "creator", "title", "description", "image", etc.
727
- #[serde(default)]
728
- pub twitter_card: BTreeMap<String, String>,
729
-
730
- /// Additional meta tags not covered by specific fields
731
- /// Keys are meta name/property attributes, values are content
732
- #[serde(default)]
733
- pub meta_tags: BTreeMap<String, String>,
734
-
735
- /// Extracted header elements with hierarchy
736
- #[serde(default)]
737
- pub headers: Vec<HeaderMetadata>,
738
-
739
- /// Extracted hyperlinks with type classification
740
- #[serde(default)]
741
- pub links: Vec<LinkMetadata>,
742
-
743
- /// Extracted images with source and dimensions
744
- #[serde(default)]
745
- pub images: Vec<ImageMetadataType>,
746
-
747
- /// Extracted structured data blocks
748
- #[serde(default)]
749
- pub structured_data: Vec<StructuredData>,
750
- }
751
-
752
- impl HtmlMetadata {
753
- /// Check if metadata is empty (no meaningful content extracted).
754
- pub fn is_empty(&self) -> bool {
755
- self.title.is_none()
756
- && self.description.is_none()
757
- && self.keywords.is_empty()
758
- && self.author.is_none()
759
- && self.canonical_url.is_none()
760
- && self.base_href.is_none()
761
- && self.language.is_none()
762
- && self.text_direction.is_none()
763
- && self.open_graph.is_empty()
764
- && self.twitter_card.is_empty()
765
- && self.meta_tags.is_empty()
766
- && self.headers.is_empty()
767
- && self.links.is_empty()
768
- && self.images.is_empty()
769
- && self.structured_data.is_empty()
770
- }
771
- }
772
-
773
- #[cfg(feature = "html")]
774
- impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
775
- fn from(metadata: html_to_markdown_rs::ExtendedMetadata) -> Self {
776
- let text_dir = metadata.document.text_direction.map(|td| match td {
777
- html_to_markdown_rs::TextDirection::LeftToRight => TextDirection::LeftToRight,
778
- html_to_markdown_rs::TextDirection::RightToLeft => TextDirection::RightToLeft,
779
- html_to_markdown_rs::TextDirection::Auto => TextDirection::Auto,
780
- });
781
-
782
- HtmlMetadata {
783
- title: metadata.document.title,
784
- description: metadata.document.description,
785
- keywords: metadata.document.keywords,
786
- author: metadata.document.author,
787
- canonical_url: metadata.document.canonical_url,
788
- base_href: metadata.document.base_href,
789
- language: metadata.document.language,
790
- text_direction: text_dir,
791
- open_graph: metadata.document.open_graph,
792
- twitter_card: metadata.document.twitter_card,
793
- meta_tags: metadata.document.meta_tags,
794
- headers: metadata
795
- .headers
796
- .into_iter()
797
- .map(|h| HeaderMetadata {
798
- level: h.level,
799
- text: h.text,
800
- id: h.id,
801
- depth: h.depth,
802
- html_offset: h.html_offset,
803
- })
804
- .collect(),
805
- links: metadata
806
- .links
807
- .into_iter()
808
- .map(|l| LinkMetadata {
809
- href: l.href,
810
- text: l.text,
811
- title: l.title,
812
- link_type: match l.link_type {
813
- html_to_markdown_rs::LinkType::Anchor => LinkType::Anchor,
814
- html_to_markdown_rs::LinkType::Internal => LinkType::Internal,
815
- html_to_markdown_rs::LinkType::External => LinkType::External,
816
- html_to_markdown_rs::LinkType::Email => LinkType::Email,
817
- html_to_markdown_rs::LinkType::Phone => LinkType::Phone,
818
- html_to_markdown_rs::LinkType::Other => LinkType::Other,
819
- },
820
- rel: l.rel,
821
- attributes: l.attributes.into_iter().collect(),
822
- })
823
- .collect(),
824
- images: metadata
825
- .images
826
- .into_iter()
827
- .map(|img| ImageMetadataType {
828
- src: img.src,
829
- alt: img.alt,
830
- title: img.title,
831
- dimensions: img.dimensions,
832
- image_type: match img.image_type {
833
- html_to_markdown_rs::ImageType::DataUri => ImageType::DataUri,
834
- html_to_markdown_rs::ImageType::InlineSvg => ImageType::InlineSvg,
835
- html_to_markdown_rs::ImageType::External => ImageType::External,
836
- html_to_markdown_rs::ImageType::Relative => ImageType::Relative,
837
- },
838
- attributes: img.attributes.into_iter().collect(),
839
- })
840
- .collect(),
841
- structured_data: metadata
842
- .structured_data
843
- .into_iter()
844
- .map(|sd| StructuredData {
845
- data_type: match sd.data_type {
846
- html_to_markdown_rs::StructuredDataType::JsonLd => StructuredDataType::JsonLd,
847
- html_to_markdown_rs::StructuredDataType::Microdata => StructuredDataType::Microdata,
848
- html_to_markdown_rs::StructuredDataType::RDFa => StructuredDataType::RDFa,
849
- },
850
- raw_json: sd.raw_json,
851
- schema_type: sd.schema_type,
852
- })
853
- .collect(),
854
- }
855
- }
856
- }
857
-
858
- /// OCR processing metadata.
859
- ///
860
- /// Captures information about OCR processing configuration and results.
861
- #[derive(Debug, Clone, Serialize, Deserialize)]
862
- pub struct OcrMetadata {
863
- /// OCR language code(s) used
864
- pub language: String,
865
- /// Tesseract Page Segmentation Mode (PSM)
866
- pub psm: i32,
867
- /// Output format (e.g., "text", "hocr")
868
- pub output_format: String,
869
- /// Number of tables detected
870
- pub table_count: usize,
871
-
872
- #[serde(skip_serializing_if = "Option::is_none")]
873
- pub table_rows: Option<usize>,
874
-
875
- #[serde(skip_serializing_if = "Option::is_none")]
876
- pub table_cols: Option<usize>,
877
- }
878
-
879
- /// Error metadata (for batch operations).
880
- #[derive(Debug, Clone, Serialize, Deserialize)]
881
- pub struct ErrorMetadata {
882
- pub error_type: String,
883
- pub message: String,
884
- }
885
-
886
- /// Extracted table structure.
887
- ///
888
- /// Represents a table detected and extracted from a document (PDF, image, etc.).
889
- /// Tables are converted to both structured cell data and Markdown format.
890
- #[derive(Debug, Clone, Serialize, Deserialize)]
891
- pub struct Table {
892
- /// Table cells as a 2D vector (rows × columns)
893
- pub cells: Vec<Vec<String>>,
894
- /// Markdown representation of the table
895
- pub markdown: String,
896
- /// Page number where the table was found (1-indexed)
897
- pub page_number: usize,
898
- }
899
-
900
- /// A text chunk with optional embedding and metadata.
901
- ///
902
- /// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
903
- /// contains the text content, optional embedding vector (if embedding generation
904
- /// is configured), and metadata about its position in the document.
905
- #[derive(Debug, Clone, Serialize, Deserialize)]
906
- pub struct Chunk {
907
- /// The text content of this chunk.
908
- pub content: String,
909
-
910
- /// Optional embedding vector for this chunk.
911
- ///
912
- /// Only populated when `EmbeddingConfig` is provided in chunking configuration.
913
- /// The dimensionality depends on the chosen embedding model.
914
- #[serde(skip_serializing_if = "Option::is_none")]
915
- pub embedding: Option<Vec<f32>>,
916
-
917
- /// Metadata about this chunk's position and properties.
918
- pub metadata: ChunkMetadata,
919
- }
920
-
921
- /// Metadata about a chunk's position in the original document.
922
- #[derive(Debug, Clone, Serialize, Deserialize)]
923
- pub struct ChunkMetadata {
924
- /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
925
- pub byte_start: usize,
926
-
927
- /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
928
- pub byte_end: usize,
929
-
930
- /// Number of tokens in this chunk (if available).
931
- ///
932
- /// This is calculated by the embedding model's tokenizer if embeddings are enabled.
933
- #[serde(skip_serializing_if = "Option::is_none")]
934
- pub token_count: Option<usize>,
935
-
936
- /// Zero-based index of this chunk in the document.
937
- pub chunk_index: usize,
938
-
939
- /// Total number of chunks in the document.
940
- pub total_chunks: usize,
941
-
942
- /// First page number this chunk spans (1-indexed).
943
- ///
944
- /// Only populated when page tracking is enabled in extraction configuration.
945
- #[serde(skip_serializing_if = "Option::is_none")]
946
- pub first_page: Option<usize>,
947
-
948
- /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
949
- ///
950
- /// Only populated when page tracking is enabled in extraction configuration.
951
- #[serde(skip_serializing_if = "Option::is_none")]
952
- pub last_page: Option<usize>,
953
- }
954
-
955
- /// Extracted image from a document.
956
- ///
957
- /// Contains raw image data, metadata, and optional nested OCR results.
958
- /// Raw bytes allow cross-language compatibility - users can convert to
959
- /// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
960
- #[derive(Debug, Clone, Serialize, Deserialize)]
961
- pub struct ExtractedImage {
962
- /// Raw image data (PNG, JPEG, WebP, etc. bytes)
963
- pub data: Vec<u8>,
964
-
965
- /// Image format (e.g., "jpeg", "png", "webp")
966
- pub format: String,
967
-
968
- /// Zero-indexed position of this image in the document/page
969
- pub image_index: usize,
970
-
971
- /// Page/slide number where image was found (1-indexed)
972
- #[serde(skip_serializing_if = "Option::is_none")]
973
- pub page_number: Option<usize>,
974
-
975
- /// Image width in pixels
976
- #[serde(skip_serializing_if = "Option::is_none")]
977
- pub width: Option<u32>,
978
-
979
- /// Image height in pixels
980
- #[serde(skip_serializing_if = "Option::is_none")]
981
- pub height: Option<u32>,
982
-
983
- /// Colorspace information (e.g., "RGB", "CMYK", "Gray")
984
- #[serde(skip_serializing_if = "Option::is_none")]
985
- pub colorspace: Option<String>,
986
-
987
- /// Bits per color component (e.g., 8, 16)
988
- #[serde(skip_serializing_if = "Option::is_none")]
989
- pub bits_per_component: Option<u32>,
990
-
991
- /// Whether this image is a mask image
992
- #[serde(default)]
993
- pub is_mask: bool,
994
-
995
- /// Optional description of the image
996
- #[serde(skip_serializing_if = "Option::is_none")]
997
- pub description: Option<String>,
998
-
999
- /// Nested OCR extraction result (if image was OCRed)
1000
- ///
1001
- /// When OCR is performed on this image, the result is embedded here
1002
- /// rather than in a separate collection, making the relationship explicit.
1003
- #[serde(skip_serializing_if = "Option::is_none")]
1004
- pub ocr_result: Option<Box<ExtractionResult>>,
1005
- }
1006
-
1007
- /// Excel workbook representation.
1008
- ///
1009
- /// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
1010
- /// extracted content and metadata.
1011
- #[derive(Debug, Clone, Serialize, Deserialize)]
1012
- pub struct ExcelWorkbook {
1013
- /// All sheets in the workbook
1014
- pub sheets: Vec<ExcelSheet>,
1015
- /// Workbook-level metadata (author, creation date, etc.)
1016
- pub metadata: HashMap<String, String>,
1017
- }
1018
-
1019
- /// Single Excel worksheet.
1020
- ///
1021
- /// Represents one sheet from an Excel workbook with its content
1022
- /// converted to Markdown format and dimensional statistics.
1023
- #[derive(Debug, Clone, Serialize, Deserialize)]
1024
- pub struct ExcelSheet {
1025
- /// Sheet name as it appears in Excel
1026
- pub name: String,
1027
- /// Sheet content converted to Markdown tables
1028
- pub markdown: String,
1029
- /// Number of rows
1030
- pub row_count: usize,
1031
- /// Number of columns
1032
- pub col_count: usize,
1033
- /// Total number of non-empty cells
1034
- pub cell_count: usize,
1035
- /// Pre-extracted table cells (2D vector of cell values)
1036
- /// Populated during markdown generation to avoid re-parsing markdown.
1037
- /// None for empty sheets.
1038
- #[serde(skip)]
1039
- pub table_cells: Option<Vec<Vec<String>>>,
1040
- }
1041
-
1042
- /// XML extraction result.
1043
- ///
1044
- /// Contains extracted text content from XML files along with
1045
- /// structural statistics about the XML document.
1046
- #[derive(Debug, Clone, Serialize, Deserialize)]
1047
- pub struct XmlExtractionResult {
1048
- /// Extracted text content (XML structure filtered out)
1049
- pub content: String,
1050
- /// Total number of XML elements processed
1051
- pub element_count: usize,
1052
- /// List of unique element names found (sorted)
1053
- pub unique_elements: Vec<String>,
1054
- }
1055
-
1056
- /// Plain text and Markdown extraction result.
1057
- ///
1058
- /// Contains the extracted text along with statistics and,
1059
- /// for Markdown files, structural elements like headers and links.
1060
- #[derive(Debug, Clone, Serialize, Deserialize)]
1061
- pub struct TextExtractionResult {
1062
- /// Extracted text content
1063
- pub content: String,
1064
- /// Number of lines
1065
- pub line_count: usize,
1066
- /// Number of words
1067
- pub word_count: usize,
1068
- /// Number of characters
1069
- pub character_count: usize,
1070
- /// Markdown headers (text only, Markdown files only)
1071
- #[serde(skip_serializing_if = "Option::is_none")]
1072
- pub headers: Option<Vec<String>>,
1073
- /// Markdown links as (text, URL) tuples (Markdown files only)
1074
- #[serde(skip_serializing_if = "Option::is_none")]
1075
- pub links: Option<Vec<(String, String)>>,
1076
- /// Code blocks as (language, code) tuples (Markdown files only)
1077
- #[serde(skip_serializing_if = "Option::is_none")]
1078
- pub code_blocks: Option<Vec<(String, String)>>,
1079
- }
1080
-
1081
- /// PowerPoint (PPTX) extraction result.
1082
- ///
1083
- /// Contains extracted slide content, metadata, and embedded images/tables.
1084
- #[derive(Debug, Clone, Serialize, Deserialize)]
1085
- pub struct PptxExtractionResult {
1086
- /// Extracted text content from all slides
1087
- pub content: String,
1088
- /// Presentation metadata
1089
- pub metadata: PptxMetadata,
1090
- /// Total number of slides
1091
- pub slide_count: usize,
1092
- /// Total number of embedded images
1093
- pub image_count: usize,
1094
- /// Total number of tables
1095
- pub table_count: usize,
1096
- /// Extracted images from the presentation
1097
- pub images: Vec<ExtractedImage>,
1098
- /// Slide structure with boundaries (when page tracking is enabled)
1099
- #[serde(skip_serializing_if = "Option::is_none")]
1100
- pub page_structure: Option<PageStructure>,
1101
- /// Per-slide content (when page tracking is enabled)
1102
- #[serde(skip_serializing_if = "Option::is_none")]
1103
- pub page_contents: Option<Vec<PageContent>>,
1104
- }
1105
-
1106
- /// PowerPoint presentation metadata.
1107
- ///
1108
- /// Contains PPTX-specific metadata. Common fields like title, author, and description
1109
- /// are now in the base `Metadata` struct.
1110
- #[derive(Debug, Clone, Serialize, Deserialize)]
1111
- pub struct PptxMetadata {
1112
- /// List of fonts used in the presentation
1113
- #[serde(skip_serializing_if = "Vec::is_empty", default)]
1114
- pub fonts: Vec<String>,
1115
- }
1116
-
1117
- /// Email extraction result.
1118
- ///
1119
- /// Complete representation of an extracted email message (.eml or .msg)
1120
- /// including headers, body content, and attachments.
1121
- #[derive(Debug, Clone, Serialize, Deserialize)]
1122
- pub struct EmailExtractionResult {
1123
- /// Email subject line
1124
- pub subject: Option<String>,
1125
- /// Sender email address
1126
- pub from_email: Option<String>,
1127
- /// Primary recipient email addresses
1128
- pub to_emails: Vec<String>,
1129
- /// CC recipient email addresses
1130
- pub cc_emails: Vec<String>,
1131
- /// BCC recipient email addresses
1132
- pub bcc_emails: Vec<String>,
1133
- /// Email date/timestamp
1134
- pub date: Option<String>,
1135
- /// Message-ID header value
1136
- pub message_id: Option<String>,
1137
- /// Plain text version of the email body
1138
- pub plain_text: Option<String>,
1139
- /// HTML version of the email body
1140
- pub html_content: Option<String>,
1141
- /// Cleaned/processed text content
1142
- pub cleaned_text: String,
1143
- /// List of email attachments
1144
- pub attachments: Vec<EmailAttachment>,
1145
- /// Additional email headers and metadata
1146
- pub metadata: HashMap<String, String>,
1147
- }
1148
-
1149
- /// Email attachment representation.
1150
- ///
1151
- /// Contains metadata and optionally the content of an email attachment.
1152
- #[derive(Debug, Clone, Serialize, Deserialize)]
1153
- pub struct EmailAttachment {
1154
- /// Attachment name (from Content-Disposition header)
1155
- pub name: Option<String>,
1156
- /// Filename of the attachment
1157
- pub filename: Option<String>,
1158
- /// MIME type of the attachment
1159
- pub mime_type: Option<String>,
1160
- /// Size in bytes
1161
- pub size: Option<usize>,
1162
- /// Whether this attachment is an image
1163
- pub is_image: bool,
1164
- /// Attachment data (if extracted)
1165
- pub data: Option<Vec<u8>>,
1166
- }
1167
-
1168
- /// OCR extraction result.
1169
- ///
1170
- /// Result of performing OCR on an image or scanned document,
1171
- /// including recognized text and detected tables.
1172
- #[derive(Debug, Clone, Serialize, Deserialize)]
1173
- pub struct OcrExtractionResult {
1174
- /// Recognized text content
1175
- pub content: String,
1176
- /// Original MIME type of the processed image
1177
- pub mime_type: String,
1178
- /// OCR processing metadata (confidence scores, language, etc.)
1179
- pub metadata: HashMap<String, serde_json::Value>,
1180
- /// Tables detected and extracted via OCR
1181
- pub tables: Vec<OcrTable>,
1182
- }
1183
-
1184
- /// Table detected via OCR.
1185
- ///
1186
- /// Represents a table structure recognized during OCR processing.
1187
- #[derive(Debug, Clone, Serialize, Deserialize)]
1188
- pub struct OcrTable {
1189
- /// Table cells as a 2D vector (rows × columns)
1190
- pub cells: Vec<Vec<String>>,
1191
- /// Markdown representation of the table
1192
- pub markdown: String,
1193
- /// Page number where the table was found (1-indexed)
1194
- pub page_number: usize,
1195
- }
1196
-
1197
- /// Image preprocessing configuration for OCR.
1198
- ///
1199
- /// These settings control how images are preprocessed before OCR to improve
1200
- /// text recognition quality. Different preprocessing strategies work better
1201
- /// for different document types.
1202
- #[derive(Debug, Clone, Serialize, Deserialize)]
1203
- #[serde(default)]
1204
- pub struct ImagePreprocessingConfig {
1205
- /// Target DPI for the image (300 is standard, 600 for small text).
1206
- pub target_dpi: i32,
1207
-
1208
- /// Auto-detect and correct image rotation.
1209
- pub auto_rotate: bool,
1210
-
1211
- /// Correct skew (tilted images).
1212
- pub deskew: bool,
1213
-
1214
- /// Remove noise from the image.
1215
- pub denoise: bool,
1216
-
1217
- /// Enhance contrast for better text visibility.
1218
- pub contrast_enhance: bool,
1219
-
1220
- /// Binarization method: "otsu", "sauvola", "adaptive".
1221
- pub binarization_method: String,
1222
-
1223
- /// Invert colors (white text on black → black on white).
1224
- pub invert_colors: bool,
1225
- }
1226
-
1227
- impl Default for ImagePreprocessingConfig {
1228
- fn default() -> Self {
1229
- Self {
1230
- target_dpi: 300,
1231
- auto_rotate: true,
1232
- deskew: true,
1233
- denoise: false,
1234
- contrast_enhance: false,
1235
- binarization_method: "otsu".to_string(),
1236
- invert_colors: false,
1237
- }
1238
- }
1239
- }
1240
-
1241
- /// Tesseract OCR configuration.
1242
- ///
1243
- /// Provides fine-grained control over Tesseract OCR engine parameters.
1244
- /// Most users can use the defaults, but these settings allow optimization
1245
- /// for specific document types (invoices, handwriting, etc.).
1246
- #[derive(Debug, Clone, Serialize, Deserialize)]
1247
- #[serde(default)]
1248
- pub struct TesseractConfig {
1249
- /// Language code (e.g., "eng", "deu", "fra")
1250
- pub language: String,
1251
-
1252
- /// Page Segmentation Mode (0-13).
1253
- ///
1254
- /// Common values:
1255
- /// - 3: Fully automatic page segmentation (default)
1256
- /// - 6: Assume a single uniform block of text
1257
- /// - 11: Sparse text with no particular order
1258
- pub psm: i32,
1259
-
1260
- /// Output format ("text" or "markdown")
1261
- pub output_format: String,
1262
-
1263
- /// OCR Engine Mode (0-3).
1264
- ///
1265
- /// - 0: Legacy engine only
1266
- /// - 1: Neural nets (LSTM) only (usually best)
1267
- /// - 2: Legacy + LSTM
1268
- /// - 3: Default (based on what's available)
1269
- pub oem: i32,
1270
-
1271
- /// Minimum confidence threshold (0.0-100.0).
1272
- ///
1273
- /// Words with confidence below this threshold may be rejected or flagged.
1274
- pub min_confidence: f64,
1275
-
1276
- /// Image preprocessing configuration.
1277
- ///
1278
- /// Controls how images are preprocessed before OCR. Can significantly
1279
- /// improve quality for scanned documents or low-quality images.
1280
- #[serde(skip_serializing_if = "Option::is_none")]
1281
- pub preprocessing: Option<ImagePreprocessingConfig>,
1282
-
1283
- /// Enable automatic table detection and reconstruction
1284
- pub enable_table_detection: bool,
1285
-
1286
- /// Minimum confidence threshold for table detection (0.0-1.0)
1287
- pub table_min_confidence: f64,
1288
-
1289
- /// Column threshold for table detection (pixels)
1290
- pub table_column_threshold: i32,
1291
-
1292
- /// Row threshold ratio for table detection (0.0-1.0)
1293
- pub table_row_threshold_ratio: f64,
1294
-
1295
- /// Enable OCR result caching
1296
- pub use_cache: bool,
1297
-
1298
- /// Use pre-adapted templates for character classification
1299
- pub classify_use_pre_adapted_templates: bool,
1300
-
1301
- /// Enable N-gram language model
1302
- pub language_model_ngram_on: bool,
1303
-
1304
- /// Don't reject good words during block-level processing
1305
- pub tessedit_dont_blkrej_good_wds: bool,
1306
-
1307
- /// Don't reject good words during row-level processing
1308
- pub tessedit_dont_rowrej_good_wds: bool,
1309
-
1310
- /// Enable dictionary correction
1311
- pub tessedit_enable_dict_correction: bool,
1312
-
1313
- /// Whitelist of allowed characters (empty = all allowed)
1314
- pub tessedit_char_whitelist: String,
1315
-
1316
- /// Blacklist of forbidden characters (empty = none forbidden)
1317
- pub tessedit_char_blacklist: String,
1318
-
1319
- /// Use primary language params model
1320
- pub tessedit_use_primary_params_model: bool,
1321
-
1322
- /// Variable-width space detection
1323
- pub textord_space_size_is_variable: bool,
1324
-
1325
- /// Use adaptive thresholding method
1326
- pub thresholding_method: bool,
1327
- }
1328
-
1329
- impl Default for TesseractConfig {
1330
- fn default() -> Self {
1331
- Self {
1332
- language: "eng".to_string(),
1333
- psm: 3,
1334
- output_format: "markdown".to_string(),
1335
- oem: 3,
1336
- min_confidence: 0.0,
1337
- preprocessing: None,
1338
- enable_table_detection: true,
1339
- table_min_confidence: 0.0,
1340
- table_column_threshold: 50,
1341
- table_row_threshold_ratio: 0.5,
1342
- use_cache: true,
1343
- classify_use_pre_adapted_templates: true,
1344
- language_model_ngram_on: false,
1345
- tessedit_dont_blkrej_good_wds: true,
1346
- tessedit_dont_rowrej_good_wds: true,
1347
- tessedit_enable_dict_correction: true,
1348
- tessedit_char_whitelist: String::new(),
1349
- tessedit_char_blacklist: String::new(),
1350
- tessedit_use_primary_params_model: true,
1351
- textord_space_size_is_variable: true,
1352
- thresholding_method: false,
1353
- }
1354
- }
1355
- }
1356
-
1357
- /// Image preprocessing metadata.
1358
- ///
1359
- /// Tracks the transformations applied to an image during OCR preprocessing,
1360
- /// including DPI normalization, resizing, and resampling.
1361
- #[derive(Debug, Clone, Serialize, Deserialize)]
1362
- pub struct ImagePreprocessingMetadata {
1363
- /// Original image dimensions (width, height) in pixels
1364
- pub original_dimensions: (usize, usize),
1365
- /// Original image DPI (horizontal, vertical)
1366
- pub original_dpi: (f64, f64),
1367
- /// Target DPI from configuration
1368
- pub target_dpi: i32,
1369
- /// Scaling factor applied to the image
1370
- pub scale_factor: f64,
1371
- /// Whether DPI was auto-adjusted based on content
1372
- pub auto_adjusted: bool,
1373
- /// Final DPI after processing
1374
- pub final_dpi: i32,
1375
- /// New dimensions after resizing (if resized)
1376
- pub new_dimensions: Option<(usize, usize)>,
1377
- /// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
1378
- pub resample_method: String,
1379
- /// Whether dimensions were clamped to max_image_dimension
1380
- pub dimension_clamped: bool,
1381
- /// Calculated optimal DPI (if auto_adjust_dpi enabled)
1382
- pub calculated_dpi: Option<i32>,
1383
- /// Whether resize was skipped (dimensions already optimal)
1384
- pub skipped_resize: bool,
1385
- /// Error message if resize failed
1386
- pub resize_error: Option<String>,
1387
- }
1388
-
1389
- /// Image extraction configuration (internal use).
1390
- ///
1391
- /// **Note:** This is an internal type used for image preprocessing.
1392
- /// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
1393
- #[derive(Debug, Clone, Serialize, Deserialize)]
1394
- pub struct ExtractionConfig {
1395
- /// Target DPI for image normalization
1396
- pub target_dpi: i32,
1397
- /// Maximum image dimension (width or height)
1398
- pub max_image_dimension: i32,
1399
- /// Whether to auto-adjust DPI based on content
1400
- pub auto_adjust_dpi: bool,
1401
- /// Minimum DPI threshold
1402
- pub min_dpi: i32,
1403
- /// Maximum DPI threshold
1404
- pub max_dpi: i32,
1405
- }
1406
-
1407
- impl Default for ExtractionConfig {
1408
- fn default() -> Self {
1409
- Self {
1410
- target_dpi: 300,
1411
- max_image_dimension: 4096,
1412
- auto_adjust_dpi: true,
1413
- min_dpi: 72,
1414
- max_dpi: 600,
1415
- }
1416
- }
1417
- }
1418
-
1419
- /// Cache statistics.
1420
- ///
1421
- /// Provides information about the extraction result cache,
1422
- /// including size, file count, and age distribution.
1423
- #[derive(Debug, Clone, Serialize, Deserialize)]
1424
- pub struct CacheStats {
1425
- /// Total number of cached files
1426
- pub total_files: usize,
1427
- /// Total cache size in megabytes
1428
- pub total_size_mb: f64,
1429
- /// Available disk space in megabytes
1430
- pub available_space_mb: f64,
1431
- /// Age of the oldest cached file in days
1432
- pub oldest_file_age_days: f64,
1433
- /// Age of the newest cached file in days
1434
- pub newest_file_age_days: f64,
1435
- }
1436
-
1437
- /// LibreOffice conversion result.
1438
- ///
1439
- /// Result of converting a legacy office document (e.g., .doc, .ppt)
1440
- /// to a modern format using LibreOffice.
1441
- #[derive(Debug, Clone, Serialize, Deserialize)]
1442
- pub struct LibreOfficeConversionResult {
1443
- /// Converted file bytes
1444
- pub converted_bytes: Vec<u8>,
1445
- /// Original format identifier
1446
- pub original_format: String,
1447
- /// Target format identifier
1448
- pub target_format: String,
1449
- /// Target MIME type after conversion
1450
- pub target_mime: String,
1451
- }
1452
-
1453
- #[cfg(test)]
1454
- mod tests {
1455
- use super::*;
1456
-
1457
- #[test]
1458
- fn test_metadata_serialization_with_format() {
1459
- let mut metadata = Metadata {
1460
- format: Some(FormatMetadata::Text(TextMetadata {
1461
- line_count: 1,
1462
- word_count: 2,
1463
- character_count: 13,
1464
- headers: None,
1465
- links: None,
1466
- code_blocks: None,
1467
- })),
1468
- ..Default::default()
1469
- };
1470
-
1471
- metadata
1472
- .additional
1473
- .insert("quality_score".to_string(), serde_json::json!(1.0));
1474
-
1475
- let json = serde_json::to_value(&metadata).unwrap();
1476
- println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
1477
-
1478
- assert!(
1479
- json.get("format_type").is_some(),
1480
- "format_type should be present in serialized JSON"
1481
- );
1482
- assert_eq!(json.get("format_type").unwrap(), "text");
1483
-
1484
- assert_eq!(json.get("line_count").unwrap(), 1);
1485
- assert_eq!(json.get("word_count").unwrap(), 2);
1486
- assert_eq!(json.get("character_count").unwrap(), 13);
1487
-
1488
- assert_eq!(json.get("quality_score").unwrap(), 1.0);
1489
- }
1490
-
1491
- #[test]
1492
- fn test_arc_table_serialization_format() {
1493
- let table = Table {
1494
- cells: vec![vec!["A".to_string(), "B".to_string()]],
1495
- markdown: "| A | B |\n|---|---|\n".to_string(),
1496
- page_number: 1,
1497
- };
1498
-
1499
- let json = serde_json::to_value(&table).unwrap();
1500
-
1501
- assert_eq!(json.get("cells").unwrap()[0][0], "A");
1502
- assert_eq!(json.get("markdown").unwrap(), "| A | B |\n|---|---|\n");
1503
- assert_eq!(json.get("page_number").unwrap(), 1);
1504
- }
1505
-
1506
- #[test]
1507
- fn test_arc_table_roundtrip() {
1508
- let original = Table {
1509
- cells: vec![
1510
- vec!["X".to_string(), "Y".to_string()],
1511
- vec!["1".to_string(), "2".to_string()],
1512
- ],
1513
- markdown: "| X | Y |\n|---|---|\n| 1 | 2 |\n".to_string(),
1514
- page_number: 5,
1515
- };
1516
-
1517
- let json = serde_json::to_string(&original).unwrap();
1518
- let deserialized: Table = serde_json::from_str(&json).unwrap();
1519
-
1520
- assert_eq!(deserialized.cells, original.cells);
1521
- assert_eq!(deserialized.markdown, original.markdown);
1522
- assert_eq!(deserialized.page_number, original.page_number);
1523
- }
1524
-
1525
- #[test]
1526
- fn test_arc_sharing_preserved_before_serialization() {
1527
- let shared_table = Arc::new(Table {
1528
- cells: vec![vec!["shared".to_string()]],
1529
- markdown: "| shared |".to_string(),
1530
- page_number: 1,
1531
- });
1532
-
1533
- let tables_before = [Arc::clone(&shared_table), Arc::clone(&shared_table)].to_vec();
1534
- assert_eq!(Arc::strong_count(&tables_before[0]), 3);
1535
- assert_eq!(Arc::strong_count(&tables_before[1]), 3);
1536
- assert!(Arc::ptr_eq(&tables_before[0], &tables_before[1]));
1537
- }
1538
-
1539
- #[test]
1540
- fn test_vec_arc_table_serialization_format() {
1541
- let tables = vec![
1542
- Table {
1543
- cells: vec![vec!["A".to_string()]],
1544
- markdown: "| A |".to_string(),
1545
- page_number: 1,
1546
- },
1547
- Table {
1548
- cells: vec![vec!["B".to_string()]],
1549
- markdown: "| B |".to_string(),
1550
- page_number: 2,
1551
- },
1552
- ];
1553
-
1554
- let json = serde_json::to_string(&tables).unwrap();
1555
- let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
1556
-
1557
- assert!(parsed.is_array());
1558
- assert_eq!(parsed.as_array().unwrap().len(), 2);
1559
- assert_eq!(parsed[0]["cells"][0][0], "A");
1560
- assert_eq!(parsed[1]["cells"][0][0], "B");
1561
- }
1562
-
1563
- #[test]
1564
- fn test_page_content_arc_tables_roundtrip() {
1565
- let page = PageContent {
1566
- page_number: 3,
1567
- content: "Page 3 content".to_string(),
1568
- tables: vec![
1569
- Arc::new(Table {
1570
- cells: vec![vec!["Table1".to_string()]],
1571
- markdown: "| Table1 |".to_string(),
1572
- page_number: 3,
1573
- }),
1574
- Arc::new(Table {
1575
- cells: vec![vec!["Table2".to_string()]],
1576
- markdown: "| Table2 |".to_string(),
1577
- page_number: 3,
1578
- }),
1579
- ],
1580
- images: Vec::new(),
1581
- hierarchy: None,
1582
- };
1583
-
1584
- let json = serde_json::to_string(&page).unwrap();
1585
- let deserialized: PageContent = serde_json::from_str(&json).unwrap();
1586
-
1587
- assert_eq!(deserialized.page_number, 3);
1588
- assert_eq!(deserialized.content, "Page 3 content");
1589
- assert_eq!(deserialized.tables.len(), 2);
1590
- assert_eq!(deserialized.tables[0].cells[0][0], "Table1");
1591
- assert_eq!(deserialized.tables[1].cells[0][0], "Table2");
1592
- }
1593
-
1594
- #[test]
1595
- fn test_page_content_arc_images_roundtrip() {
1596
- let image1 = Arc::new(ExtractedImage {
1597
- data: vec![0xFF, 0xD8, 0xFF],
1598
- format: "jpeg".to_string(),
1599
- image_index: 0,
1600
- page_number: Some(1),
1601
- width: Some(100),
1602
- height: Some(200),
1603
- colorspace: Some("RGB".to_string()),
1604
- bits_per_component: Some(8),
1605
- is_mask: false,
1606
- description: Some("Image 1".to_string()),
1607
- ocr_result: None,
1608
- });
1609
-
1610
- let image2 = Arc::new(ExtractedImage {
1611
- data: vec![0x89, 0x50, 0x4E],
1612
- format: "png".to_string(),
1613
- image_index: 1,
1614
- page_number: Some(1),
1615
- width: Some(300),
1616
- height: Some(400),
1617
- colorspace: Some("RGBA".to_string()),
1618
- bits_per_component: Some(8),
1619
- is_mask: false,
1620
- description: Some("Image 2".to_string()),
1621
- ocr_result: None,
1622
- });
1623
-
1624
- let page = PageContent {
1625
- page_number: 1,
1626
- content: "Page with images".to_string(),
1627
- tables: Vec::new(),
1628
- images: vec![image1, image2],
1629
- hierarchy: None,
1630
- };
1631
-
1632
- let json = serde_json::to_string(&page).unwrap();
1633
- let deserialized: PageContent = serde_json::from_str(&json).unwrap();
1634
-
1635
- assert_eq!(deserialized.images.len(), 2);
1636
- assert_eq!(deserialized.images[0].format, "jpeg");
1637
- assert_eq!(deserialized.images[0].width, Some(100));
1638
- assert_eq!(deserialized.images[1].format, "png");
1639
- assert_eq!(deserialized.images[1].height, Some(400));
1640
- }
1641
-
1642
- #[test]
1643
- fn test_arc_sharing_loss_with_page_content() {
1644
- let shared_table = Arc::new(Table {
1645
- cells: vec![vec!["shared across pages".to_string()]],
1646
- markdown: "| shared across pages |".to_string(),
1647
- page_number: 0,
1648
- });
1649
-
1650
- let page1 = PageContent {
1651
- page_number: 1,
1652
- content: "Page 1".to_string(),
1653
- tables: vec![Arc::clone(&shared_table)],
1654
- images: Vec::new(),
1655
- hierarchy: None,
1656
- };
1657
-
1658
- let page2 = PageContent {
1659
- page_number: 2,
1660
- content: "Page 2".to_string(),
1661
- tables: vec![Arc::clone(&shared_table)],
1662
- images: Vec::new(),
1663
- hierarchy: None,
1664
- };
1665
-
1666
- assert!(Arc::ptr_eq(&page1.tables[0], &page2.tables[0]));
1667
-
1668
- let pages = vec![page1, page2];
1669
- let json = serde_json::to_string(&pages).unwrap();
1670
- let deserialized: Vec<PageContent> = serde_json::from_str(&json).unwrap();
1671
-
1672
- assert_eq!(deserialized.len(), 2);
1673
- assert_eq!(deserialized[0].tables[0].cells, deserialized[1].tables[0].cells);
1674
- assert!(!Arc::ptr_eq(&deserialized[0].tables[0], &deserialized[1].tables[0]));
1675
- }
1676
-
1677
- #[test]
1678
- fn test_empty_page_content_arcs() {
1679
- let page = PageContent {
1680
- page_number: 5,
1681
- content: "No tables or images".to_string(),
1682
- tables: Vec::new(),
1683
- images: Vec::new(),
1684
- hierarchy: None,
1685
- };
1686
-
1687
- let json = serde_json::to_string(&page).unwrap();
1688
- let deserialized: PageContent = serde_json::from_str(&json).unwrap();
1689
-
1690
- assert_eq!(deserialized.page_number, 5);
1691
- assert_eq!(deserialized.tables.len(), 0);
1692
- assert_eq!(deserialized.images.len(), 0);
1693
- }
1694
-
1695
- #[test]
1696
- fn test_serde_vec_arc_module_behavior() {
1697
- let table1 = Table {
1698
- cells: vec![vec!["A".to_string()]],
1699
- markdown: "| A |".to_string(),
1700
- page_number: 1,
1701
- };
1702
-
1703
- let table2 = Table {
1704
- cells: vec![vec!["B".to_string()]],
1705
- markdown: "| B |".to_string(),
1706
- page_number: 2,
1707
- };
1708
-
1709
- let json = serde_json::to_string(&vec![table1, table2]).unwrap();
1710
- assert!(json.contains("\"A\""));
1711
- assert!(json.contains("\"B\""));
1712
- }
1713
- }