kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,443 @@
1
+ //! Format-specific extraction results and OCR configuration types.
2
+
3
+ use serde::{Deserialize, Serialize};
4
+ use std::collections::HashMap;
5
+
6
+ use super::extraction::ExtractedImage;
7
+ use super::metadata::PptxMetadata;
8
+ use super::page::{PageContent, PageStructure};
9
+
10
+ /// Excel workbook representation.
11
+ ///
12
+ /// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
13
+ /// extracted content and metadata.
14
+ #[derive(Debug, Clone, Serialize, Deserialize)]
15
+ pub struct ExcelWorkbook {
16
+ /// All sheets in the workbook
17
+ pub sheets: Vec<ExcelSheet>,
18
+ /// Workbook-level metadata (author, creation date, etc.)
19
+ pub metadata: HashMap<String, String>,
20
+ }
21
+
22
+ /// Single Excel worksheet.
23
+ ///
24
+ /// Represents one sheet from an Excel workbook with its content
25
+ /// converted to Markdown format and dimensional statistics.
26
+ #[derive(Debug, Clone, Serialize, Deserialize)]
27
+ pub struct ExcelSheet {
28
+ /// Sheet name as it appears in Excel
29
+ pub name: String,
30
+ /// Sheet content converted to Markdown tables
31
+ pub markdown: String,
32
+ /// Number of rows
33
+ pub row_count: usize,
34
+ /// Number of columns
35
+ pub col_count: usize,
36
+ /// Total number of non-empty cells
37
+ pub cell_count: usize,
38
+ /// Pre-extracted table cells (2D vector of cell values)
39
+ /// Populated during markdown generation to avoid re-parsing markdown.
40
+ /// None for empty sheets.
41
+ #[serde(skip)]
42
+ pub table_cells: Option<Vec<Vec<String>>>,
43
+ }
44
+
45
+ /// XML extraction result.
46
+ ///
47
+ /// Contains extracted text content from XML files along with
48
+ /// structural statistics about the XML document.
49
+ #[derive(Debug, Clone, Serialize, Deserialize)]
50
+ pub struct XmlExtractionResult {
51
+ /// Extracted text content (XML structure filtered out)
52
+ pub content: String,
53
+ /// Total number of XML elements processed
54
+ pub element_count: usize,
55
+ /// List of unique element names found (sorted)
56
+ pub unique_elements: Vec<String>,
57
+ }
58
+
59
+ /// Plain text and Markdown extraction result.
60
+ ///
61
+ /// Contains the extracted text along with statistics and,
62
+ /// for Markdown files, structural elements like headers and links.
63
+ #[derive(Debug, Clone, Serialize, Deserialize)]
64
+ pub struct TextExtractionResult {
65
+ /// Extracted text content
66
+ pub content: String,
67
+ /// Number of lines
68
+ pub line_count: usize,
69
+ /// Number of words
70
+ pub word_count: usize,
71
+ /// Number of characters
72
+ pub character_count: usize,
73
+ /// Markdown headers (text only, Markdown files only)
74
+ #[serde(skip_serializing_if = "Option::is_none")]
75
+ pub headers: Option<Vec<String>>,
76
+ /// Markdown links as (text, URL) tuples (Markdown files only)
77
+ #[serde(skip_serializing_if = "Option::is_none")]
78
+ pub links: Option<Vec<(String, String)>>,
79
+ /// Code blocks as (language, code) tuples (Markdown files only)
80
+ #[serde(skip_serializing_if = "Option::is_none")]
81
+ pub code_blocks: Option<Vec<(String, String)>>,
82
+ }
83
+
84
+ /// PowerPoint (PPTX) extraction result.
85
+ ///
86
+ /// Contains extracted slide content, metadata, and embedded images/tables.
87
+ #[derive(Debug, Clone, Serialize, Deserialize)]
88
+ pub struct PptxExtractionResult {
89
+ /// Extracted text content from all slides
90
+ pub content: String,
91
+ /// Presentation metadata
92
+ pub metadata: PptxMetadata,
93
+ /// Total number of slides
94
+ pub slide_count: usize,
95
+ /// Total number of embedded images
96
+ pub image_count: usize,
97
+ /// Total number of tables
98
+ pub table_count: usize,
99
+ /// Extracted images from the presentation
100
+ pub images: Vec<ExtractedImage>,
101
+ /// Slide structure with boundaries (when page tracking is enabled)
102
+ #[serde(skip_serializing_if = "Option::is_none")]
103
+ pub page_structure: Option<PageStructure>,
104
+ /// Per-slide content (when page tracking is enabled)
105
+ #[serde(skip_serializing_if = "Option::is_none")]
106
+ pub page_contents: Option<Vec<PageContent>>,
107
+ }
108
+
109
+ /// Email extraction result.
110
+ ///
111
+ /// Complete representation of an extracted email message (.eml or .msg)
112
+ /// including headers, body content, and attachments.
113
+ #[derive(Debug, Clone, Serialize, Deserialize)]
114
+ pub struct EmailExtractionResult {
115
+ /// Email subject line
116
+ pub subject: Option<String>,
117
+ /// Sender email address
118
+ pub from_email: Option<String>,
119
+ /// Primary recipient email addresses
120
+ pub to_emails: Vec<String>,
121
+ /// CC recipient email addresses
122
+ pub cc_emails: Vec<String>,
123
+ /// BCC recipient email addresses
124
+ pub bcc_emails: Vec<String>,
125
+ /// Email date/timestamp
126
+ pub date: Option<String>,
127
+ /// Message-ID header value
128
+ pub message_id: Option<String>,
129
+ /// Plain text version of the email body
130
+ pub plain_text: Option<String>,
131
+ /// HTML version of the email body
132
+ pub html_content: Option<String>,
133
+ /// Cleaned/processed text content
134
+ pub cleaned_text: String,
135
+ /// List of email attachments
136
+ pub attachments: Vec<EmailAttachment>,
137
+ /// Additional email headers and metadata
138
+ pub metadata: HashMap<String, String>,
139
+ }
140
+
141
+ /// Email attachment representation.
142
+ ///
143
+ /// Contains metadata and optionally the content of an email attachment.
144
+ #[derive(Debug, Clone, Serialize, Deserialize)]
145
+ pub struct EmailAttachment {
146
+ /// Attachment name (from Content-Disposition header)
147
+ pub name: Option<String>,
148
+ /// Filename of the attachment
149
+ pub filename: Option<String>,
150
+ /// MIME type of the attachment
151
+ pub mime_type: Option<String>,
152
+ /// Size in bytes
153
+ pub size: Option<usize>,
154
+ /// Whether this attachment is an image
155
+ pub is_image: bool,
156
+ /// Attachment data (if extracted)
157
+ pub data: Option<Vec<u8>>,
158
+ }
159
+
160
+ /// OCR extraction result.
161
+ ///
162
+ /// Result of performing OCR on an image or scanned document,
163
+ /// including recognized text and detected tables.
164
+ #[derive(Debug, Clone, Serialize, Deserialize)]
165
+ pub struct OcrExtractionResult {
166
+ /// Recognized text content
167
+ pub content: String,
168
+ /// Original MIME type of the processed image
169
+ pub mime_type: String,
170
+ /// OCR processing metadata (confidence scores, language, etc.)
171
+ pub metadata: HashMap<String, serde_json::Value>,
172
+ /// Tables detected and extracted via OCR
173
+ pub tables: Vec<OcrTable>,
174
+ }
175
+
176
+ /// Table detected via OCR.
177
+ ///
178
+ /// Represents a table structure recognized during OCR processing.
179
+ #[derive(Debug, Clone, Serialize, Deserialize)]
180
+ pub struct OcrTable {
181
+ /// Table cells as a 2D vector (rows × columns)
182
+ pub cells: Vec<Vec<String>>,
183
+ /// Markdown representation of the table
184
+ pub markdown: String,
185
+ /// Page number where the table was found (1-indexed)
186
+ pub page_number: usize,
187
+ }
188
+
189
+ /// Image preprocessing configuration for OCR.
190
+ ///
191
+ /// These settings control how images are preprocessed before OCR to improve
192
+ /// text recognition quality. Different preprocessing strategies work better
193
+ /// for different document types.
194
+ #[derive(Debug, Clone, Serialize, Deserialize)]
195
+ #[serde(default)]
196
+ pub struct ImagePreprocessingConfig {
197
+ /// Target DPI for the image (300 is standard, 600 for small text).
198
+ pub target_dpi: i32,
199
+
200
+ /// Auto-detect and correct image rotation.
201
+ pub auto_rotate: bool,
202
+
203
+ /// Correct skew (tilted images).
204
+ pub deskew: bool,
205
+
206
+ /// Remove noise from the image.
207
+ pub denoise: bool,
208
+
209
+ /// Enhance contrast for better text visibility.
210
+ pub contrast_enhance: bool,
211
+
212
+ /// Binarization method: "otsu", "sauvola", "adaptive".
213
+ pub binarization_method: String,
214
+
215
+ /// Invert colors (white text on black → black on white).
216
+ pub invert_colors: bool,
217
+ }
218
+
219
+ impl Default for ImagePreprocessingConfig {
220
+ fn default() -> Self {
221
+ Self {
222
+ target_dpi: 300,
223
+ auto_rotate: true,
224
+ deskew: true,
225
+ denoise: false,
226
+ contrast_enhance: false,
227
+ binarization_method: "otsu".to_string(),
228
+ invert_colors: false,
229
+ }
230
+ }
231
+ }
232
+
233
+ /// Tesseract OCR configuration.
234
+ ///
235
+ /// Provides fine-grained control over Tesseract OCR engine parameters.
236
+ /// Most users can use the defaults, but these settings allow optimization
237
+ /// for specific document types (invoices, handwriting, etc.).
238
+ #[derive(Debug, Clone, Serialize, Deserialize)]
239
+ #[serde(default)]
240
+ pub struct TesseractConfig {
241
+ /// Language code (e.g., "eng", "deu", "fra")
242
+ pub language: String,
243
+
244
+ /// Page Segmentation Mode (0-13).
245
+ ///
246
+ /// Common values:
247
+ /// - 3: Fully automatic page segmentation (default)
248
+ /// - 6: Assume a single uniform block of text
249
+ /// - 11: Sparse text with no particular order
250
+ pub psm: i32,
251
+
252
+ /// Output format ("text" or "markdown")
253
+ pub output_format: String,
254
+
255
+ /// OCR Engine Mode (0-3).
256
+ ///
257
+ /// - 0: Legacy engine only
258
+ /// - 1: Neural nets (LSTM) only (usually best)
259
+ /// - 2: Legacy + LSTM
260
+ /// - 3: Default (based on what's available)
261
+ pub oem: i32,
262
+
263
+ /// Minimum confidence threshold (0.0-100.0).
264
+ ///
265
+ /// Words with confidence below this threshold may be rejected or flagged.
266
+ pub min_confidence: f64,
267
+
268
+ /// Image preprocessing configuration.
269
+ ///
270
+ /// Controls how images are preprocessed before OCR. Can significantly
271
+ /// improve quality for scanned documents or low-quality images.
272
+ #[serde(skip_serializing_if = "Option::is_none")]
273
+ pub preprocessing: Option<ImagePreprocessingConfig>,
274
+
275
+ /// Enable automatic table detection and reconstruction
276
+ pub enable_table_detection: bool,
277
+
278
+ /// Minimum confidence threshold for table detection (0.0-1.0)
279
+ pub table_min_confidence: f64,
280
+
281
+ /// Column threshold for table detection (pixels)
282
+ pub table_column_threshold: i32,
283
+
284
+ /// Row threshold ratio for table detection (0.0-1.0)
285
+ pub table_row_threshold_ratio: f64,
286
+
287
+ /// Enable OCR result caching
288
+ pub use_cache: bool,
289
+
290
+ /// Use pre-adapted templates for character classification
291
+ pub classify_use_pre_adapted_templates: bool,
292
+
293
+ /// Enable N-gram language model
294
+ pub language_model_ngram_on: bool,
295
+
296
+ /// Don't reject good words during block-level processing
297
+ pub tessedit_dont_blkrej_good_wds: bool,
298
+
299
+ /// Don't reject good words during row-level processing
300
+ pub tessedit_dont_rowrej_good_wds: bool,
301
+
302
+ /// Enable dictionary correction
303
+ pub tessedit_enable_dict_correction: bool,
304
+
305
+ /// Whitelist of allowed characters (empty = all allowed)
306
+ pub tessedit_char_whitelist: String,
307
+
308
+ /// Blacklist of forbidden characters (empty = none forbidden)
309
+ pub tessedit_char_blacklist: String,
310
+
311
+ /// Use primary language params model
312
+ pub tessedit_use_primary_params_model: bool,
313
+
314
+ /// Variable-width space detection
315
+ pub textord_space_size_is_variable: bool,
316
+
317
+ /// Use adaptive thresholding method
318
+ pub thresholding_method: bool,
319
+ }
320
+
321
+ impl Default for TesseractConfig { // Supplies the baseline Tesseract OCR settings.
322
+ fn default() -> Self { // Baseline: eng language, markdown output, table detection and result caching enabled.
323
+ Self {
324
+ language: "eng".to_string(),
325
+ psm: 3, // NOTE(review): presumably Tesseract page segmentation mode 3 (fully automatic) - confirm against Tesseract docs
326
+ output_format: "markdown".to_string(),
327
+ oem: 3, // NOTE(review): presumably Tesseract OCR engine mode 3 (default engine selection) - confirm against Tesseract docs
328
+ min_confidence: 0.0, // NOTE(review): presumably means no result is dropped for low confidence - confirm
329
+ preprocessing: None, // no image preprocessing config is supplied by default
330
+ enable_table_detection: true,
331
+ table_min_confidence: 0.0, // bottom of the documented 0.0-1.0 range
332
+ table_column_threshold: 50, // pixels, per the field documentation
333
+ table_row_threshold_ratio: 0.5, // midpoint of the documented 0.0-1.0 range
334
+ use_cache: true,
335
+ classify_use_pre_adapted_templates: true,
336
+ language_model_ngram_on: false, // N-gram language model is off by default
337
+ tessedit_dont_blkrej_good_wds: true,
338
+ tessedit_dont_rowrej_good_wds: true,
339
+ tessedit_enable_dict_correction: true,
340
+ tessedit_char_whitelist: String::new(), // empty = all characters allowed
341
+ tessedit_char_blacklist: String::new(), // empty = no characters forbidden
342
+ tessedit_use_primary_params_model: true,
343
+ textord_space_size_is_variable: true,
344
+ thresholding_method: false, // adaptive thresholding is off by default
345
+ }
346
+ }
347
+ }
348
+
349
+ /// Image preprocessing metadata.
350
+ ///
351
+ /// Records what happened to an image during OCR preprocessing: the input
352
+ /// geometry and DPI, the DPI decisions taken, and the outcome of any resize.
353
+ #[derive(Debug, Clone, Serialize, Deserialize)]
354
+ pub struct ImagePreprocessingMetadata {
355
+ /// Original image dimensions as `(width, height)`, in pixels.
356
+ pub original_dimensions: (usize, usize),
357
+ /// Original image DPI as `(horizontal, vertical)`.
358
+ pub original_dpi: (f64, f64),
359
+ /// Target DPI from configuration. NOTE(review): presumably `ExtractionConfig::target_dpi` - confirm.
360
+ pub target_dpi: i32,
361
+ /// Scaling factor applied to the image.
362
+ pub scale_factor: f64,
363
+ /// Whether the DPI was auto-adjusted based on content.
364
+ pub auto_adjusted: bool,
365
+ /// Final DPI after processing.
366
+ pub final_dpi: i32,
367
+ /// Dimensions after resizing; `None` if the image was not resized. NOTE(review): presumably `(width, height)` like `original_dimensions` - confirm.
368
+ pub new_dimensions: Option<(usize, usize)>,
369
+ /// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
370
+ pub resample_method: String,
371
+ /// Whether dimensions were clamped to `max_image_dimension`.
372
+ pub dimension_clamped: bool,
373
+ /// Calculated optimal DPI; only populated if `auto_adjust_dpi` is enabled.
374
+ pub calculated_dpi: Option<i32>,
375
+ /// Whether the resize was skipped because the dimensions were already optimal.
376
+ pub skipped_resize: bool,
377
+ /// Error message if the resize failed. NOTE(review): `None` presumably means no failure was recorded - confirm.
378
+ pub resize_error: Option<String>,
379
+ }
380
+
381
+ /// DPI and size limits for image preprocessing (internal use).
382
+ ///
383
+ /// **Note:** This internal type shares its name with the main extraction configuration but is a separate struct.
384
+ /// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
385
+ #[derive(Debug, Clone, Serialize, Deserialize)]
386
+ pub struct ExtractionConfig {
387
+ /// Target DPI for image normalization (default: 300).
388
+ pub target_dpi: i32,
389
+ /// Maximum image dimension, applied to width or height (default: 4096). NOTE(review): presumably pixels - confirm.
390
+ pub max_image_dimension: i32,
391
+ /// Whether to auto-adjust DPI based on content (default: true).
392
+ pub auto_adjust_dpi: bool,
393
+ /// Minimum DPI threshold (default: 72).
394
+ pub min_dpi: i32,
395
+ /// Maximum DPI threshold (default: 600).
396
+ pub max_dpi: i32,
397
+ }
398
+
399
+ impl Default for ExtractionConfig { // Supplies the baseline image preprocessing limits.
400
+ fn default() -> Self { // The default target DPI (300) lies between min_dpi (72) and max_dpi (600).
401
+ Self {
402
+ target_dpi: 300, // DPI that images are normalized to
403
+ max_image_dimension: 4096, // cap applied to width or height
404
+ auto_adjust_dpi: true, // content-based DPI adjustment is on by default
405
+ min_dpi: 72, // minimum DPI threshold
406
+ max_dpi: 600, // maximum DPI threshold
407
+ }
408
+ }
409
+ }
410
+
411
+ /// Cache statistics.
412
+ ///
413
+ /// Summarizes the extraction result cache: file count, total size,
414
+ /// available disk space, and the ages of the oldest and newest entries.
415
+ #[derive(Debug, Clone, Serialize, Deserialize)]
416
+ pub struct CacheStats {
417
+ /// Total number of cached files.
418
+ pub total_files: usize,
419
+ /// Total cache size in megabytes (fractional).
420
+ pub total_size_mb: f64,
421
+ /// Available disk space in megabytes (fractional).
422
+ pub available_space_mb: f64,
423
+ /// Age of the oldest cached file in days (fractional).
424
+ pub oldest_file_age_days: f64,
425
+ /// Age of the newest cached file in days (fractional).
426
+ pub newest_file_age_days: f64,
427
+ }
428
+
429
+ /// LibreOffice conversion result.
430
+ ///
431
+ /// Holds the output of converting a legacy office document (e.g., .doc, .ppt)
432
+ /// to a modern format using LibreOffice, plus the formats involved.
433
+ #[derive(Debug, Clone, Serialize, Deserialize)]
434
+ pub struct LibreOfficeConversionResult {
435
+ /// Bytes of the converted file, held in memory as a byte vector.
436
+ pub converted_bytes: Vec<u8>,
437
+ /// Original format identifier. NOTE(review): presumably an extension-style name such as doc - confirm.
438
+ pub original_format: String,
439
+ /// Target format identifier. NOTE(review): presumably uses the same convention as `original_format` - confirm.
440
+ pub target_format: String,
441
+ /// MIME type of the converted output.
442
+ pub target_mime: String,
443
+ }