kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,560 @@
1
+ //! Metadata types for extraction results.
2
+ //!
3
+ //! This module defines metadata structures for various document formats.
4
+
5
+ use serde::{Deserialize, Serialize};
6
+ use std::collections::{BTreeMap, HashMap};
7
+
8
+ #[cfg(feature = "pdf")]
9
+ use crate::pdf::metadata::PdfMetadata;
10
+
11
+ use super::formats::ImagePreprocessingMetadata;
12
+ use super::page::PageStructure;
13
+
14
+ /// Format-specific metadata (discriminated union).
15
+ ///
16
+ /// Only one format type can exist per extraction result. This provides
17
+ /// type-safe, clean metadata without nested optionals. Serde encodes it as an internally tagged enum: a `format_type` field holds the snake_case variant name. The `Pdf` variant exists only when the `pdf` feature is enabled.
18
+ #[derive(Debug, Clone, Serialize, Deserialize)]
19
+ #[serde(tag = "format_type", rename_all = "snake_case")]
20
+ pub enum FormatMetadata {
21
+ #[cfg(feature = "pdf")]
22
+ Pdf(PdfMetadata),
23
+ Excel(ExcelMetadata),
24
+ Email(EmailMetadata),
25
+ Pptx(PptxMetadata),
26
+ Archive(ArchiveMetadata),
27
+ Image(ImageMetadata),
28
+ Xml(XmlMetadata),
29
+ Text(TextMetadata),
30
+ Html(Box<HtmlMetadata>),
31
+ Ocr(OcrMetadata),
32
+ }
33
+
34
+ /// Extraction result metadata.
35
+ ///
36
+ /// Contains common fields applicable to all formats, format-specific metadata
37
+ /// via a discriminated union, and additional custom fields from postprocessors. Optional fields that are `None` are omitted from serialized output.
38
+ #[derive(Debug, Clone, Serialize, Deserialize, Default)]
39
+ pub struct Metadata {
40
+ /// Document title
41
+ #[serde(skip_serializing_if = "Option::is_none")]
42
+ pub title: Option<String>,
43
+
44
+ /// Document subject or description
45
+ #[serde(skip_serializing_if = "Option::is_none")]
46
+ pub subject: Option<String>,
47
+
48
+ /// Primary author(s), always a list (even for a single author)
49
+ #[serde(skip_serializing_if = "Option::is_none")]
50
+ pub authors: Option<Vec<String>>,
51
+
52
+ /// Keywords/tags, always a list
53
+ #[serde(skip_serializing_if = "Option::is_none")]
54
+ pub keywords: Option<Vec<String>>,
55
+
56
+ /// Primary language (ISO 639 code)
57
+ #[serde(skip_serializing_if = "Option::is_none")]
58
+ pub language: Option<String>,
59
+
60
+ /// Creation timestamp (ISO 8601 format)
61
+ #[serde(skip_serializing_if = "Option::is_none")]
62
+ pub created_at: Option<String>,
63
+
64
+ /// Last modification timestamp (ISO 8601 format)
65
+ #[serde(skip_serializing_if = "Option::is_none")]
66
+ pub modified_at: Option<String>,
67
+
68
+ /// User who created the document
69
+ #[serde(skip_serializing_if = "Option::is_none")]
70
+ pub created_by: Option<String>,
71
+
72
+ /// User who last modified the document
73
+ #[serde(skip_serializing_if = "Option::is_none")]
74
+ pub modified_by: Option<String>,
75
+
76
+ /// Page/slide/sheet structure with boundaries
77
+ #[serde(skip_serializing_if = "Option::is_none")]
78
+ pub pages: Option<PageStructure>,
79
+
80
+ /// Format-specific metadata (discriminated union)
81
+ ///
82
+ /// Contains detailed metadata specific to the document format.
83
+ /// Flattened into this object, so it serializes with a `format_type` discriminator field at the root level.
84
+ #[serde(flatten, skip_serializing_if = "Option::is_none")]
85
+ pub format: Option<FormatMetadata>,
86
+
87
+ /// Image preprocessing metadata (when OCR preprocessing was applied)
88
+ #[serde(skip_serializing_if = "Option::is_none")]
89
+ pub image_preprocessing: Option<ImagePreprocessingMetadata>,
90
+
91
+ /// JSON schema (for structured data extraction)
92
+ #[serde(skip_serializing_if = "Option::is_none")]
93
+ pub json_schema: Option<serde_json::Value>,
94
+
95
+ /// Error metadata (for batch operations)
96
+ #[serde(skip_serializing_if = "Option::is_none")]
97
+ pub error: Option<ErrorMetadata>,
98
+
99
+ /// Additional custom fields from postprocessors.
100
+ ///
101
+ /// This flattened HashMap allows Python/TypeScript postprocessors to add
102
+ /// arbitrary fields (entity extraction, keyword extraction, etc.).
103
+ /// Entries are merged at the root level of the serialized object, alongside the named fields.
104
+ #[serde(flatten)]
105
+ pub additional: HashMap<String, serde_json::Value>,
106
+ }
107
+
108
+ /// Excel/spreadsheet metadata.
109
+ ///
110
+ /// Contains information about sheets in Excel, LibreOffice Calc, and other
111
+ /// spreadsheet formats (.xlsx, .xls, .ods, etc.).
112
+ #[derive(Debug, Clone, Serialize, Deserialize)]
113
+ pub struct ExcelMetadata {
114
+ /// Total number of sheets in the workbook (presumably equal to `sheet_names.len()` - TODO confirm against the extractor)
115
+ pub sheet_count: usize,
116
+ /// Names of all sheets, one entry per sheet, in order
117
+ pub sheet_names: Vec<String>,
118
+ }
119
+
120
+ /// Email metadata extracted from .eml and .msg files.
121
+ ///
122
+ /// Includes sender/recipient information, message ID, and attachment list. The recipient and attachment lists are always serialized; the optional fields are omitted when `None`.
123
+ #[derive(Debug, Clone, Serialize, Deserialize)]
124
+ pub struct EmailMetadata {
125
+ /// Sender's email address
126
+ #[serde(skip_serializing_if = "Option::is_none")]
127
+ pub from_email: Option<String>,
128
+
129
+ /// Sender's display name
130
+ #[serde(skip_serializing_if = "Option::is_none")]
131
+ pub from_name: Option<String>,
132
+
133
+ /// Primary ("To") recipients
134
+ pub to_emails: Vec<String>,
135
+ /// CC recipients
136
+ pub cc_emails: Vec<String>,
137
+ /// BCC recipients
138
+ pub bcc_emails: Vec<String>,
139
+
140
+ /// Message-ID header value
141
+ #[serde(skip_serializing_if = "Option::is_none")]
142
+ pub message_id: Option<String>,
143
+
144
+ /// List of attachment filenames
145
+ pub attachments: Vec<String>,
146
+ }
147
+
148
+ /// Archive (ZIP/TAR/7Z) metadata.
149
+ ///
150
+ /// Describes the contents of a compressed archive file: its format, file list, and size information.
151
+ #[derive(Debug, Clone, Serialize, Deserialize)]
152
+ pub struct ArchiveMetadata {
153
+ /// Archive format ("ZIP", "TAR", "7Z", etc.)
154
+ pub format: String,
155
+ /// Total number of files in the archive
156
+ pub file_count: usize,
157
+ /// List of file paths within the archive
158
+ pub file_list: Vec<String>,
159
+ /// Total uncompressed size in bytes
160
+ pub total_size: usize,
161
+
162
+ /// Compressed size in bytes (if available; omitted from serialized output when `None`)
163
+ #[serde(skip_serializing_if = "Option::is_none")]
164
+ pub compressed_size: Option<usize>,
165
+ }
166
+
167
+ /// Image metadata extracted from image files.
168
+ ///
169
+ /// Includes pixel dimensions, format name, and EXIF data.
170
+ #[derive(Debug, Clone, Serialize, Deserialize)]
171
+ pub struct ImageMetadata {
172
+ /// Image width in pixels
173
+ pub width: u32,
174
+ /// Image height in pixels
175
+ pub height: u32,
176
+ /// Image format (e.g., "PNG", "JPEG", "TIFF")
177
+ pub format: String,
178
+ /// EXIF metadata tags as string key/value pairs
179
+ pub exif: HashMap<String, String>,
180
+ }
181
+
182
+ /// XML metadata extracted during XML parsing.
183
+ ///
184
+ /// Provides summary statistics about the structure of an XML document.
185
+ #[derive(Debug, Clone, Serialize, Deserialize)]
186
+ pub struct XmlMetadata {
187
+ /// Total number of XML elements processed
188
+ pub element_count: usize,
189
+ /// List of unique element tag names (sorted; NOTE(review): ordering is produced by the extractor, not enforced by this type)
190
+ pub unique_elements: Vec<String>,
191
+ }
192
+
193
+ /// Text/Markdown metadata.
194
+ ///
195
+ /// Extracted from plain text and Markdown files. Includes word counts and,
196
+ /// for Markdown, structural elements like headers and links. The Markdown-specific fields are optional and omitted from serialized output when `None`.
197
+ #[derive(Debug, Clone, Serialize, Deserialize)]
198
+ pub struct TextMetadata {
199
+ /// Number of lines in the document
200
+ pub line_count: usize,
201
+ /// Number of words
202
+ pub word_count: usize,
203
+ /// Number of characters
204
+ pub character_count: usize,
205
+
206
+ /// Markdown headings (heading text only; for Markdown files)
207
+ #[serde(skip_serializing_if = "Option::is_none")]
208
+ pub headers: Option<Vec<String>>,
209
+
210
+ /// Markdown links as (text, url) tuples (for Markdown files)
211
+ #[serde(skip_serializing_if = "Option::is_none")]
212
+ pub links: Option<Vec<(String, String)>>,
213
+
214
+ /// Code blocks as (language, code) tuples (for Markdown files)
215
+ #[serde(skip_serializing_if = "Option::is_none")]
216
+ pub code_blocks: Option<Vec<(String, String)>>,
217
+ }
218
+
219
+ /// Text direction enumeration for HTML documents.
220
+ #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
221
+ #[serde(rename_all = "lowercase")]
222
+ pub enum TextDirection {
223
+ /// Left-to-right text direction (serialized as `ltr`)
224
+ #[serde(rename = "ltr")]
225
+ LeftToRight,
226
+ /// Right-to-left text direction (serialized as `rtl`)
227
+ #[serde(rename = "rtl")]
228
+ RightToLeft,
229
+ /// Automatic text direction detection (serialized as `auto`)
230
+ #[serde(rename = "auto")]
231
+ Auto,
232
+ }
233
+
234
+ /// Metadata for a single header/heading element (h1 through h6).
235
+ #[derive(Debug, Clone, Serialize, Deserialize)]
236
+ pub struct HeaderMetadata {
237
+ /// Header level: 1 (h1) through 6 (h6)
238
+ pub level: u8,
239
+ /// Normalized text content of the header
240
+ pub text: String,
241
+ /// HTML id attribute if present (omitted from serialized output when `None`)
242
+ #[serde(skip_serializing_if = "Option::is_none")]
243
+ pub id: Option<String>,
244
+ /// Document tree depth at the header element
245
+ pub depth: usize,
246
+ /// Byte offset in original HTML document
247
+ pub html_offset: usize,
248
+ }
249
+
250
+ /// Metadata for a single link element.
251
+ #[derive(Debug, Clone, Serialize, Deserialize)]
252
+ pub struct LinkMetadata {
253
+ /// The href URL value
254
+ pub href: String,
255
+ /// Link text content (normalized)
256
+ pub text: String,
257
+ /// Optional title attribute (omitted from serialized output when `None`)
258
+ #[serde(skip_serializing_if = "Option::is_none")]
259
+ pub title: Option<String>,
260
+ /// Link type classification
261
+ pub link_type: LinkType,
262
+ /// Values of the `rel` attribute
263
+ pub rel: Vec<String>,
264
+ /// Additional attributes as key-value pairs
265
+ pub attributes: HashMap<String, String>,
266
+ }
267
+
268
+ /// Link type classification; variants serialize as their lowercase names (e.g. `anchor`, `external`).
269
+ #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
270
+ #[serde(rename_all = "lowercase")]
271
+ pub enum LinkType {
272
+ /// Anchor link (#section)
273
+ Anchor,
274
+ /// Internal link (same domain)
275
+ Internal,
276
+ /// External link (different domain)
277
+ External,
278
+ /// Email link (mailto:)
279
+ Email,
280
+ /// Phone link (tel:)
281
+ Phone,
282
+ /// Other link type
283
+ Other,
284
+ }
285
+
286
+ /// Metadata for a single image element.
287
+ #[derive(Debug, Clone, Serialize, Deserialize)]
288
+ pub struct ImageMetadataType {
289
+ /// Image source (URL, data URI, or SVG content)
290
+ pub src: String,
291
+ /// Alternative text from alt attribute
292
+ #[serde(skip_serializing_if = "Option::is_none")]
293
+ pub alt: Option<String>,
294
+ /// Title attribute
295
+ #[serde(skip_serializing_if = "Option::is_none")]
296
+ pub title: Option<String>,
297
+ /// Image dimensions as (width, height) if available. NOTE(review): unlike `alt`/`title` there is no `skip_serializing_if`, so `None` is still emitted when serializing - confirm this is intentional.
298
+ pub dimensions: Option<(u32, u32)>,
299
+ /// Image type classification
300
+ pub image_type: ImageType,
301
+ /// Additional attributes as key-value pairs
302
+ pub attributes: HashMap<String, String>,
303
+ }
304
+
305
+ /// Image type classification.
305
+ #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
306
+ #[serde(rename_all = "lowercase")]
307
+ pub enum ImageType {
308
+ /// Data URI image (serialized as `data-uri`)
309
+ #[serde(rename = "data-uri")]
310
+ DataUri,
311
+ /// Inline SVG (serialized as `inline-svg`)
312
+ #[serde(rename = "inline-svg")]
313
+ InlineSvg,
314
+ /// External image URL (serialized as `external`)
315
+ External,
316
+ /// Relative path image (serialized as `relative`)
317
+ Relative,
318
+ }
320
+
321
+ /// A single structured data block (JSON-LD, microdata, or RDFa; e.g. Schema.org markup).
322
+ #[derive(Debug, Clone, Serialize, Deserialize)]
323
+ pub struct StructuredData {
324
+ /// Type of structured data
325
+ pub data_type: StructuredDataType,
326
+ /// Raw JSON string representation (kept as an unparsed string)
327
+ pub raw_json: String,
328
+ /// Schema type if detectable (e.g., "Article", "Event", "Product")
329
+ #[serde(skip_serializing_if = "Option::is_none")]
330
+ pub schema_type: Option<String>,
331
+ }
332
+
333
+ /// Structured data type classification.
334
+ #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
335
+ #[serde(rename_all = "lowercase")]
336
+ pub enum StructuredDataType {
337
+ /// JSON-LD structured data (serialized as `json-ld`)
338
+ #[serde(rename = "json-ld")]
339
+ JsonLd,
340
+ /// Microdata (serialized as `microdata`)
341
+ Microdata,
342
+ /// RDFa (serialized as `rdfa`)
343
+ #[serde(rename = "rdfa")]
344
+ RDFa,
345
+ }
346
+
347
+ /// HTML metadata extracted from HTML documents.
348
+ ///
349
+ /// Includes document-level metadata, Open Graph data, Twitter Card metadata,
350
+ /// and extracted structural elements (headers, links, images, structured data).
351
+ #[derive(Debug, Clone, Serialize, Deserialize, Default)]
352
+ pub struct HtmlMetadata {
353
+ /// Document title from `<title>` tag
354
+ #[serde(skip_serializing_if = "Option::is_none")]
355
+ pub title: Option<String>,
356
+
357
+ /// Document description from `<meta name="description">` tag
358
+ #[serde(skip_serializing_if = "Option::is_none")]
359
+ pub description: Option<String>,
360
+
361
+ /// Document keywords from `<meta name="keywords">` tag, split on commas
362
+ #[serde(default)]
363
+ pub keywords: Vec<String>,
364
+
365
+ /// Document author from `<meta name="author">` tag
366
+ #[serde(skip_serializing_if = "Option::is_none")]
367
+ pub author: Option<String>,
368
+
369
+ /// Canonical URL from `<link rel="canonical">` tag
370
+ #[serde(skip_serializing_if = "Option::is_none")]
371
+ pub canonical_url: Option<String>,
372
+
373
+ /// Base URL from `<base href="">` tag for resolving relative URLs
374
+ #[serde(skip_serializing_if = "Option::is_none")]
375
+ pub base_href: Option<String>,
376
+
377
+ /// Document language from `lang` attribute
378
+ #[serde(skip_serializing_if = "Option::is_none")]
379
+ pub language: Option<String>,
380
+
381
+ /// Document text direction from `dir` attribute
382
+ #[serde(skip_serializing_if = "Option::is_none")]
383
+ pub text_direction: Option<TextDirection>,
384
+
385
+ /// Open Graph metadata (og:* properties) for social media
386
+ /// Keys like "title", "description", "image", "url", etc.
387
+ #[serde(default)]
388
+ pub open_graph: BTreeMap<String, String>,
389
+
390
+ /// Twitter Card metadata (twitter:* properties)
391
+ /// Keys like "card", "site", "creator", "title", "description", "image", etc.
392
+ #[serde(default)]
393
+ pub twitter_card: BTreeMap<String, String>,
394
+
395
+ /// Additional meta tags not covered by specific fields
396
+ /// Keys are meta name/property attributes, values are content
397
+ #[serde(default)]
398
+ pub meta_tags: BTreeMap<String, String>,
399
+
400
+ /// Extracted header elements with hierarchy
401
+ #[serde(default)]
402
+ pub headers: Vec<HeaderMetadata>,
403
+
404
+ /// Extracted hyperlinks with type classification
405
+ #[serde(default)]
406
+ pub links: Vec<LinkMetadata>,
407
+
408
+ /// Extracted images with source and dimensions
409
+ #[serde(default)]
410
+ pub images: Vec<ImageMetadataType>,
411
+
412
+ /// Extracted structured data blocks
413
+ #[serde(default)]
414
+ pub structured_data: Vec<StructuredData>,
415
+ }
416
+
417
+ impl HtmlMetadata {
418
+ /// Check if metadata is empty (no meaningful content extracted).
419
+ pub fn is_empty(&self) -> bool {
420
+ self.title.is_none()
421
+ && self.description.is_none()
422
+ && self.keywords.is_empty()
423
+ && self.author.is_none()
424
+ && self.canonical_url.is_none()
425
+ && self.base_href.is_none()
426
+ && self.language.is_none()
427
+ && self.text_direction.is_none()
428
+ && self.open_graph.is_empty()
429
+ && self.twitter_card.is_empty()
430
+ && self.meta_tags.is_empty()
431
+ && self.headers.is_empty()
432
+ && self.links.is_empty()
433
+ && self.images.is_empty()
434
+ && self.structured_data.is_empty()
435
+ }
436
+ }
437
+
438
#[cfg(feature = "html")]
impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
    /// Convert the `html_to_markdown_rs` extended metadata into this crate's
    /// [`HtmlMetadata`], mapping each upstream enum variant to the local variant
    /// of the same name and moving every other field across unchanged.
    fn from(metadata: html_to_markdown_rs::ExtendedMetadata) -> Self {
        /// Map an upstream text direction to the local enum.
        fn direction_of(dir: html_to_markdown_rs::TextDirection) -> TextDirection {
            match dir {
                html_to_markdown_rs::TextDirection::LeftToRight => TextDirection::LeftToRight,
                html_to_markdown_rs::TextDirection::RightToLeft => TextDirection::RightToLeft,
                html_to_markdown_rs::TextDirection::Auto => TextDirection::Auto,
            }
        }

        /// Map an upstream link type to the local enum.
        fn link_type_of(kind: html_to_markdown_rs::LinkType) -> LinkType {
            match kind {
                html_to_markdown_rs::LinkType::Anchor => LinkType::Anchor,
                html_to_markdown_rs::LinkType::Internal => LinkType::Internal,
                html_to_markdown_rs::LinkType::External => LinkType::External,
                html_to_markdown_rs::LinkType::Email => LinkType::Email,
                html_to_markdown_rs::LinkType::Phone => LinkType::Phone,
                html_to_markdown_rs::LinkType::Other => LinkType::Other,
            }
        }

        /// Map an upstream image type to the local enum.
        fn image_type_of(kind: html_to_markdown_rs::ImageType) -> ImageType {
            match kind {
                html_to_markdown_rs::ImageType::DataUri => ImageType::DataUri,
                html_to_markdown_rs::ImageType::InlineSvg => ImageType::InlineSvg,
                html_to_markdown_rs::ImageType::External => ImageType::External,
                html_to_markdown_rs::ImageType::Relative => ImageType::Relative,
            }
        }

        /// Map an upstream structured-data type to the local enum.
        fn structured_type_of(kind: html_to_markdown_rs::StructuredDataType) -> StructuredDataType {
            match kind {
                html_to_markdown_rs::StructuredDataType::JsonLd => StructuredDataType::JsonLd,
                html_to_markdown_rs::StructuredDataType::Microdata => StructuredDataType::Microdata,
                html_to_markdown_rs::StructuredDataType::RDFa => StructuredDataType::RDFa,
            }
        }

        let document = metadata.document;

        let headers: Vec<HeaderMetadata> = metadata
            .headers
            .into_iter()
            .map(|header| HeaderMetadata {
                level: header.level,
                text: header.text,
                id: header.id,
                depth: header.depth,
                html_offset: header.html_offset,
            })
            .collect();

        let links: Vec<LinkMetadata> = metadata
            .links
            .into_iter()
            .map(|link| LinkMetadata {
                href: link.href,
                text: link.text,
                title: link.title,
                link_type: link_type_of(link.link_type),
                rel: link.rel,
                // Re-collect into the attribute container used by the local type.
                attributes: link.attributes.into_iter().collect(),
            })
            .collect();

        let images: Vec<ImageMetadataType> = metadata
            .images
            .into_iter()
            .map(|image| ImageMetadataType {
                src: image.src,
                alt: image.alt,
                title: image.title,
                dimensions: image.dimensions,
                image_type: image_type_of(image.image_type),
                // Re-collect into the attribute container used by the local type.
                attributes: image.attributes.into_iter().collect(),
            })
            .collect();

        let structured_data: Vec<StructuredData> = metadata
            .structured_data
            .into_iter()
            .map(|block| StructuredData {
                data_type: structured_type_of(block.data_type),
                raw_json: block.raw_json,
                schema_type: block.schema_type,
            })
            .collect();

        HtmlMetadata {
            title: document.title,
            description: document.description,
            keywords: document.keywords,
            author: document.author,
            canonical_url: document.canonical_url,
            base_href: document.base_href,
            language: document.language,
            text_direction: document.text_direction.map(direction_of),
            open_graph: document.open_graph,
            twitter_card: document.twitter_card,
            meta_tags: document.meta_tags,
            headers,
            links,
            images,
            structured_data,
        }
    }
}
522
+
523
+ /// OCR processing metadata.
524
+ ///
525
+ /// Captures information about OCR processing configuration and results.
526
+ #[derive(Debug, Clone, Serialize, Deserialize)]
527
+ pub struct OcrMetadata {
528
+ /// OCR language code(s) used
529
+ pub language: String,
530
+ /// Tesseract Page Segmentation Mode (PSM)
531
+ pub psm: i32,
532
+ /// Output format (e.g., "text", "hocr")
533
+ pub output_format: String,
534
+ /// Number of tables detected
535
+ pub table_count: usize,
536
+
537
+ #[serde(skip_serializing_if = "Option::is_none")]
538
+ pub table_rows: Option<usize>,
539
+
540
+ #[serde(skip_serializing_if = "Option::is_none")]
541
+ pub table_cols: Option<usize>,
542
+ }
543
+
544
+ /// Error metadata (for batch operations).
545
+ #[derive(Debug, Clone, Serialize, Deserialize)]
546
+ pub struct ErrorMetadata {
547
+ pub error_type: String,
548
+ pub message: String,
549
+ }
550
+
551
+ /// PowerPoint presentation metadata.
552
+ ///
553
+ /// Extracted from PPTX files containing slide counts and presentation details.
554
+ #[derive(Debug, Clone, Serialize, Deserialize)]
555
+ pub struct PptxMetadata {
556
+ /// Total number of slides in the presentation
557
+ pub slide_count: usize,
558
+ /// Names of slides (if available)
559
+ pub slide_names: Vec<String>,
560
+ }