kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,209 @@
1
+ //! Djot document types.
2
+ //!
3
+ //! This module defines types for representing Djot document structures.
4
+
5
+ use serde::{Deserialize, Serialize};
6
+ use std::collections::HashMap;
7
+
8
+ // Import `Table` from the parent module and `Metadata` from its `metadata` submodule
9
+ use super::Table;
10
+ use super::metadata::Metadata;
11
+
12
+ /// Comprehensive Djot document structure with semantic preservation.
13
+ ///
14
+ /// This type captures the full richness of Djot markup, including:
15
+ /// - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
16
+ /// - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
17
+ /// - Attributes (classes, IDs, key-value pairs)
18
+ /// - Links, images, footnotes
19
+ /// - Math expressions (inline and display)
20
+ /// - Tables with full structure
21
+ ///
22
+ /// Available when the `djot` feature is enabled.
23
+ #[derive(Debug, Clone, Serialize, Deserialize)]
24
+ pub struct DjotContent {
25
+ /// Plain text representation for backwards compatibility
26
+ pub plain_text: String,
27
+
28
+ /// Structured block-level content
29
+ pub blocks: Vec<FormattedBlock>,
30
+
31
+ /// Metadata from YAML frontmatter
32
+ pub metadata: Metadata,
33
+
34
+ /// Extracted tables as structured data
35
+ pub tables: Vec<Table>,
36
+
37
+ /// Extracted images with metadata
38
+ pub images: Vec<DjotImage>,
39
+
40
+ /// Extracted links with URLs
41
+ pub links: Vec<DjotLink>,
42
+
43
+ /// Footnote definitions
44
+ pub footnotes: Vec<Footnote>,
45
+
46
+ /// Attributes mapped by element identifier (if present); skipped during serialization when the map is empty and defaulted to an empty map when missing on deserialization
47
+ #[serde(skip_serializing_if = "HashMap::is_empty", default)]
48
+ pub attributes: HashMap<String, Attributes>,
49
+ }
50
+
51
+ /// Block-level element in a Djot document.
52
+ ///
53
+ /// Represents structural elements like headings, paragraphs, lists, code blocks, etc. Blocks nest recursively through `children`.
54
+ #[derive(Debug, Clone, Serialize, Deserialize)]
55
+ pub struct FormattedBlock {
56
+ /// Type of block element
57
+ pub block_type: BlockType,
58
+
59
+ /// Heading level (1-6) for headings, or nesting level for lists; skipped during serialization when `None`
60
+ #[serde(skip_serializing_if = "Option::is_none")]
61
+ pub level: Option<usize>,
62
+
63
+ /// Inline content within the block
64
+ pub inline_content: Vec<InlineElement>,
65
+
66
+ /// Element attributes (classes, IDs, key-value pairs); skipped during serialization when `None`
67
+ #[serde(skip_serializing_if = "Option::is_none")]
68
+ pub attributes: Option<Attributes>,
69
+
70
+ /// Language identifier for code blocks; skipped during serialization when `None`
71
+ #[serde(skip_serializing_if = "Option::is_none")]
72
+ pub language: Option<String>,
73
+
74
+ /// Raw code content for code blocks; skipped during serialization when `None`
75
+ #[serde(skip_serializing_if = "Option::is_none")]
76
+ pub code: Option<String>,
77
+
78
+ /// Nested blocks for containers (blockquotes, list items, divs); skipped during serialization when empty and defaulted to an empty list when missing on deserialization
79
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
80
+ pub children: Vec<FormattedBlock>,
81
+ }
82
+
83
+ /// Types of block-level elements in Djot; variant names are serialized in `snake_case` (e.g. `CodeBlock` becomes `code_block`).
84
+ #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
85
+ #[serde(rename_all = "snake_case")]
86
+ pub enum BlockType {
87
+ Paragraph,
88
+ Heading,
89
+ Blockquote,
90
+ CodeBlock,
91
+ ListItem,
92
+ OrderedList,
93
+ BulletList,
94
+ TaskList,
95
+ DefinitionList,
96
+ DefinitionTerm,
97
+ DefinitionDescription,
98
+ Div,
99
+ Section,
100
+ ThematicBreak,
101
+ RawBlock,
102
+ MathDisplay,
103
+ }
104
+
105
+ /// Inline element within a block.
106
+ ///
107
+ /// Represents text with formatting, links, images, etc.
108
+ #[derive(Debug, Clone, Serialize, Deserialize)]
109
+ pub struct InlineElement {
110
+ /// Type of inline element
111
+ pub element_type: InlineType,
112
+
113
+ /// Text content
114
+ pub content: String,
115
+
116
+ /// Element attributes; skipped during serialization when `None`
117
+ #[serde(skip_serializing_if = "Option::is_none")]
118
+ pub attributes: Option<Attributes>,
119
+
120
+ /// Additional string key-value metadata (e.g., href for links, src/alt for images); skipped during serialization when `None`
121
+ #[serde(skip_serializing_if = "Option::is_none")]
122
+ pub metadata: Option<HashMap<String, String>>,
123
+ }
124
+
125
+ /// Types of inline elements in Djot; variant names are serialized in `snake_case` (e.g. `FootnoteRef` becomes `footnote_ref`).
126
+ #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
127
+ #[serde(rename_all = "snake_case")]
128
+ pub enum InlineType {
129
+ Text,
130
+ Strong,
131
+ Emphasis,
132
+ Highlight,
133
+ Subscript,
134
+ Superscript,
135
+ Insert,
136
+ Delete,
137
+ Code,
138
+ Link,
139
+ Image,
140
+ Span,
141
+ Math,
142
+ RawInline,
143
+ FootnoteRef,
144
+ Symbol,
145
+ }
146
+
147
+ /// Element attributes in Djot.
148
+ ///
149
+ /// Represents the attributes attached to elements using `{.class #id key="value"}` syntax. Derives `Default`, so an empty attribute set can be built with `Attributes::default()`.
150
+ #[derive(Debug, Clone, Serialize, Deserialize, Default)]
151
+ pub struct Attributes {
152
+ /// Element ID (#identifier); skipped during serialization when `None`
153
+ #[serde(skip_serializing_if = "Option::is_none")]
154
+ pub id: Option<String>,
155
+
156
+ /// CSS classes (.class1 .class2); skipped during serialization when empty and defaulted to an empty list when missing on deserialization
157
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
158
+ pub classes: Vec<String>,
159
+
160
+ /// Key-value pairs (key="value"); skipped during serialization when empty and defaulted to an empty map when missing on deserialization
161
+ #[serde(skip_serializing_if = "HashMap::is_empty", default)]
162
+ pub key_values: HashMap<String, String>,
163
+ }
164
+
165
+ /// Image element in Djot, described by its source, alternative text, and optional title and attributes.
166
+ #[derive(Debug, Clone, Serialize, Deserialize)]
167
+ pub struct DjotImage {
168
+ /// Image source URL or path
169
+ pub src: String,
170
+
171
+ /// Alternative text
172
+ pub alt: String,
173
+
174
+ /// Optional title; skipped during serialization when `None`
175
+ #[serde(skip_serializing_if = "Option::is_none")]
176
+ pub title: Option<String>,
177
+
178
+ /// Element attributes; skipped during serialization when `None`
179
+ #[serde(skip_serializing_if = "Option::is_none")]
180
+ pub attributes: Option<Attributes>,
181
+ }
182
+
183
+ /// Link element in Djot, described by its URL, text, and optional title and attributes.
184
+ #[derive(Debug, Clone, Serialize, Deserialize)]
185
+ pub struct DjotLink {
186
+ /// Link URL
187
+ pub url: String,
188
+
189
+ /// Link text content
190
+ pub text: String,
191
+
192
+ /// Optional title; skipped during serialization when `None`
193
+ #[serde(skip_serializing_if = "Option::is_none")]
194
+ pub title: Option<String>,
195
+
196
+ /// Element attributes; skipped during serialization when `None`
197
+ #[serde(skip_serializing_if = "Option::is_none")]
198
+ pub attributes: Option<Attributes>,
199
+ }
200
+
201
+ /// Footnote in Djot: a label paired with the block-level content of the note.
202
+ #[derive(Debug, Clone, Serialize, Deserialize)]
203
+ pub struct Footnote {
204
+ /// Footnote label
205
+ pub label: String,
206
+
207
+ /// Footnote content blocks
208
+ pub content: Vec<FormattedBlock>,
209
+ }
@@ -0,0 +1,301 @@
1
+ //! Core extraction types and results.
2
+
3
+ use serde::{Deserialize, Serialize};
4
+ use std::collections::HashMap;
5
+
6
+ use super::djot::DjotContent;
7
+ use super::metadata::Metadata;
8
+ use super::page::PageContent;
9
+ use super::tables::Table;
10
+
11
+ /// General extraction result used by the core extraction API.
12
+ ///
13
+ /// This is the main result type returned by all extraction functions. Every `Option` field is skipped during serialization when it is `None`.
14
+ #[derive(Debug, Clone, Serialize, Deserialize)]
15
+ pub struct ExtractionResult {
16
+ pub content: String,
17
+ pub mime_type: String,
18
+ pub metadata: Metadata,
19
+ pub tables: Vec<Table>,
20
+ #[serde(skip_serializing_if = "Option::is_none")]
21
+ pub detected_languages: Option<Vec<String>>,
22
+
23
+ /// Text chunks when chunking is enabled.
24
+ ///
25
+ /// When chunking configuration is provided, the content is split into
26
+ /// overlapping chunks for efficient processing. Each chunk contains the text,
27
+ /// optional embeddings (if enabled), and metadata about its position.
28
+ #[serde(skip_serializing_if = "Option::is_none")]
29
+ pub chunks: Option<Vec<Chunk>>,
30
+
31
+ /// Extracted images from the document.
32
+ ///
33
+ /// When image extraction is enabled via `ImageExtractionConfig`, this field
34
+ /// contains all images found in the document with their raw data and metadata.
35
+ /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
36
+ #[serde(skip_serializing_if = "Option::is_none")]
37
+ pub images: Option<Vec<ExtractedImage>>,
38
+
39
+ /// Per-page content when page extraction is enabled.
40
+ ///
41
+ /// When page extraction is configured, the document is split into per-page content
42
+ /// with tables and images mapped to their respective pages.
43
+ #[serde(skip_serializing_if = "Option::is_none")]
44
+ pub pages: Option<Vec<PageContent>>,
45
+
46
+ /// Semantic elements when element-based output format is enabled.
47
+ ///
48
+ /// When output_format is set to ElementBased, this field contains semantic
49
+ /// elements with type classification, unique identifiers, and metadata for
50
+ /// Unstructured-compatible element-based processing. Defaults to `None` when missing on deserialization.
51
+ #[serde(skip_serializing_if = "Option::is_none", default)]
52
+ pub elements: Option<Vec<Element>>,
53
+
54
+ /// Rich Djot content structure (when extracting Djot documents).
55
+ ///
56
+ /// When extracting Djot documents with structured extraction enabled,
57
+ /// this field contains the full semantic structure including:
58
+ /// - Block-level elements with nesting
59
+ /// - Inline formatting with attributes
60
+ /// - Links, images, footnotes
61
+ /// - Math expressions
62
+ /// - Complete attribute information
63
+ ///
64
+ /// The `content` field still contains plain text for backward compatibility.
65
+ ///
66
+ /// Always `None` for non-Djot documents. Defaults to `None` when missing on deserialization.
67
+ #[serde(skip_serializing_if = "Option::is_none")]
68
+ #[serde(default)]
69
+ pub djot_content: Option<DjotContent>,
70
+ }
71
+
72
+ /// A text chunk with optional embedding and metadata.
73
+ ///
74
+ /// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
75
+ /// contains the text content, optional embedding vector (if embedding generation
76
+ /// is configured), and metadata about its position in the document.
77
+ #[derive(Debug, Clone, Serialize, Deserialize)]
78
+ pub struct Chunk {
79
+ /// The text content of this chunk.
80
+ pub content: String,
81
+
82
+ /// Optional embedding vector for this chunk.
83
+ ///
84
+ /// Only populated when `EmbeddingConfig` is provided in chunking configuration.
85
+ /// The dimensionality depends on the chosen embedding model. Skipped during serialization when `None`.
86
+ #[serde(skip_serializing_if = "Option::is_none")]
87
+ pub embedding: Option<Vec<f32>>,
88
+
89
+ /// Metadata about this chunk's position and properties.
90
+ pub metadata: ChunkMetadata,
91
+ }
92
+
93
+ /// Metadata about a chunk's position in the original document.
94
+ #[derive(Debug, Clone, Serialize, Deserialize)]
95
+ pub struct ChunkMetadata {
96
+ /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
97
+ pub byte_start: usize,
98
+
99
+ /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
100
+ pub byte_end: usize,
101
+
102
+ /// Number of tokens in this chunk (if available).
103
+ ///
104
+ /// This is calculated by the embedding model's tokenizer if embeddings are enabled. Skipped during serialization when `None`.
105
+ #[serde(skip_serializing_if = "Option::is_none")]
106
+ pub token_count: Option<usize>,
107
+
108
+ /// Zero-based index of this chunk in the document.
109
+ pub chunk_index: usize,
110
+
111
+ /// Total number of chunks in the document.
112
+ pub total_chunks: usize,
113
+
114
+ /// First page number this chunk spans (1-indexed).
115
+ ///
116
+ /// Only populated when page tracking is enabled in extraction configuration. Skipped during serialization when `None`.
117
+ #[serde(skip_serializing_if = "Option::is_none")]
118
+ pub first_page: Option<usize>,
119
+
120
+ /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
121
+ ///
122
+ /// Only populated when page tracking is enabled in extraction configuration. Skipped during serialization when `None`.
123
+ #[serde(skip_serializing_if = "Option::is_none")]
124
+ pub last_page: Option<usize>,
125
+ }
126
+
127
+ /// Extracted image from a document.
128
+ ///
129
+ /// Contains raw image data, metadata, and optional nested OCR results.
130
+ /// Raw bytes allow cross-language compatibility - users can convert to
131
+ /// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
132
+ #[derive(Debug, Clone, Serialize, Deserialize)]
133
+ pub struct ExtractedImage {
134
+ /// Raw image data (PNG, JPEG, WebP, etc. bytes)
135
+ pub data: Vec<u8>,
136
+
137
+ /// Image format (e.g., "jpeg", "png", "webp")
138
+ pub format: String,
139
+
140
+ /// Zero-indexed position of this image in the document/page
141
+ pub image_index: usize,
142
+
143
+ /// Page/slide number where image was found (1-indexed); skipped during serialization when `None`
144
+ #[serde(skip_serializing_if = "Option::is_none")]
145
+ pub page_number: Option<usize>,
146
+
147
+ /// Image width in pixels; skipped during serialization when `None`
148
+ #[serde(skip_serializing_if = "Option::is_none")]
149
+ pub width: Option<u32>,
150
+
151
+ /// Image height in pixels; skipped during serialization when `None`
152
+ #[serde(skip_serializing_if = "Option::is_none")]
153
+ pub height: Option<u32>,
154
+
155
+ /// Colorspace information (e.g., "RGB", "CMYK", "Gray"); skipped during serialization when `None`
156
+ #[serde(skip_serializing_if = "Option::is_none")]
157
+ pub colorspace: Option<String>,
158
+
159
+ /// Bits per color component (e.g., 8, 16); skipped during serialization when `None`
160
+ #[serde(skip_serializing_if = "Option::is_none")]
161
+ pub bits_per_component: Option<u32>,
162
+
163
+ /// Whether this image is a mask image; defaults to `false` when missing on deserialization
164
+ #[serde(default)]
165
+ pub is_mask: bool,
166
+
167
+ /// Optional description of the image; skipped during serialization when `None`
168
+ #[serde(skip_serializing_if = "Option::is_none")]
169
+ pub description: Option<String>,
170
+
171
+ /// Nested OCR extraction result (if image was OCRed)
172
+ ///
173
+ /// When OCR is performed on this image, the result is embedded here
174
+ /// rather than in a separate collection, making the relationship explicit. Stored behind a `Box` and skipped during serialization when `None`.
175
+ #[serde(skip_serializing_if = "Option::is_none")]
176
+ pub ocr_result: Option<Box<ExtractionResult>>,
177
+ }
178
+
179
+ // ============================================================================
180
+ // Element-based Output Format Types (Unstructured-compatible)
181
+ // ============================================================================
182
+
183
+ /// Output format selection for extraction results (serialized as `unified` or `element_based`).
184
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
185
+ #[serde(rename_all = "snake_case")]
186
+ pub enum OutputFormat {
187
+ /// Unified format with all content in `content` field (this is the `Default` variant)
188
+ #[default]
189
+ Unified,
190
+ /// Element-based format with semantic element extraction
191
+ ElementBased,
192
+ }
193
+
194
+ /// Unique identifier for semantic elements.
195
+ ///
196
+ /// Newtype over a private `String`; the identifier is described as deterministically generated
197
+ /// from element type, content, and page number. NOTE(review): the derived `Deserialize` does not run the non-empty check in `new`.
198
+ #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
199
+ pub struct ElementId(String);
200
+
201
+ impl ElementId {
202
+ /// Create a new `ElementId` from any value convertible into a `String`.
203
+ ///
204
+ /// # Errors
205
+ ///
206
+ /// Returns `Err("ElementId cannot be empty")` if the string is empty; no other validation is performed.
207
+ pub fn new(hex_str: impl Into<String>) -> std::result::Result<Self, String> {
208
+ let s = hex_str.into();
209
+ if s.is_empty() { // emptiness is the only check; despite its name, `hex_str` is not verified to be hexadecimal
210
+ return Err("ElementId cannot be empty".to_string());
211
+ }
212
+ Ok(ElementId(s))
213
+ }
214
+ }
215
+
216
+ impl AsRef<str> for ElementId { // lets callers borrow the identifier wherever `AsRef<str>` is accepted
217
+ fn as_ref(&self) -> &str {
218
+ &self.0 // borrows the wrapped `String` as `&str` without allocating
219
+ }
220
+ }
221
+
222
+ impl std::fmt::Display for ElementId { // user-facing rendering is the raw identifier string
223
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
224
+ write!(f, "{}", self.0) // delegates to the inner `String`'s `Display`; no prefix or decoration is added
225
+ }
226
+ }
227
+
228
+ /// Semantic element type classification.
229
+ ///
230
+ /// Categorizes text content into semantic units for downstream processing.
231
+ /// Supports the element types commonly found in Unstructured documents; variants serialize in snake_case (e.g. `narrative_text`).
232
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
233
+ #[serde(rename_all = "snake_case")]
234
+ pub enum ElementType {
235
+ /// Document title
236
+ Title,
237
+ /// Main narrative text body
238
+ NarrativeText,
239
+ /// Section heading
240
+ Heading,
241
+ /// List item (bullet, numbered, etc.)
242
+ ListItem,
243
+ /// Table element
244
+ Table,
245
+ /// Image element
246
+ Image,
247
+ /// Page break marker
248
+ PageBreak,
249
+ /// Code block
250
+ CodeBlock,
251
+ /// Block quote
252
+ BlockQuote,
253
+ /// Footer text
254
+ Footer,
255
+ /// Header text
256
+ Header,
257
+ }
258
+
259
+ /// Bounding box for element positioning: `(x0, y0)` is the left/bottom corner and `(x1, y1)` the right/top corner.
260
+ #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
261
+ pub struct BoundingBox {
262
+ /// Left x-coordinate
263
+ pub x0: f64,
264
+ /// Bottom y-coordinate
265
+ pub y0: f64,
266
+ /// Right x-coordinate
267
+ pub x1: f64,
268
+ /// Top y-coordinate (NOTE(review): units and coordinate origin are not stated here - confirm against the producer)
269
+ pub y1: f64,
270
+ }
271
+
272
+ /// Metadata for a semantic element. No `skip_serializing_if` is set here, so `None` fields are serialized explicitly.
274
+ #[derive(Debug, Clone, Serialize, Deserialize)]
275
+ pub struct ElementMetadata {
276
+ /// Page number (1-indexed)
277
+ pub page_number: Option<usize>,
278
+ /// Source filename or document name
279
+ pub filename: Option<String>,
280
+ /// Bounding box coordinates if available
281
+ pub coordinates: Option<BoundingBox>,
282
+ /// Position index in the element sequence
283
+ pub element_index: Option<usize>,
284
+ /// Additional custom metadata (string keys and values); no `#[serde(default)]`, so the field must be present when deserializing
285
+ pub additional: HashMap<String, String>,
286
+ }
286
+
287
+ /// Semantic element extracted from document.
288
+ ///
289
+ /// Represents a logical unit of content with semantic classification,
290
+ /// unique identifier, and metadata for tracking origin and position.
291
+ #[derive(Debug, Clone, Serialize, Deserialize)]
292
+ pub struct Element {
293
+ /// Unique element identifier (an `ElementId` string wrapper)
294
+ pub element_id: ElementId,
295
+ /// Semantic type of this element (title, heading, table, ...)
296
+ pub element_type: ElementType,
297
+ /// Text content of the element
298
+ pub text: String,
299
+ /// Metadata about the element (page, filename, coordinates, sequence index, custom fields)
300
+ pub metadata: ElementMetadata,
301
+ }