kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,271 @@
1
+ //! Block-level container handlers for Djot parsing.
2
+
3
+ use super::state::{ExtractionState, push_block};
4
+ use crate::types::{Attributes, BlockType, FormattedBlock};
5
+ use jotdown::Container;
6
+
7
+ /// Handle start of block containers.
8
+ pub(super) fn handle_block_start(
9
+ state: &mut ExtractionState,
10
+ container: &Container,
11
+ _attrs: &jotdown::Attributes,
12
+ parsed_attrs: Option<Attributes>,
13
+ footnotes: &mut Vec<crate::types::Footnote>,
14
+ ) -> bool {
15
+ match container {
16
+ Container::Heading { level, .. } => {
17
+ push_block(
18
+ state,
19
+ FormattedBlock {
20
+ block_type: BlockType::Heading,
21
+ level: Some(*level as usize),
22
+ inline_content: Vec::new(),
23
+ attributes: parsed_attrs,
24
+ language: None,
25
+ code: None,
26
+ children: Vec::new(),
27
+ },
28
+ );
29
+ true
30
+ }
31
+ Container::Paragraph => {
32
+ push_block(
33
+ state,
34
+ FormattedBlock {
35
+ block_type: BlockType::Paragraph,
36
+ level: None,
37
+ inline_content: Vec::new(),
38
+ attributes: parsed_attrs,
39
+ language: None,
40
+ code: None,
41
+ children: Vec::new(),
42
+ },
43
+ );
44
+ true
45
+ }
46
+ Container::Blockquote => {
47
+ push_block(
48
+ state,
49
+ FormattedBlock {
50
+ block_type: BlockType::Blockquote,
51
+ level: None,
52
+ inline_content: Vec::new(),
53
+ attributes: parsed_attrs,
54
+ language: None,
55
+ code: None,
56
+ children: Vec::new(),
57
+ },
58
+ );
59
+ true
60
+ }
61
+ Container::CodeBlock { language } => {
62
+ let lang_str = if language.is_empty() {
63
+ None
64
+ } else {
65
+ Some(language.to_string())
66
+ };
67
+ state.in_code_block = true;
68
+ state.code_content.clear();
69
+ push_block(
70
+ state,
71
+ FormattedBlock {
72
+ block_type: BlockType::CodeBlock,
73
+ level: None,
74
+ inline_content: Vec::new(),
75
+ attributes: parsed_attrs,
76
+ language: lang_str,
77
+ code: Some(String::new()),
78
+ children: Vec::new(),
79
+ },
80
+ );
81
+ true
82
+ }
83
+ Container::RawBlock { format } => {
84
+ state.in_raw_block = true;
85
+ state.raw_format = Some(format.to_string());
86
+ state.code_content.clear();
87
+ push_block(
88
+ state,
89
+ FormattedBlock {
90
+ block_type: BlockType::RawBlock,
91
+ level: None,
92
+ inline_content: Vec::new(),
93
+ attributes: parsed_attrs,
94
+ language: Some(format.to_string()),
95
+ code: Some(String::new()),
96
+ children: Vec::new(),
97
+ },
98
+ );
99
+ true
100
+ }
101
+ Container::List { kind, .. } => {
102
+ let block_type = match kind {
103
+ jotdown::ListKind::Ordered { .. } => BlockType::OrderedList,
104
+ jotdown::ListKind::Unordered(_) => BlockType::BulletList,
105
+ jotdown::ListKind::Task(_) => BlockType::TaskList,
106
+ };
107
+ push_block(
108
+ state,
109
+ FormattedBlock {
110
+ block_type,
111
+ level: None,
112
+ inline_content: Vec::new(),
113
+ attributes: parsed_attrs,
114
+ language: None,
115
+ code: None,
116
+ children: Vec::new(),
117
+ },
118
+ );
119
+ true
120
+ }
121
+ Container::ListItem => {
122
+ push_block(
123
+ state,
124
+ FormattedBlock {
125
+ block_type: BlockType::ListItem,
126
+ level: None,
127
+ inline_content: Vec::new(),
128
+ attributes: parsed_attrs,
129
+ language: None,
130
+ code: None,
131
+ children: Vec::new(),
132
+ },
133
+ );
134
+ true
135
+ }
136
+ Container::TaskListItem { checked } => {
137
+ let mut attrs = parsed_attrs.unwrap_or_default();
138
+ attrs.key_values.insert("checked".to_string(), checked.to_string());
139
+ push_block(
140
+ state,
141
+ FormattedBlock {
142
+ block_type: BlockType::ListItem,
143
+ level: None,
144
+ inline_content: Vec::new(),
145
+ attributes: Some(attrs),
146
+ language: None,
147
+ code: None,
148
+ children: Vec::new(),
149
+ },
150
+ );
151
+ true
152
+ }
153
+ Container::DescriptionList => {
154
+ push_block(
155
+ state,
156
+ FormattedBlock {
157
+ block_type: BlockType::DefinitionList,
158
+ level: None,
159
+ inline_content: Vec::new(),
160
+ attributes: parsed_attrs,
161
+ language: None,
162
+ code: None,
163
+ children: Vec::new(),
164
+ },
165
+ );
166
+ true
167
+ }
168
+ Container::DescriptionTerm => {
169
+ push_block(
170
+ state,
171
+ FormattedBlock {
172
+ block_type: BlockType::DefinitionTerm,
173
+ level: None,
174
+ inline_content: Vec::new(),
175
+ attributes: parsed_attrs,
176
+ language: None,
177
+ code: None,
178
+ children: Vec::new(),
179
+ },
180
+ );
181
+ true
182
+ }
183
+ Container::DescriptionDetails => {
184
+ push_block(
185
+ state,
186
+ FormattedBlock {
187
+ block_type: BlockType::DefinitionDescription,
188
+ level: None,
189
+ inline_content: Vec::new(),
190
+ attributes: parsed_attrs,
191
+ language: None,
192
+ code: None,
193
+ children: Vec::new(),
194
+ },
195
+ );
196
+ true
197
+ }
198
+ Container::Div { .. } => {
199
+ push_block(
200
+ state,
201
+ FormattedBlock {
202
+ block_type: BlockType::Div,
203
+ level: None,
204
+ inline_content: Vec::new(),
205
+ attributes: parsed_attrs,
206
+ language: None,
207
+ code: None,
208
+ children: Vec::new(),
209
+ },
210
+ );
211
+ true
212
+ }
213
+ Container::Section { .. } => {
214
+ push_block(
215
+ state,
216
+ FormattedBlock {
217
+ block_type: BlockType::Section,
218
+ level: None,
219
+ inline_content: Vec::new(),
220
+ attributes: parsed_attrs,
221
+ language: None,
222
+ code: None,
223
+ children: Vec::new(),
224
+ },
225
+ );
226
+ true
227
+ }
228
+ Container::Footnote { label } => {
229
+ // Start tracking a footnote definition
230
+ footnotes.push(crate::types::Footnote {
231
+ label: label.to_string(),
232
+ content: Vec::new(),
233
+ });
234
+ // We'll collect the content as blocks
235
+ push_block(
236
+ state,
237
+ FormattedBlock {
238
+ block_type: BlockType::Paragraph,
239
+ level: None,
240
+ inline_content: Vec::new(),
241
+ attributes: parsed_attrs,
242
+ language: None,
243
+ code: None,
244
+ children: Vec::new(),
245
+ },
246
+ );
247
+ true
248
+ }
249
+ _ => false,
250
+ }
251
+ }
252
+
253
+ /// Handle end of block containers.
254
+ pub(super) fn handle_block_end(_state: &mut ExtractionState, container: &Container) -> bool {
255
+ matches!(
256
+ container,
257
+ Container::Heading { .. }
258
+ | Container::Paragraph
259
+ | Container::Blockquote
260
+ | Container::CodeBlock { .. }
261
+ | Container::RawBlock { .. }
262
+ | Container::Div { .. }
263
+ | Container::Section { .. }
264
+ | Container::List { .. }
265
+ | Container::ListItem
266
+ | Container::TaskListItem { .. }
267
+ | Container::DescriptionList
268
+ | Container::DescriptionTerm
269
+ | Container::DescriptionDetails
270
+ )
271
+ }
@@ -0,0 +1,257 @@
1
+ //! Complete Djot content extraction.
2
+ //!
3
+ //! Handles extraction of rich DjotContent structures from Djot events.
4
+
5
+ use super::block_handlers::{handle_block_end, handle_block_start};
6
+ use super::event_handlers::{
7
+ finalize_block_element, handle_footnote_end, handle_footnote_reference, handle_symbol, handle_thematic_break,
8
+ };
9
+ use super::inline_handlers::{
10
+ finalize_inline_element, handle_image_end, handle_inline_end, handle_inline_start, handle_link_end, handle_math_end,
11
+ };
12
+ use super::state::{ExtractionState, pop_block};
13
+ use super::text_extraction::extract_text_from_events;
14
+ use crate::extractors::djot_format::attributes::parse_jotdown_attributes;
15
+ use crate::types::{Attributes, DjotContent, DjotImage, DjotLink, FormattedBlock};
16
+ use jotdown::{Container, Event};
17
+ use std::collections::HashMap;
18
+
19
+ /// Extract complete djot content with 100% feature extraction.
20
+ ///
21
+ /// Processes ALL djot events to build a rich DjotContent structure including:
22
+ /// - Block structure (headings, lists, blockquotes, divs, sections, code blocks)
23
+ /// - Inline formatting (strong, emphasis, highlight, subscript, superscript, insert, delete)
24
+ /// - Attributes (classes, IDs, key-value pairs)
25
+ /// - Links and images with full metadata (href, src, alt, title)
26
+ /// - Math blocks (inline & display)
27
+ /// - Definition lists (term/description pairs)
28
+ /// - Task lists with checked state
29
+ /// - Raw blocks (HTML/LaTeX)
30
+ /// - Footnotes (references and definitions)
31
+ /// - Captions
32
+ /// - Smart punctuation
33
+ /// - All other djot features
34
+ pub fn extract_complete_djot_content(
35
+ events: &[Event],
36
+ metadata: crate::types::Metadata,
37
+ tables: Vec<crate::types::Table>,
38
+ ) -> DjotContent {
39
+ let plain_text = extract_text_from_events(events);
40
+
41
+ let mut blocks = Vec::new();
42
+ let mut images = Vec::new();
43
+ let mut links = Vec::new();
44
+ let mut footnotes = Vec::new();
45
+ let attributes_map: HashMap<String, Attributes> = HashMap::new();
46
+
47
+ let mut state = ExtractionState::new();
48
+
49
+ for event in events {
50
+ match event {
51
+ Event::Start(container, attrs) => {
52
+ handle_start_event(
53
+ &mut state,
54
+ container,
55
+ attrs,
56
+ &mut blocks,
57
+ &mut images,
58
+ &mut links,
59
+ &mut footnotes,
60
+ );
61
+ }
62
+ Event::End(container) => {
63
+ handle_end_event(
64
+ &mut state,
65
+ container,
66
+ &mut blocks,
67
+ &mut images,
68
+ &mut links,
69
+ &mut footnotes,
70
+ );
71
+ }
72
+ Event::Str(s) => {
73
+ if state.in_code_block || state.in_raw_block {
74
+ state.code_content.push_str(s);
75
+ } else if state.in_math {
76
+ state.math_content.push_str(s);
77
+ } else {
78
+ state.current_text.push_str(s);
79
+ }
80
+ }
81
+ Event::FootnoteReference(label) => {
82
+ handle_footnote_reference(&mut state, label);
83
+ }
84
+ Event::Symbol(sym) => {
85
+ handle_symbol(&mut state, sym);
86
+ }
87
+ Event::Attributes(attrs) => {
88
+ // Store attributes to be applied to the next element
89
+ state.pending_attributes = Some(parse_jotdown_attributes(attrs));
90
+ }
91
+ Event::Softbreak => {
92
+ if state.in_math {
93
+ state.math_content.push(' ');
94
+ } else if !state.inline_type_stack.is_empty() {
95
+ state.current_text.push(' ');
96
+ } else {
97
+ state.current_text.push('\n');
98
+ }
99
+ }
100
+ Event::Hardbreak => {
101
+ if state.in_math {
102
+ state.math_content.push('\n');
103
+ } else {
104
+ state.current_text.push('\n');
105
+ }
106
+ }
107
+ Event::NonBreakingSpace => {
108
+ state.current_text.push(' ');
109
+ }
110
+ Event::Blankline => {
111
+ // Blank lines are typically ignored in block processing
112
+ }
113
+ Event::ThematicBreak(attrs) => {
114
+ handle_thematic_break(&mut state, attrs, &mut blocks);
115
+ }
116
+ // Smart punctuation events
117
+ Event::LeftSingleQuote => {
118
+ state.current_text.push('\'');
119
+ }
120
+ Event::RightSingleQuote => {
121
+ state.current_text.push('\'');
122
+ }
123
+ Event::LeftDoubleQuote => {
124
+ state.current_text.push('"');
125
+ }
126
+ Event::RightDoubleQuote => {
127
+ state.current_text.push('"');
128
+ }
129
+ Event::Ellipsis => {
130
+ state.current_text.push_str("...");
131
+ }
132
+ Event::EnDash => {
133
+ state.current_text.push_str("--");
134
+ }
135
+ Event::EmDash => {
136
+ state.current_text.push_str("---");
137
+ }
138
+ Event::Escape => {
139
+ // Escape is a marker, doesn't produce output
140
+ }
141
+ }
142
+ }
143
+
144
+ // Finalize any remaining content
145
+ state.flush_text();
146
+
147
+ // Pop any remaining blocks
148
+ while !state.block_stack.is_empty() {
149
+ pop_block(&mut state, &mut blocks);
150
+ }
151
+
152
+ // Add any remaining inline elements to the last block if exists
153
+ if !state.current_inline_elements.is_empty()
154
+ && let Some(last_block) = blocks.last_mut()
155
+ {
156
+ last_block.inline_content.append(&mut state.current_inline_elements);
157
+ }
158
+
159
+ DjotContent {
160
+ plain_text,
161
+ blocks,
162
+ metadata,
163
+ tables,
164
+ images,
165
+ links,
166
+ footnotes,
167
+ attributes: attributes_map,
168
+ }
169
+ }
170
+
171
+ /// Handle start of a container event.
172
+ fn handle_start_event(
173
+ state: &mut ExtractionState,
174
+ container: &Container,
175
+ attrs: &jotdown::Attributes,
176
+ _blocks: &mut Vec<FormattedBlock>,
177
+ images: &mut Vec<DjotImage>,
178
+ links: &mut Vec<DjotLink>,
179
+ footnotes: &mut Vec<crate::types::Footnote>,
180
+ ) {
181
+ // Parse attributes from jotdown's Attributes type
182
+ let parsed_attrs = if attrs.is_empty() {
183
+ state.pending_attributes.take()
184
+ } else {
185
+ Some(parse_jotdown_attributes(attrs))
186
+ };
187
+
188
+ // Try block handlers first
189
+ if handle_block_start(state, container, attrs, parsed_attrs.clone(), footnotes) {
190
+ return;
191
+ }
192
+
193
+ // Try inline handlers
194
+ if handle_inline_start(state, container, parsed_attrs, images, links) {
195
+ return;
196
+ }
197
+
198
+ // Handle remaining containers (tables, link definitions, etc.)
199
+ match container {
200
+ Container::Table | Container::TableRow { .. } | Container::TableCell { .. } | Container::Caption => {
201
+ // Tables are extracted separately
202
+ }
203
+ Container::LinkDefinition { .. } => {
204
+ // Link definitions are resolved by jotdown, not needed in output
205
+ }
206
+ _ => {}
207
+ }
208
+ }
209
+
210
+ /// Handle end of a container event.
211
+ fn handle_end_event(
212
+ state: &mut ExtractionState,
213
+ container: &Container,
214
+ blocks: &mut Vec<FormattedBlock>,
215
+ images: &mut [DjotImage],
216
+ links: &mut [DjotLink],
217
+ footnotes: &mut [crate::types::Footnote],
218
+ ) {
219
+ // Check if it's a block container
220
+ if handle_block_end(state, container) {
221
+ finalize_block_element(state, blocks);
222
+ return;
223
+ }
224
+
225
+ // Handle special cases
226
+ match container {
227
+ Container::Footnote { .. } => {
228
+ handle_footnote_end(state, footnotes);
229
+ }
230
+ Container::Math { display } => {
231
+ handle_math_end(state, *display);
232
+ }
233
+ Container::Link(url, _) => {
234
+ handle_link_end(state, url, links);
235
+ }
236
+ Container::Image(src, _) => {
237
+ handle_image_end(state, src, images);
238
+ }
239
+ _ => {
240
+ // Check if it's an inline element
241
+ if handle_inline_end(state, container) {
242
+ finalize_inline_element(state, container);
243
+ }
244
+ }
245
+ }
246
+
247
+ // Handle remaining containers (tables, link definitions, etc.)
248
+ match container {
249
+ Container::Table | Container::TableRow { .. } | Container::TableCell { .. } | Container::Caption => {
250
+ // Tables are handled separately
251
+ }
252
+ Container::LinkDefinition { .. } => {
253
+ // Link definitions don't produce output
254
+ }
255
+ _ => {}
256
+ }
257
+ }
@@ -0,0 +1,101 @@
1
+ //! Event handlers for special Djot elements.
2
+
3
+ use super::state::{ExtractionState, pop_block};
4
+ use crate::extractors::djot_format::attributes::parse_jotdown_attributes;
5
+ use crate::types::{BlockType, FormattedBlock, InlineElement, InlineType};
6
+ use std::collections::HashMap;
7
+
8
+ /// Handle footnote reference event.
9
+ pub(super) fn handle_footnote_reference(state: &mut ExtractionState, label: &str) {
10
+ state.flush_text();
11
+
12
+ let mut meta = HashMap::new();
13
+ meta.insert("label".to_string(), label.to_string());
14
+
15
+ state.current_inline_elements.push(InlineElement {
16
+ element_type: InlineType::FootnoteRef,
17
+ content: label.to_string(),
18
+ attributes: None,
19
+ metadata: Some(meta),
20
+ });
21
+ }
22
+
23
+ /// Handle symbol event.
24
+ pub(super) fn handle_symbol(state: &mut ExtractionState, sym: &str) {
25
+ state.flush_text();
26
+
27
+ state.current_inline_elements.push(InlineElement {
28
+ element_type: InlineType::Symbol,
29
+ content: sym.to_string(),
30
+ attributes: None,
31
+ metadata: None,
32
+ });
33
+ }
34
+
35
+ /// Handle thematic break event.
36
+ pub(super) fn handle_thematic_break(
37
+ state: &mut ExtractionState,
38
+ attrs: &jotdown::Attributes,
39
+ blocks: &mut Vec<FormattedBlock>,
40
+ ) {
41
+ state.flush_text();
42
+
43
+ let parsed_attrs = if attrs.is_empty() {
44
+ None
45
+ } else {
46
+ Some(parse_jotdown_attributes(attrs))
47
+ };
48
+
49
+ let hr_block = FormattedBlock {
50
+ block_type: BlockType::ThematicBreak,
51
+ level: None,
52
+ inline_content: Vec::new(),
53
+ attributes: parsed_attrs,
54
+ language: None,
55
+ code: None,
56
+ children: Vec::new(),
57
+ };
58
+
59
+ if let Some(parent) = state.block_stack.last_mut() {
60
+ parent.children.push(hr_block);
61
+ } else {
62
+ blocks.push(hr_block);
63
+ }
64
+ }
65
+
66
+ /// Handle end of footnote definition.
67
+ pub(super) fn handle_footnote_end(state: &mut ExtractionState, footnotes: &mut [crate::types::Footnote]) {
68
+ state.flush_text();
69
+ // Pop the footnote content block and add to the last footnote
70
+ if let Some(mut block) = state.block_stack.pop() {
71
+ block.inline_content.append(&mut state.current_inline_elements);
72
+ if let Some(footnote) = footnotes.last_mut() {
73
+ footnote.content.push(block);
74
+ }
75
+ }
76
+ }
77
+
78
+ /// Finalize block element content and pop from stack.
79
+ pub(super) fn finalize_block_element(state: &mut ExtractionState, blocks: &mut Vec<FormattedBlock>) {
80
+ // Flush any remaining text
81
+ state.flush_text();
82
+
83
+ // For code blocks, set the accumulated code content
84
+ if state.in_code_block {
85
+ if let Some(block) = state.block_stack.last_mut() {
86
+ block.code = Some(std::mem::take(&mut state.code_content));
87
+ }
88
+ state.in_code_block = false;
89
+ }
90
+
91
+ // For raw blocks
92
+ if state.in_raw_block {
93
+ if let Some(block) = state.block_stack.last_mut() {
94
+ block.code = Some(std::mem::take(&mut state.code_content));
95
+ }
96
+ state.in_raw_block = false;
97
+ state.raw_format = None;
98
+ }
99
+
100
+ pop_block(state, blocks);
101
+ }