kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,201 @@
1
+ //! Inline-level element handlers for Djot parsing.
2
+
3
+ use super::state::ExtractionState;
4
+ use crate::types::{DjotImage, DjotLink, InlineElement, InlineType};
5
+ use jotdown::Container;
6
+ use std::collections::HashMap;
7
+
8
+ /// Handle start of inline elements.
9
+ pub(super) fn handle_inline_start(
10
+ state: &mut ExtractionState,
11
+ container: &Container,
12
+ parsed_attrs: Option<crate::types::Attributes>,
13
+ images: &mut Vec<DjotImage>,
14
+ links: &mut Vec<DjotLink>,
15
+ ) -> bool {
16
+ match container {
17
+ Container::Math { display } => {
18
+ state.in_math = true;
19
+ state.math_display = *display;
20
+ state.math_content.clear();
21
+ state.inline_type_stack.push(InlineType::Math);
22
+ true
23
+ }
24
+ Container::Strong => {
25
+ state.inline_type_stack.push(InlineType::Strong);
26
+ state.flush_text();
27
+ true
28
+ }
29
+ Container::Emphasis => {
30
+ state.inline_type_stack.push(InlineType::Emphasis);
31
+ state.flush_text();
32
+ true
33
+ }
34
+ Container::Mark => {
35
+ state.inline_type_stack.push(InlineType::Highlight);
36
+ state.flush_text();
37
+ true
38
+ }
39
+ Container::Subscript => {
40
+ state.inline_type_stack.push(InlineType::Subscript);
41
+ state.flush_text();
42
+ true
43
+ }
44
+ Container::Superscript => {
45
+ state.inline_type_stack.push(InlineType::Superscript);
46
+ state.flush_text();
47
+ true
48
+ }
49
+ Container::Insert => {
50
+ state.inline_type_stack.push(InlineType::Insert);
51
+ state.flush_text();
52
+ true
53
+ }
54
+ Container::Delete => {
55
+ state.inline_type_stack.push(InlineType::Delete);
56
+ state.flush_text();
57
+ true
58
+ }
59
+ Container::Verbatim => {
60
+ state.inline_type_stack.push(InlineType::Code);
61
+ state.flush_text();
62
+ true
63
+ }
64
+ Container::Span => {
65
+ state.inline_type_stack.push(InlineType::Span);
66
+ state.flush_text();
67
+ true
68
+ }
69
+ Container::RawInline { format } => {
70
+ state.inline_type_stack.push(InlineType::RawInline);
71
+ state.raw_format = Some(format.to_string());
72
+ state.flush_text();
73
+ true
74
+ }
75
+ Container::Link(url, _link_type) => {
76
+ state.inline_type_stack.push(InlineType::Link);
77
+ links.push(DjotLink {
78
+ url: url.to_string(),
79
+ text: String::new(),
80
+ title: None,
81
+ attributes: parsed_attrs,
82
+ });
83
+ state.current_link_index = Some(links.len() - 1);
84
+ state.flush_text();
85
+ true
86
+ }
87
+ Container::Image(src, _link_type) => {
88
+ state.inline_type_stack.push(InlineType::Image);
89
+ images.push(DjotImage {
90
+ src: src.to_string(),
91
+ alt: String::new(),
92
+ title: None,
93
+ attributes: parsed_attrs,
94
+ });
95
+ state.current_image_index = Some(images.len() - 1);
96
+ state.flush_text();
97
+ true
98
+ }
99
+ _ => false,
100
+ }
101
+ }
102
+
103
+ /// Handle end of inline elements.
104
+ pub(super) fn handle_inline_end(_state: &mut ExtractionState, container: &Container) -> bool {
105
+ matches!(
106
+ container,
107
+ Container::Strong
108
+ | Container::Emphasis
109
+ | Container::Mark
110
+ | Container::Subscript
111
+ | Container::Superscript
112
+ | Container::Insert
113
+ | Container::Delete
114
+ | Container::Verbatim
115
+ | Container::Span
116
+ | Container::RawInline { .. }
117
+ )
118
+ }
119
+
120
+ /// Handle end of math element.
121
+ pub(super) fn handle_math_end(state: &mut ExtractionState, display: bool) {
122
+ state.in_math = false;
123
+ let math_text = std::mem::take(&mut state.math_content);
124
+ state.inline_type_stack.pop();
125
+
126
+ let mut meta = HashMap::new();
127
+ meta.insert("display".to_string(), display.to_string());
128
+
129
+ state.current_inline_elements.push(InlineElement {
130
+ element_type: InlineType::Math,
131
+ content: math_text,
132
+ attributes: state.pending_attributes.take(),
133
+ metadata: Some(meta),
134
+ });
135
+ }
136
+
137
+ /// Finalize inline element content.
138
+ pub(super) fn finalize_inline_element(state: &mut ExtractionState, container: &Container) {
139
+ if let Some(inline_type) = state.inline_type_stack.pop() {
140
+ let content = std::mem::take(&mut state.current_text);
141
+ let mut meta = None;
142
+
143
+ // For raw inline, include the format
144
+ if matches!(container, Container::RawInline { .. })
145
+ && let Some(fmt) = state.raw_format.take()
146
+ {
147
+ let mut m = HashMap::new();
148
+ m.insert("format".to_string(), fmt);
149
+ meta = Some(m);
150
+ }
151
+
152
+ state.current_inline_elements.push(InlineElement {
153
+ element_type: inline_type,
154
+ content,
155
+ attributes: state.pending_attributes.take(),
156
+ metadata: meta,
157
+ });
158
+ }
159
+ }
160
+
161
+ /// Handle end of link element.
162
+ pub(super) fn handle_link_end(state: &mut ExtractionState, url: &str, links: &mut [DjotLink]) {
163
+ if let Some(idx) = state.current_link_index.take() {
164
+ let text = std::mem::take(&mut state.current_text);
165
+ if let Some(link) = links.get_mut(idx) {
166
+ link.text = text.clone();
167
+ }
168
+ state.inline_type_stack.pop();
169
+
170
+ let mut meta = HashMap::new();
171
+ meta.insert("href".to_string(), url.to_string());
172
+
173
+ state.current_inline_elements.push(InlineElement {
174
+ element_type: InlineType::Link,
175
+ content: text,
176
+ attributes: state.pending_attributes.take(),
177
+ metadata: Some(meta),
178
+ });
179
+ }
180
+ }
181
+
182
+ /// Handle end of image element.
183
+ pub(super) fn handle_image_end(state: &mut ExtractionState, src: &str, images: &mut [DjotImage]) {
184
+ if let Some(idx) = state.current_image_index.take() {
185
+ let alt = std::mem::take(&mut state.current_text);
186
+ if let Some(image) = images.get_mut(idx) {
187
+ image.alt = alt.clone();
188
+ }
189
+ state.inline_type_stack.pop();
190
+
191
+ let mut meta = HashMap::new();
192
+ meta.insert("src".to_string(), src.to_string());
193
+
194
+ state.current_inline_elements.push(InlineElement {
195
+ element_type: InlineType::Image,
196
+ content: alt,
197
+ attributes: state.pending_attributes.take(),
198
+ metadata: Some(meta),
199
+ });
200
+ }
201
+ }
@@ -0,0 +1,16 @@
1
+ //! Djot event parsing and content extraction.
2
+ //!
3
+ //! Handles parsing of jotdown events into plain text, tables, and full DjotContent structures.
4
+
5
+ mod block_handlers;
6
+ mod content_extraction;
7
+ mod event_handlers;
8
+ mod inline_handlers;
9
+ mod state;
10
+ mod table_extraction;
11
+ mod text_extraction;
12
+
13
+ // Re-export public API for backward compatibility
14
+ pub use content_extraction::extract_complete_djot_content;
15
+ pub use table_extraction::extract_tables_from_events;
16
+ pub use text_extraction::extract_text_from_events;
@@ -0,0 +1,78 @@
1
+ //! State management for Djot content extraction.
2
+ //!
3
+ //! Provides extraction state and helper functions for parsing Djot events.
4
+
5
+ use crate::types::{Attributes, FormattedBlock, InlineElement, InlineType};
6
+
7
+ /// Enhanced state tracking using a block stack for proper nesting.
8
+ pub(super) struct ExtractionState {
9
+ pub block_stack: Vec<FormattedBlock>, // Stack for nested blocks
10
+ pub inline_type_stack: Vec<InlineType>, // Stack for nested inline element types
11
+ pub current_text: String, // Text accumulator
12
+ pub pending_attributes: Option<Attributes>,
13
+ pub code_content: String, // Accumulator for code blocks
14
+ pub in_code_block: bool,
15
+ pub in_math: bool,
16
+ pub math_display: bool,
17
+ pub math_content: String,
18
+ pub current_link_index: Option<usize>,
19
+ pub current_image_index: Option<usize>,
20
+ pub in_raw_block: bool,
21
+ pub raw_format: Option<String>,
22
+ pub current_inline_elements: Vec<InlineElement>,
23
+ }
24
+
25
+ impl ExtractionState {
26
+ /// Create a new extraction state.
27
+ pub fn new() -> Self {
28
+ Self {
29
+ block_stack: Vec::new(),
30
+ inline_type_stack: Vec::new(),
31
+ current_text: String::new(),
32
+ pending_attributes: None,
33
+ code_content: String::new(),
34
+ in_code_block: false,
35
+ in_math: false,
36
+ math_display: false,
37
+ math_content: String::new(),
38
+ current_link_index: None,
39
+ current_image_index: None,
40
+ in_raw_block: false,
41
+ raw_format: None,
42
+ current_inline_elements: Vec::new(),
43
+ }
44
+ }
45
+
46
+ /// Flush current text to inline elements if any text is pending.
47
+ pub fn flush_text(&mut self) {
48
+ if !self.current_text.is_empty() {
49
+ self.current_inline_elements.push(InlineElement {
50
+ element_type: InlineType::Text,
51
+ content: std::mem::take(&mut self.current_text),
52
+ attributes: None,
53
+ metadata: None,
54
+ });
55
+ }
56
+ }
57
+ }
58
+
59
+ /// Helper to create a new block and push to stack.
60
+ pub(super) fn push_block(state: &mut ExtractionState, block: FormattedBlock) {
61
+ state.block_stack.push(block);
62
+ }
63
+
64
+ /// Helper to pop a block from the stack and add to parent or blocks list.
65
+ pub(super) fn pop_block(state: &mut ExtractionState, blocks: &mut Vec<FormattedBlock>) {
66
+ if let Some(mut block) = state.block_stack.pop() {
67
+ // Add any pending inline elements to the block
68
+ if !state.current_inline_elements.is_empty() {
69
+ block.inline_content.append(&mut state.current_inline_elements);
70
+ }
71
+ // If there's a parent block, add as child; otherwise add to top-level blocks
72
+ if let Some(parent) = state.block_stack.last_mut() {
73
+ parent.children.push(block);
74
+ } else {
75
+ blocks.push(block);
76
+ }
77
+ }
78
+ }
@@ -0,0 +1,68 @@
1
+ //! Table extraction from Djot events.
2
+ //!
3
+ //! Parses table events and extracts table data.
4
+
5
+ use crate::types::Table;
6
+ use jotdown::{Container, Event};
7
+
8
+ /// Extract tables from Djot events.
9
+ ///
10
+ /// Parses table events and extracts table data as a Vec<Vec<String>>,
11
+ /// converting each table to markdown representation for storage.
12
+ pub fn extract_tables_from_events(events: &[Event]) -> Vec<Table> {
13
+ let mut tables = Vec::new();
14
+ let mut current_table: Option<(Vec<Vec<String>>, usize)> = None;
15
+ let mut current_row: Vec<String> = Vec::new();
16
+ let mut current_cell = String::new();
17
+ let mut in_table_cell = false;
18
+ let mut table_index = 0;
19
+
20
+ for event in events {
21
+ match event {
22
+ Event::Start(Container::Table, _) => {
23
+ current_table = Some((Vec::new(), table_index));
24
+ }
25
+ Event::Start(Container::TableRow { .. }, _) => {
26
+ current_row = Vec::new();
27
+ }
28
+ Event::Start(Container::TableCell { .. }, _) => {
29
+ current_cell = String::new();
30
+ in_table_cell = true;
31
+ }
32
+ Event::Str(s) if in_table_cell => {
33
+ current_cell.push_str(s.as_ref());
34
+ }
35
+ Event::End(Container::TableCell { .. }) => {
36
+ if in_table_cell {
37
+ current_row.push(current_cell.trim().to_string());
38
+ current_cell = String::new();
39
+ in_table_cell = false;
40
+ }
41
+ }
42
+ Event::End(Container::TableRow { .. }) => {
43
+ if !current_row.is_empty()
44
+ && let Some((ref mut rows, _)) = current_table
45
+ {
46
+ rows.push(current_row.clone());
47
+ }
48
+ current_row = Vec::new();
49
+ }
50
+ Event::End(Container::Table) => {
51
+ if let Some((cells, idx)) = current_table.take()
52
+ && !cells.is_empty()
53
+ {
54
+ let markdown = crate::extractors::frontmatter_utils::cells_to_markdown(&cells);
55
+ tables.push(Table {
56
+ cells,
57
+ markdown,
58
+ page_number: idx + 1,
59
+ });
60
+ table_index += 1;
61
+ }
62
+ }
63
+ _ => {}
64
+ }
65
+ }
66
+
67
+ tables
68
+ }
@@ -0,0 +1,61 @@
1
+ //! Text extraction from Djot events.
2
+ //!
3
+ //! Handles parsing of jotdown events into plain text.
4
+
5
+ use jotdown::Event;
6
+
7
+ /// Extract plain text from Djot events.
8
+ ///
9
+ /// Processes djot events and extracts plain text content, handling:
10
+ /// - Text content
11
+ /// - Line breaks (soft, hard, blank)
12
+ /// - Smart punctuation (quotes, dashes, ellipsis)
13
+ /// - Special symbols and footnote references
14
+ pub fn extract_text_from_events(events: &[Event]) -> String {
15
+ let mut text = String::new();
16
+
17
+ for event in events {
18
+ match event {
19
+ Event::Str(s) => {
20
+ text.push_str(s.as_ref());
21
+ }
22
+ Event::Softbreak | Event::Hardbreak | Event::Blankline => {
23
+ text.push('\n');
24
+ }
25
+ Event::NonBreakingSpace => {
26
+ text.push(' ');
27
+ }
28
+ Event::LeftSingleQuote | Event::RightSingleQuote => {
29
+ text.push('\'');
30
+ }
31
+ Event::LeftDoubleQuote | Event::RightDoubleQuote => {
32
+ text.push('"');
33
+ }
34
+ Event::Ellipsis => {
35
+ text.push_str("...");
36
+ }
37
+ Event::EnDash => {
38
+ text.push_str("--");
39
+ }
40
+ Event::EmDash => {
41
+ text.push_str("---");
42
+ }
43
+ Event::FootnoteReference(s) => {
44
+ text.push('[');
45
+ text.push_str(s.as_ref());
46
+ text.push(']');
47
+ }
48
+ Event::Symbol(s) => {
49
+ text.push(':');
50
+ text.push_str(s.as_ref());
51
+ text.push(':');
52
+ }
53
+ Event::ThematicBreak(_) => {
54
+ text.push_str("\n---\n");
55
+ }
56
+ Event::Start(_, _) | Event::End(_) | Event::Escape | Event::Attributes(_) => {}
57
+ }
58
+ }
59
+
60
+ text
61
+ }