kreuzberg 4.0.7 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +24 -16
  3. data/README.md +4 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,201 @@
1
+ //! Inline-level element handlers for Djot parsing.
2
+
3
+ use super::state::ExtractionState;
4
+ use crate::types::{DjotImage, DjotLink, InlineElement, InlineType};
5
+ use jotdown::Container;
6
+ use std::collections::HashMap;
7
+
8
+ /// Handle start of inline elements.
9
+ pub(super) fn handle_inline_start(
10
+ state: &mut ExtractionState,
11
+ container: &Container,
12
+ parsed_attrs: Option<crate::types::Attributes>,
13
+ images: &mut Vec<DjotImage>,
14
+ links: &mut Vec<DjotLink>,
15
+ ) -> bool {
16
+ match container {
17
+ Container::Math { display } => {
18
+ state.in_math = true;
19
+ state.math_display = *display;
20
+ state.math_content.clear();
21
+ state.inline_type_stack.push(InlineType::Math);
22
+ true
23
+ }
24
+ Container::Strong => {
25
+ state.inline_type_stack.push(InlineType::Strong);
26
+ state.flush_text();
27
+ true
28
+ }
29
+ Container::Emphasis => {
30
+ state.inline_type_stack.push(InlineType::Emphasis);
31
+ state.flush_text();
32
+ true
33
+ }
34
+ Container::Mark => {
35
+ state.inline_type_stack.push(InlineType::Highlight);
36
+ state.flush_text();
37
+ true
38
+ }
39
+ Container::Subscript => {
40
+ state.inline_type_stack.push(InlineType::Subscript);
41
+ state.flush_text();
42
+ true
43
+ }
44
+ Container::Superscript => {
45
+ state.inline_type_stack.push(InlineType::Superscript);
46
+ state.flush_text();
47
+ true
48
+ }
49
+ Container::Insert => {
50
+ state.inline_type_stack.push(InlineType::Insert);
51
+ state.flush_text();
52
+ true
53
+ }
54
+ Container::Delete => {
55
+ state.inline_type_stack.push(InlineType::Delete);
56
+ state.flush_text();
57
+ true
58
+ }
59
+ Container::Verbatim => {
60
+ state.inline_type_stack.push(InlineType::Code);
61
+ state.flush_text();
62
+ true
63
+ }
64
+ Container::Span => {
65
+ state.inline_type_stack.push(InlineType::Span);
66
+ state.flush_text();
67
+ true
68
+ }
69
+ Container::RawInline { format } => {
70
+ state.inline_type_stack.push(InlineType::RawInline);
71
+ state.raw_format = Some(format.to_string());
72
+ state.flush_text();
73
+ true
74
+ }
75
+ Container::Link(url, _link_type) => {
76
+ state.inline_type_stack.push(InlineType::Link);
77
+ links.push(DjotLink {
78
+ url: url.to_string(),
79
+ text: String::new(),
80
+ title: None,
81
+ attributes: parsed_attrs,
82
+ });
83
+ state.current_link_index = Some(links.len() - 1);
84
+ state.flush_text();
85
+ true
86
+ }
87
+ Container::Image(src, _link_type) => {
88
+ state.inline_type_stack.push(InlineType::Image);
89
+ images.push(DjotImage {
90
+ src: src.to_string(),
91
+ alt: String::new(),
92
+ title: None,
93
+ attributes: parsed_attrs,
94
+ });
95
+ state.current_image_index = Some(images.len() - 1);
96
+ state.flush_text();
97
+ true
98
+ }
99
+ _ => false,
100
+ }
101
+ }
102
+
103
+ /// Handle end of inline elements.
104
+ pub(super) fn handle_inline_end(_state: &mut ExtractionState, container: &Container) -> bool {
105
+ matches!(
106
+ container,
107
+ Container::Strong
108
+ | Container::Emphasis
109
+ | Container::Mark
110
+ | Container::Subscript
111
+ | Container::Superscript
112
+ | Container::Insert
113
+ | Container::Delete
114
+ | Container::Verbatim
115
+ | Container::Span
116
+ | Container::RawInline { .. }
117
+ )
118
+ }
119
+
120
+ /// Handle end of math element.
121
+ pub(super) fn handle_math_end(state: &mut ExtractionState, display: bool) {
122
+ state.in_math = false;
123
+ let math_text = std::mem::take(&mut state.math_content);
124
+ state.inline_type_stack.pop();
125
+
126
+ let mut meta = HashMap::new();
127
+ meta.insert("display".to_string(), display.to_string());
128
+
129
+ state.current_inline_elements.push(InlineElement {
130
+ element_type: InlineType::Math,
131
+ content: math_text,
132
+ attributes: state.pending_attributes.take(),
133
+ metadata: Some(meta),
134
+ });
135
+ }
136
+
137
+ /// Finalize inline element content.
138
+ pub(super) fn finalize_inline_element(state: &mut ExtractionState, container: &Container) {
139
+ if let Some(inline_type) = state.inline_type_stack.pop() {
140
+ let content = std::mem::take(&mut state.current_text);
141
+ let mut meta = None;
142
+
143
+ // For raw inline, include the format
144
+ if matches!(container, Container::RawInline { .. })
145
+ && let Some(fmt) = state.raw_format.take()
146
+ {
147
+ let mut m = HashMap::new();
148
+ m.insert("format".to_string(), fmt);
149
+ meta = Some(m);
150
+ }
151
+
152
+ state.current_inline_elements.push(InlineElement {
153
+ element_type: inline_type,
154
+ content,
155
+ attributes: state.pending_attributes.take(),
156
+ metadata: meta,
157
+ });
158
+ }
159
+ }
160
+
161
+ /// Handle end of link element.
162
+ pub(super) fn handle_link_end(state: &mut ExtractionState, url: &str, links: &mut [DjotLink]) {
163
+ if let Some(idx) = state.current_link_index.take() {
164
+ let text = std::mem::take(&mut state.current_text);
165
+ if let Some(link) = links.get_mut(idx) {
166
+ link.text = text.clone();
167
+ }
168
+ state.inline_type_stack.pop();
169
+
170
+ let mut meta = HashMap::new();
171
+ meta.insert("href".to_string(), url.to_string());
172
+
173
+ state.current_inline_elements.push(InlineElement {
174
+ element_type: InlineType::Link,
175
+ content: text,
176
+ attributes: state.pending_attributes.take(),
177
+ metadata: Some(meta),
178
+ });
179
+ }
180
+ }
181
+
182
+ /// Handle end of image element.
183
+ pub(super) fn handle_image_end(state: &mut ExtractionState, src: &str, images: &mut [DjotImage]) {
184
+ if let Some(idx) = state.current_image_index.take() {
185
+ let alt = std::mem::take(&mut state.current_text);
186
+ if let Some(image) = images.get_mut(idx) {
187
+ image.alt = alt.clone();
188
+ }
189
+ state.inline_type_stack.pop();
190
+
191
+ let mut meta = HashMap::new();
192
+ meta.insert("src".to_string(), src.to_string());
193
+
194
+ state.current_inline_elements.push(InlineElement {
195
+ element_type: InlineType::Image,
196
+ content: alt,
197
+ attributes: state.pending_attributes.take(),
198
+ metadata: Some(meta),
199
+ });
200
+ }
201
+ }
@@ -0,0 +1,16 @@
1
+ //! Djot event parsing and content extraction.
2
+ //!
3
+ //! Handles parsing of jotdown events into plain text, tables, and full DjotContent structures.
4
+
5
+ mod block_handlers;
6
+ mod content_extraction;
7
+ mod event_handlers;
8
+ mod inline_handlers;
9
+ mod state;
10
+ mod table_extraction;
11
+ mod text_extraction;
12
+
13
+ // Re-export public API for backward compatibility
14
+ pub use content_extraction::extract_complete_djot_content;
15
+ pub use table_extraction::extract_tables_from_events;
16
+ pub use text_extraction::extract_text_from_events;
@@ -0,0 +1,78 @@
1
+ //! State management for Djot content extraction.
2
+ //!
3
+ //! Provides extraction state and helper functions for parsing Djot events.
4
+
5
+ use crate::types::{Attributes, FormattedBlock, InlineElement, InlineType};
6
+
7
+ /// Enhanced state tracking using a block stack for proper nesting.
8
+ pub(super) struct ExtractionState {
9
+ pub block_stack: Vec<FormattedBlock>, // Stack for nested blocks
10
+ pub inline_type_stack: Vec<InlineType>, // Stack for nested inline element types
11
+ pub current_text: String, // Text accumulator
12
+ pub pending_attributes: Option<Attributes>,
13
+ pub code_content: String, // Accumulator for code blocks
14
+ pub in_code_block: bool,
15
+ pub in_math: bool,
16
+ pub math_display: bool,
17
+ pub math_content: String,
18
+ pub current_link_index: Option<usize>,
19
+ pub current_image_index: Option<usize>,
20
+ pub in_raw_block: bool,
21
+ pub raw_format: Option<String>,
22
+ pub current_inline_elements: Vec<InlineElement>,
23
+ }
24
+
25
+ impl ExtractionState {
26
+ /// Create a new extraction state.
27
+ pub fn new() -> Self {
28
+ Self {
29
+ block_stack: Vec::new(),
30
+ inline_type_stack: Vec::new(),
31
+ current_text: String::new(),
32
+ pending_attributes: None,
33
+ code_content: String::new(),
34
+ in_code_block: false,
35
+ in_math: false,
36
+ math_display: false,
37
+ math_content: String::new(),
38
+ current_link_index: None,
39
+ current_image_index: None,
40
+ in_raw_block: false,
41
+ raw_format: None,
42
+ current_inline_elements: Vec::new(),
43
+ }
44
+ }
45
+
46
+ /// Flush current text to inline elements if any text is pending.
47
+ pub fn flush_text(&mut self) {
48
+ if !self.current_text.is_empty() {
49
+ self.current_inline_elements.push(InlineElement {
50
+ element_type: InlineType::Text,
51
+ content: std::mem::take(&mut self.current_text),
52
+ attributes: None,
53
+ metadata: None,
54
+ });
55
+ }
56
+ }
57
+ }
58
+
59
+ /// Helper to create a new block and push to stack.
60
+ pub(super) fn push_block(state: &mut ExtractionState, block: FormattedBlock) {
61
+ state.block_stack.push(block);
62
+ }
63
+
64
+ /// Helper to pop a block from the stack and add to parent or blocks list.
65
+ pub(super) fn pop_block(state: &mut ExtractionState, blocks: &mut Vec<FormattedBlock>) {
66
+ if let Some(mut block) = state.block_stack.pop() {
67
+ // Add any pending inline elements to the block
68
+ if !state.current_inline_elements.is_empty() {
69
+ block.inline_content.append(&mut state.current_inline_elements);
70
+ }
71
+ // If there's a parent block, add as child; otherwise add to top-level blocks
72
+ if let Some(parent) = state.block_stack.last_mut() {
73
+ parent.children.push(block);
74
+ } else {
75
+ blocks.push(block);
76
+ }
77
+ }
78
+ }
@@ -0,0 +1,68 @@
1
+ //! Table extraction from Djot events.
2
+ //!
3
+ //! Parses table events and extracts table data.
4
+
5
+ use crate::types::Table;
6
+ use jotdown::{Container, Event};
7
+
8
+ /// Extract tables from Djot events.
9
+ ///
10
+ /// Parses table events and extracts table data as a Vec<Vec<String>>,
11
+ /// converting each table to markdown representation for storage.
12
+ pub fn extract_tables_from_events(events: &[Event]) -> Vec<Table> {
13
+ let mut tables = Vec::new();
14
+ let mut current_table: Option<(Vec<Vec<String>>, usize)> = None;
15
+ let mut current_row: Vec<String> = Vec::new();
16
+ let mut current_cell = String::new();
17
+ let mut in_table_cell = false;
18
+ let mut table_index = 0;
19
+
20
+ for event in events {
21
+ match event {
22
+ Event::Start(Container::Table, _) => {
23
+ current_table = Some((Vec::new(), table_index));
24
+ }
25
+ Event::Start(Container::TableRow { .. }, _) => {
26
+ current_row = Vec::new();
27
+ }
28
+ Event::Start(Container::TableCell { .. }, _) => {
29
+ current_cell = String::new();
30
+ in_table_cell = true;
31
+ }
32
+ Event::Str(s) if in_table_cell => {
33
+ current_cell.push_str(s.as_ref());
34
+ }
35
+ Event::End(Container::TableCell { .. }) => {
36
+ if in_table_cell {
37
+ current_row.push(current_cell.trim().to_string());
38
+ current_cell = String::new();
39
+ in_table_cell = false;
40
+ }
41
+ }
42
+ Event::End(Container::TableRow { .. }) => {
43
+ if !current_row.is_empty()
44
+ && let Some((ref mut rows, _)) = current_table
45
+ {
46
+ rows.push(current_row.clone());
47
+ }
48
+ current_row = Vec::new();
49
+ }
50
+ Event::End(Container::Table) => {
51
+ if let Some((cells, idx)) = current_table.take()
52
+ && !cells.is_empty()
53
+ {
54
+ let markdown = crate::extractors::frontmatter_utils::cells_to_markdown(&cells);
55
+ tables.push(Table {
56
+ cells,
57
+ markdown,
58
+ page_number: idx + 1,
59
+ });
60
+ table_index += 1;
61
+ }
62
+ }
63
+ _ => {}
64
+ }
65
+ }
66
+
67
+ tables
68
+ }
@@ -0,0 +1,61 @@
1
+ //! Text extraction from Djot events.
2
+ //!
3
+ //! Handles parsing of jotdown events into plain text.
4
+
5
+ use jotdown::Event;
6
+
7
+ /// Extract plain text from Djot events.
8
+ ///
9
+ /// Processes djot events and extracts plain text content, handling:
10
+ /// - Text content
11
+ /// - Line breaks (soft, hard, blank)
12
+ /// - Smart punctuation (quotes, dashes, ellipsis)
13
+ /// - Special symbols and footnote references
14
+ pub fn extract_text_from_events(events: &[Event]) -> String {
15
+ let mut text = String::new();
16
+
17
+ for event in events {
18
+ match event {
19
+ Event::Str(s) => {
20
+ text.push_str(s.as_ref());
21
+ }
22
+ Event::Softbreak | Event::Hardbreak | Event::Blankline => {
23
+ text.push('\n');
24
+ }
25
+ Event::NonBreakingSpace => {
26
+ text.push(' ');
27
+ }
28
+ Event::LeftSingleQuote | Event::RightSingleQuote => {
29
+ text.push('\'');
30
+ }
31
+ Event::LeftDoubleQuote | Event::RightDoubleQuote => {
32
+ text.push('"');
33
+ }
34
+ Event::Ellipsis => {
35
+ text.push_str("...");
36
+ }
37
+ Event::EnDash => {
38
+ text.push_str("--");
39
+ }
40
+ Event::EmDash => {
41
+ text.push_str("---");
42
+ }
43
+ Event::FootnoteReference(s) => {
44
+ text.push('[');
45
+ text.push_str(s.as_ref());
46
+ text.push(']');
47
+ }
48
+ Event::Symbol(s) => {
49
+ text.push(':');
50
+ text.push_str(s.as_ref());
51
+ text.push(':');
52
+ }
53
+ Event::ThematicBreak(_) => {
54
+ text.push_str("\n---\n");
55
+ }
56
+ Event::Start(_, _) | Event::End(_) | Event::Escape | Event::Attributes(_) => {}
57
+ }
58
+ }
59
+
60
+ text
61
+ }