kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,271 @@
1
+ //! Block-level container handlers for Djot parsing.
2
+
3
+ use super::state::{ExtractionState, push_block};
4
+ use crate::types::{Attributes, BlockType, FormattedBlock};
5
+ use jotdown::Container;
6
+
7
+ /// Handle start of block containers.
8
+ pub(super) fn handle_block_start(
9
+ state: &mut ExtractionState,
10
+ container: &Container,
11
+ _attrs: &jotdown::Attributes,
12
+ parsed_attrs: Option<Attributes>,
13
+ footnotes: &mut Vec<crate::types::Footnote>,
14
+ ) -> bool {
15
+ match container {
16
+ Container::Heading { level, .. } => {
17
+ push_block(
18
+ state,
19
+ FormattedBlock {
20
+ block_type: BlockType::Heading,
21
+ level: Some(*level as usize),
22
+ inline_content: Vec::new(),
23
+ attributes: parsed_attrs,
24
+ language: None,
25
+ code: None,
26
+ children: Vec::new(),
27
+ },
28
+ );
29
+ true
30
+ }
31
+ Container::Paragraph => {
32
+ push_block(
33
+ state,
34
+ FormattedBlock {
35
+ block_type: BlockType::Paragraph,
36
+ level: None,
37
+ inline_content: Vec::new(),
38
+ attributes: parsed_attrs,
39
+ language: None,
40
+ code: None,
41
+ children: Vec::new(),
42
+ },
43
+ );
44
+ true
45
+ }
46
+ Container::Blockquote => {
47
+ push_block(
48
+ state,
49
+ FormattedBlock {
50
+ block_type: BlockType::Blockquote,
51
+ level: None,
52
+ inline_content: Vec::new(),
53
+ attributes: parsed_attrs,
54
+ language: None,
55
+ code: None,
56
+ children: Vec::new(),
57
+ },
58
+ );
59
+ true
60
+ }
61
+ Container::CodeBlock { language } => {
62
+ let lang_str = if language.is_empty() {
63
+ None
64
+ } else {
65
+ Some(language.to_string())
66
+ };
67
+ state.in_code_block = true;
68
+ state.code_content.clear();
69
+ push_block(
70
+ state,
71
+ FormattedBlock {
72
+ block_type: BlockType::CodeBlock,
73
+ level: None,
74
+ inline_content: Vec::new(),
75
+ attributes: parsed_attrs,
76
+ language: lang_str,
77
+ code: Some(String::new()),
78
+ children: Vec::new(),
79
+ },
80
+ );
81
+ true
82
+ }
83
+ Container::RawBlock { format } => {
84
+ state.in_raw_block = true;
85
+ state.raw_format = Some(format.to_string());
86
+ state.code_content.clear();
87
+ push_block(
88
+ state,
89
+ FormattedBlock {
90
+ block_type: BlockType::RawBlock,
91
+ level: None,
92
+ inline_content: Vec::new(),
93
+ attributes: parsed_attrs,
94
+ language: Some(format.to_string()),
95
+ code: Some(String::new()),
96
+ children: Vec::new(),
97
+ },
98
+ );
99
+ true
100
+ }
101
+ Container::List { kind, .. } => {
102
+ let block_type = match kind {
103
+ jotdown::ListKind::Ordered { .. } => BlockType::OrderedList,
104
+ jotdown::ListKind::Unordered(_) => BlockType::BulletList,
105
+ jotdown::ListKind::Task(_) => BlockType::TaskList,
106
+ };
107
+ push_block(
108
+ state,
109
+ FormattedBlock {
110
+ block_type,
111
+ level: None,
112
+ inline_content: Vec::new(),
113
+ attributes: parsed_attrs,
114
+ language: None,
115
+ code: None,
116
+ children: Vec::new(),
117
+ },
118
+ );
119
+ true
120
+ }
121
+ Container::ListItem => {
122
+ push_block(
123
+ state,
124
+ FormattedBlock {
125
+ block_type: BlockType::ListItem,
126
+ level: None,
127
+ inline_content: Vec::new(),
128
+ attributes: parsed_attrs,
129
+ language: None,
130
+ code: None,
131
+ children: Vec::new(),
132
+ },
133
+ );
134
+ true
135
+ }
136
+ Container::TaskListItem { checked } => {
137
+ let mut attrs = parsed_attrs.unwrap_or_default();
138
+ attrs.key_values.insert("checked".to_string(), checked.to_string());
139
+ push_block(
140
+ state,
141
+ FormattedBlock {
142
+ block_type: BlockType::ListItem,
143
+ level: None,
144
+ inline_content: Vec::new(),
145
+ attributes: Some(attrs),
146
+ language: None,
147
+ code: None,
148
+ children: Vec::new(),
149
+ },
150
+ );
151
+ true
152
+ }
153
+ Container::DescriptionList => {
154
+ push_block(
155
+ state,
156
+ FormattedBlock {
157
+ block_type: BlockType::DefinitionList,
158
+ level: None,
159
+ inline_content: Vec::new(),
160
+ attributes: parsed_attrs,
161
+ language: None,
162
+ code: None,
163
+ children: Vec::new(),
164
+ },
165
+ );
166
+ true
167
+ }
168
+ Container::DescriptionTerm => {
169
+ push_block(
170
+ state,
171
+ FormattedBlock {
172
+ block_type: BlockType::DefinitionTerm,
173
+ level: None,
174
+ inline_content: Vec::new(),
175
+ attributes: parsed_attrs,
176
+ language: None,
177
+ code: None,
178
+ children: Vec::new(),
179
+ },
180
+ );
181
+ true
182
+ }
183
+ Container::DescriptionDetails => {
184
+ push_block(
185
+ state,
186
+ FormattedBlock {
187
+ block_type: BlockType::DefinitionDescription,
188
+ level: None,
189
+ inline_content: Vec::new(),
190
+ attributes: parsed_attrs,
191
+ language: None,
192
+ code: None,
193
+ children: Vec::new(),
194
+ },
195
+ );
196
+ true
197
+ }
198
+ Container::Div { .. } => {
199
+ push_block(
200
+ state,
201
+ FormattedBlock {
202
+ block_type: BlockType::Div,
203
+ level: None,
204
+ inline_content: Vec::new(),
205
+ attributes: parsed_attrs,
206
+ language: None,
207
+ code: None,
208
+ children: Vec::new(),
209
+ },
210
+ );
211
+ true
212
+ }
213
+ Container::Section { .. } => {
214
+ push_block(
215
+ state,
216
+ FormattedBlock {
217
+ block_type: BlockType::Section,
218
+ level: None,
219
+ inline_content: Vec::new(),
220
+ attributes: parsed_attrs,
221
+ language: None,
222
+ code: None,
223
+ children: Vec::new(),
224
+ },
225
+ );
226
+ true
227
+ }
228
+ Container::Footnote { label } => {
229
+ // Start tracking a footnote definition
230
+ footnotes.push(crate::types::Footnote {
231
+ label: label.to_string(),
232
+ content: Vec::new(),
233
+ });
234
+ // We'll collect the content as blocks
235
+ push_block(
236
+ state,
237
+ FormattedBlock {
238
+ block_type: BlockType::Paragraph,
239
+ level: None,
240
+ inline_content: Vec::new(),
241
+ attributes: parsed_attrs,
242
+ language: None,
243
+ code: None,
244
+ children: Vec::new(),
245
+ },
246
+ );
247
+ true
248
+ }
249
+ _ => false,
250
+ }
251
+ }
252
+
253
+ /// Handle end of block containers.
254
+ pub(super) fn handle_block_end(_state: &mut ExtractionState, container: &Container) -> bool {
255
+ matches!(
256
+ container,
257
+ Container::Heading { .. }
258
+ | Container::Paragraph
259
+ | Container::Blockquote
260
+ | Container::CodeBlock { .. }
261
+ | Container::RawBlock { .. }
262
+ | Container::Div { .. }
263
+ | Container::Section { .. }
264
+ | Container::List { .. }
265
+ | Container::ListItem
266
+ | Container::TaskListItem { .. }
267
+ | Container::DescriptionList
268
+ | Container::DescriptionTerm
269
+ | Container::DescriptionDetails
270
+ )
271
+ }
@@ -0,0 +1,257 @@
1
+ //! Complete Djot content extraction.
2
+ //!
3
+ //! Handles extraction of rich DjotContent structures from Djot events.
4
+
5
+ use super::block_handlers::{handle_block_end, handle_block_start};
6
+ use super::event_handlers::{
7
+ finalize_block_element, handle_footnote_end, handle_footnote_reference, handle_symbol, handle_thematic_break,
8
+ };
9
+ use super::inline_handlers::{
10
+ finalize_inline_element, handle_image_end, handle_inline_end, handle_inline_start, handle_link_end, handle_math_end,
11
+ };
12
+ use super::state::{ExtractionState, pop_block};
13
+ use super::text_extraction::extract_text_from_events;
14
+ use crate::extractors::djot_format::attributes::parse_jotdown_attributes;
15
+ use crate::types::{Attributes, DjotContent, DjotImage, DjotLink, FormattedBlock};
16
+ use jotdown::{Container, Event};
17
+ use std::collections::HashMap;
18
+
19
+ /// Extract complete djot content with 100% feature extraction.
20
+ ///
21
+ /// Processes ALL djot events to build a rich DjotContent structure including:
22
+ /// - Block structure (headings, lists, blockquotes, divs, sections, code blocks)
23
+ /// - Inline formatting (strong, emphasis, highlight, subscript, superscript, insert, delete)
24
+ /// - Attributes (classes, IDs, key-value pairs)
25
+ /// - Links and images with full metadata (href, src, alt, title)
26
+ /// - Math blocks (inline & display)
27
+ /// - Definition lists (term/description pairs)
28
+ /// - Task lists with checked state
29
+ /// - Raw blocks (HTML/LaTeX)
30
+ /// - Footnotes (references and definitions)
31
+ /// - Captions
32
+ /// - Smart punctuation
33
+ /// - All other djot features
34
+ pub fn extract_complete_djot_content(
35
+ events: &[Event],
36
+ metadata: crate::types::Metadata,
37
+ tables: Vec<crate::types::Table>,
38
+ ) -> DjotContent {
39
+ let plain_text = extract_text_from_events(events);
40
+
41
+ let mut blocks = Vec::new();
42
+ let mut images = Vec::new();
43
+ let mut links = Vec::new();
44
+ let mut footnotes = Vec::new();
45
+ let attributes_map: HashMap<String, Attributes> = HashMap::new();
46
+
47
+ let mut state = ExtractionState::new();
48
+
49
+ for event in events {
50
+ match event {
51
+ Event::Start(container, attrs) => {
52
+ handle_start_event(
53
+ &mut state,
54
+ container,
55
+ attrs,
56
+ &mut blocks,
57
+ &mut images,
58
+ &mut links,
59
+ &mut footnotes,
60
+ );
61
+ }
62
+ Event::End(container) => {
63
+ handle_end_event(
64
+ &mut state,
65
+ container,
66
+ &mut blocks,
67
+ &mut images,
68
+ &mut links,
69
+ &mut footnotes,
70
+ );
71
+ }
72
+ Event::Str(s) => {
73
+ if state.in_code_block || state.in_raw_block {
74
+ state.code_content.push_str(s);
75
+ } else if state.in_math {
76
+ state.math_content.push_str(s);
77
+ } else {
78
+ state.current_text.push_str(s);
79
+ }
80
+ }
81
+ Event::FootnoteReference(label) => {
82
+ handle_footnote_reference(&mut state, label);
83
+ }
84
+ Event::Symbol(sym) => {
85
+ handle_symbol(&mut state, sym);
86
+ }
87
+ Event::Attributes(attrs) => {
88
+ // Store attributes to be applied to the next element
89
+ state.pending_attributes = Some(parse_jotdown_attributes(attrs));
90
+ }
91
+ Event::Softbreak => {
92
+ if state.in_math {
93
+ state.math_content.push(' ');
94
+ } else if !state.inline_type_stack.is_empty() {
95
+ state.current_text.push(' ');
96
+ } else {
97
+ state.current_text.push('\n');
98
+ }
99
+ }
100
+ Event::Hardbreak => {
101
+ if state.in_math {
102
+ state.math_content.push('\n');
103
+ } else {
104
+ state.current_text.push('\n');
105
+ }
106
+ }
107
+ Event::NonBreakingSpace => {
108
+ state.current_text.push(' ');
109
+ }
110
+ Event::Blankline => {
111
+ // Blank lines are typically ignored in block processing
112
+ }
113
+ Event::ThematicBreak(attrs) => {
114
+ handle_thematic_break(&mut state, attrs, &mut blocks);
115
+ }
116
+ // Smart punctuation events
117
+ Event::LeftSingleQuote => {
118
+ state.current_text.push('\'');
119
+ }
120
+ Event::RightSingleQuote => {
121
+ state.current_text.push('\'');
122
+ }
123
+ Event::LeftDoubleQuote => {
124
+ state.current_text.push('"');
125
+ }
126
+ Event::RightDoubleQuote => {
127
+ state.current_text.push('"');
128
+ }
129
+ Event::Ellipsis => {
130
+ state.current_text.push_str("...");
131
+ }
132
+ Event::EnDash => {
133
+ state.current_text.push_str("--");
134
+ }
135
+ Event::EmDash => {
136
+ state.current_text.push_str("---");
137
+ }
138
+ Event::Escape => {
139
+ // Escape is a marker, doesn't produce output
140
+ }
141
+ }
142
+ }
143
+
144
+ // Finalize any remaining content
145
+ state.flush_text();
146
+
147
+ // Pop any remaining blocks
148
+ while !state.block_stack.is_empty() {
149
+ pop_block(&mut state, &mut blocks);
150
+ }
151
+
152
+ // Add any remaining inline elements to the last block if exists
153
+ if !state.current_inline_elements.is_empty()
154
+ && let Some(last_block) = blocks.last_mut()
155
+ {
156
+ last_block.inline_content.append(&mut state.current_inline_elements);
157
+ }
158
+
159
+ DjotContent {
160
+ plain_text,
161
+ blocks,
162
+ metadata,
163
+ tables,
164
+ images,
165
+ links,
166
+ footnotes,
167
+ attributes: attributes_map,
168
+ }
169
+ }
170
+
171
+ /// Handle start of a container event.
172
+ fn handle_start_event(
173
+ state: &mut ExtractionState,
174
+ container: &Container,
175
+ attrs: &jotdown::Attributes,
176
+ _blocks: &mut Vec<FormattedBlock>,
177
+ images: &mut Vec<DjotImage>,
178
+ links: &mut Vec<DjotLink>,
179
+ footnotes: &mut Vec<crate::types::Footnote>,
180
+ ) {
181
+ // Parse attributes from jotdown's Attributes type
182
+ let parsed_attrs = if attrs.is_empty() {
183
+ state.pending_attributes.take()
184
+ } else {
185
+ Some(parse_jotdown_attributes(attrs))
186
+ };
187
+
188
+ // Try block handlers first
189
+ if handle_block_start(state, container, attrs, parsed_attrs.clone(), footnotes) {
190
+ return;
191
+ }
192
+
193
+ // Try inline handlers
194
+ if handle_inline_start(state, container, parsed_attrs, images, links) {
195
+ return;
196
+ }
197
+
198
+ // Handle remaining containers (tables, link definitions, etc.)
199
+ match container {
200
+ Container::Table | Container::TableRow { .. } | Container::TableCell { .. } | Container::Caption => {
201
+ // Tables are extracted separately
202
+ }
203
+ Container::LinkDefinition { .. } => {
204
+ // Link definitions are resolved by jotdown, not needed in output
205
+ }
206
+ _ => {}
207
+ }
208
+ }
209
+
210
+ /// Handle end of a container event.
211
+ fn handle_end_event(
212
+ state: &mut ExtractionState,
213
+ container: &Container,
214
+ blocks: &mut Vec<FormattedBlock>,
215
+ images: &mut [DjotImage],
216
+ links: &mut [DjotLink],
217
+ footnotes: &mut [crate::types::Footnote],
218
+ ) {
219
+ // Check if it's a block container
220
+ if handle_block_end(state, container) {
221
+ finalize_block_element(state, blocks);
222
+ return;
223
+ }
224
+
225
+ // Handle special cases
226
+ match container {
227
+ Container::Footnote { .. } => {
228
+ handle_footnote_end(state, footnotes);
229
+ }
230
+ Container::Math { display } => {
231
+ handle_math_end(state, *display);
232
+ }
233
+ Container::Link(url, _) => {
234
+ handle_link_end(state, url, links);
235
+ }
236
+ Container::Image(src, _) => {
237
+ handle_image_end(state, src, images);
238
+ }
239
+ _ => {
240
+ // Check if it's an inline element
241
+ if handle_inline_end(state, container) {
242
+ finalize_inline_element(state, container);
243
+ }
244
+ }
245
+ }
246
+
247
+ // Handle remaining containers (tables, link definitions, etc.)
248
+ match container {
249
+ Container::Table | Container::TableRow { .. } | Container::TableCell { .. } | Container::Caption => {
250
+ // Tables are handled separately
251
+ }
252
+ Container::LinkDefinition { .. } => {
253
+ // Link definitions don't produce output
254
+ }
255
+ _ => {}
256
+ }
257
+ }
@@ -0,0 +1,101 @@
1
+ //! Event handlers for special Djot elements.
2
+
3
+ use super::state::{ExtractionState, pop_block};
4
+ use crate::extractors::djot_format::attributes::parse_jotdown_attributes;
5
+ use crate::types::{BlockType, FormattedBlock, InlineElement, InlineType};
6
+ use std::collections::HashMap;
7
+
8
+ /// Handle footnote reference event.
9
+ pub(super) fn handle_footnote_reference(state: &mut ExtractionState, label: &str) {
10
+ state.flush_text();
11
+
12
+ let mut meta = HashMap::new();
13
+ meta.insert("label".to_string(), label.to_string());
14
+
15
+ state.current_inline_elements.push(InlineElement {
16
+ element_type: InlineType::FootnoteRef,
17
+ content: label.to_string(),
18
+ attributes: None,
19
+ metadata: Some(meta),
20
+ });
21
+ }
22
+
23
+ /// Handle symbol event.
24
+ pub(super) fn handle_symbol(state: &mut ExtractionState, sym: &str) {
25
+ state.flush_text();
26
+
27
+ state.current_inline_elements.push(InlineElement {
28
+ element_type: InlineType::Symbol,
29
+ content: sym.to_string(),
30
+ attributes: None,
31
+ metadata: None,
32
+ });
33
+ }
34
+
35
+ /// Handle thematic break event.
36
+ pub(super) fn handle_thematic_break(
37
+ state: &mut ExtractionState,
38
+ attrs: &jotdown::Attributes,
39
+ blocks: &mut Vec<FormattedBlock>,
40
+ ) {
41
+ state.flush_text();
42
+
43
+ let parsed_attrs = if attrs.is_empty() {
44
+ None
45
+ } else {
46
+ Some(parse_jotdown_attributes(attrs))
47
+ };
48
+
49
+ let hr_block = FormattedBlock {
50
+ block_type: BlockType::ThematicBreak,
51
+ level: None,
52
+ inline_content: Vec::new(),
53
+ attributes: parsed_attrs,
54
+ language: None,
55
+ code: None,
56
+ children: Vec::new(),
57
+ };
58
+
59
+ if let Some(parent) = state.block_stack.last_mut() {
60
+ parent.children.push(hr_block);
61
+ } else {
62
+ blocks.push(hr_block);
63
+ }
64
+ }
65
+
66
+ /// Handle end of footnote definition.
67
+ pub(super) fn handle_footnote_end(state: &mut ExtractionState, footnotes: &mut [crate::types::Footnote]) {
68
+ state.flush_text();
69
+ // Pop the footnote content block and add to the last footnote
70
+ if let Some(mut block) = state.block_stack.pop() {
71
+ block.inline_content.append(&mut state.current_inline_elements);
72
+ if let Some(footnote) = footnotes.last_mut() {
73
+ footnote.content.push(block);
74
+ }
75
+ }
76
+ }
77
+
78
+ /// Finalize block element content and pop from stack.
79
+ pub(super) fn finalize_block_element(state: &mut ExtractionState, blocks: &mut Vec<FormattedBlock>) {
80
+ // Flush any remaining text
81
+ state.flush_text();
82
+
83
+ // For code blocks, set the accumulated code content
84
+ if state.in_code_block {
85
+ if let Some(block) = state.block_stack.last_mut() {
86
+ block.code = Some(std::mem::take(&mut state.code_content));
87
+ }
88
+ state.in_code_block = false;
89
+ }
90
+
91
+ // For raw blocks
92
+ if state.in_raw_block {
93
+ if let Some(block) = state.block_stack.last_mut() {
94
+ block.code = Some(std::mem::take(&mut state.code_content));
95
+ }
96
+ state.in_raw_block = false;
97
+ state.raw_format = None;
98
+ }
99
+
100
+ pop_block(state, blocks);
101
+ }