kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,27 @@
1
+ //! Type definitions for transformation operations.
2
+
3
/// Metadata about a detected list item.
///
/// Records the kind of list the item belongs to, the byte span it occupies in
/// the content string, and its indent level. The type is a plain value type,
/// so it derives `Hash` in addition to equality and can be used as a key in
/// hashed collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ListItemMetadata {
    /// Type of list (Bullet, Numbered, etc.)
    pub list_type: ListType,
    /// Starting byte offset in the content string
    pub byte_start: usize,
    /// Ending byte offset in the content string
    pub byte_end: usize,
    /// List item indent level
    pub indent_level: u32,
}

/// Type of list detection.
///
/// A field-less, `Copy` enum; `Hash` is derived so values can be used in sets
/// and as map keys (and so that [`ListItemMetadata`] can derive `Hash` too).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ListType {
    /// Bullet points (-, *, •, etc.)
    Bullet,
    /// Numbered lists (1., 2., etc.)
    Numbered,
    /// Lettered lists (a., b., A., B., etc.)
    Lettered,
    /// Indented items
    Indented,
}
@@ -78,6 +78,8 @@ fn build_archive_result(
78
78
  chunks: None,
79
79
  images: None,
80
80
  pages: None,
81
+ djot_content: None,
82
+ elements: None,
81
83
  }
82
84
  }
83
85
 
@@ -172,6 +172,8 @@ impl DocumentExtractor for BibtexExtractor {
172
172
  detected_languages: None,
173
173
  chunks: None,
174
174
  images: None,
175
+ djot_content: None,
176
+ elements: None,
175
177
  })
176
178
  }
177
179
 
@@ -0,0 +1,134 @@
1
+ //! Djot attribute parsing utilities.
2
+ //!
3
+ //! Handles parsing of Djot attributes from jotdown events and string syntax.
4
+
5
+ use std::collections::HashMap;
6
+
7
+ /// Parse jotdown attributes into our Attributes representation.
8
+ ///
9
+ /// Converts jotdown's internal attribute representation to Kreuzberg's
10
+ /// standardized Attributes struct, handling IDs, classes, and key-value pairs.
11
+ pub fn parse_jotdown_attributes(attrs: &jotdown::Attributes) -> crate::types::Attributes {
12
+ use crate::types::Attributes;
13
+ use jotdown::AttributeKind;
14
+
15
+ let mut id = None;
16
+ let mut classes = Vec::new();
17
+ let mut key_values = HashMap::new();
18
+
19
+ for (kind, value) in attrs.iter() {
20
+ match kind {
21
+ AttributeKind::Id => {
22
+ // Last ID wins if multiple are specified
23
+ id = Some(value.to_string());
24
+ }
25
+ AttributeKind::Class => {
26
+ classes.push(value.to_string());
27
+ }
28
+ AttributeKind::Pair { key } => {
29
+ key_values.insert(key.to_string(), value.to_string());
30
+ }
31
+ AttributeKind::Comment => {
32
+ // Comments are ignored in our representation
33
+ }
34
+ }
35
+ }
36
+
37
+ Attributes {
38
+ id,
39
+ classes,
40
+ key_values,
41
+ }
42
+ }
43
+
44
+ /// Parse djot attribute syntax from string: {.class #id key="value"}
45
+ #[allow(dead_code)]
46
+ pub fn parse_djot_attributes(attr_str: &str) -> crate::types::Attributes {
47
+ use crate::types::Attributes;
48
+
49
+ let mut attrs = Attributes {
50
+ id: None,
51
+ classes: Vec::new(),
52
+ key_values: HashMap::new(),
53
+ };
54
+
55
+ // Simple parser for attribute syntax
56
+ let tokens = attr_str.split_whitespace();
57
+
58
+ for token in tokens {
59
+ if let Some(class) = token.strip_prefix('.') {
60
+ // Class
61
+ attrs.classes.push(class.to_string());
62
+ } else if let Some(id) = token.strip_prefix('#') {
63
+ // ID
64
+ attrs.id = Some(id.to_string());
65
+ } else if token.contains('=') {
66
+ // Key-value pair
67
+ if let Some((key, value)) = token.split_once('=') {
68
+ let clean_value = value.trim_matches('"').trim_matches('\'');
69
+ attrs.key_values.insert(key.to_string(), clean_value.to_string());
70
+ }
71
+ }
72
+ }
73
+
74
+ attrs
75
+ }
76
+
77
+ /// Render attributes to djot attribute syntax.
78
+ ///
79
+ /// Converts Kreuzberg's Attributes struct back to djot attribute syntax:
80
+ /// {.class #id key="value"}
81
+ pub fn render_attributes(attrs: &crate::types::Attributes) -> String {
82
+ let mut parts = Vec::new();
83
+
84
+ if let Some(ref id) = attrs.id {
85
+ parts.push(format!("#{}", id));
86
+ }
87
+
88
+ for class in &attrs.classes {
89
+ parts.push(format!(".{}", class));
90
+ }
91
+
92
+ for (key, value) in &attrs.key_values {
93
+ parts.push(format!("{}=\"{}\"", key, value));
94
+ }
95
+
96
+ if parts.is_empty() {
97
+ String::new()
98
+ } else {
99
+ format!("{{{}}}", parts.join(" "))
100
+ }
101
+ }
102
+
103
#[cfg(test)]
mod tests {
    use super::*;

    /// An id, two classes and one key/value pair must all appear in the output.
    #[test]
    fn test_render_attributes_with_all_parts() {
        let mut key_values = HashMap::new();
        key_values.insert(String::from("data-test"), String::from("value"));

        let attrs = crate::types::Attributes {
            id: Some(String::from("my-id")),
            classes: vec![String::from("class1"), String::from("class2")],
            key_values,
        };

        let rendered = render_attributes(&attrs);
        for expected in ["#my-id", ".class1", ".class2", "data-test"].iter() {
            assert!(rendered.contains(expected));
        }
    }

    /// Attributes with no id, classes or pairs render to the empty string.
    #[test]
    fn test_render_attributes_empty() {
        let attrs = crate::types::Attributes {
            id: None,
            classes: Vec::new(),
            key_values: HashMap::new(),
        };

        assert_eq!(render_attributes(&attrs), "");
    }
}
@@ -0,0 +1,223 @@
1
+ //! Djot content conversion and HTML rendering APIs.
2
+ //!
3
+ //! Provides public APIs for converting between different representations:
4
+ //! - DjotContent to djot markup
5
+ //! - ExtractionResult to djot markup
6
+ //! - Djot markup to HTML
7
+
8
+ use super::rendering::render_block_to_djot;
9
+ use jotdown::Parser;
10
+
11
+ /// Convert DjotContent back to djot markup.
12
+ ///
13
+ /// This function takes a `DjotContent` structure and generates valid djot markup
14
+ /// from it, preserving:
15
+ /// - Block structure (headings, code blocks, lists, blockquotes, etc.)
16
+ /// - Inline formatting (strong, emphasis, highlight, subscript, superscript, etc.)
17
+ /// - Attributes where present ({.class #id key="value"})
18
+ ///
19
+ /// # Arguments
20
+ ///
21
+ /// * `content` - The DjotContent to convert
22
+ ///
23
+ /// # Returns
24
+ ///
25
+ /// A String containing valid djot markup
26
+ ///
27
+ /// # Example
28
+ ///
29
+ /// ```ignore
30
+ /// let djot_content = // ... extract from some source
31
+ /// let markup = djot_content_to_djot(&djot_content);
32
+ /// println!("{}", markup);
33
+ /// ```
34
+ pub fn djot_content_to_djot(content: &crate::types::DjotContent) -> String {
35
+ let mut output = String::new();
36
+
37
+ for block in &content.blocks {
38
+ render_block_to_djot(&mut output, block, 0);
39
+ }
40
+
41
+ output
42
+ }
43
+
44
+ /// Convert any ExtractionResult to djot format.
45
+ ///
46
+ /// This function converts an `ExtractionResult` to djot markup:
47
+ /// - If `djot_content` is `Some`, uses `djot_content_to_djot` for full fidelity conversion
48
+ /// - Otherwise, wraps the plain text content in paragraphs
49
+ ///
50
+ /// # Arguments
51
+ ///
52
+ /// * `result` - The ExtractionResult to convert
53
+ ///
54
+ /// # Returns
55
+ ///
56
+ /// A `Result` containing the djot markup string
57
+ ///
58
+ /// # Example
59
+ ///
60
+ /// ```ignore
61
+ /// let result = extractor.extract_bytes(bytes, "text/plain", &config).await?;
62
+ /// let djot_markup = extraction_result_to_djot(&result)?;
63
+ /// ```
64
+ pub fn extraction_result_to_djot(result: &crate::types::ExtractionResult) -> crate::Result<String> {
65
+ if let Some(ref djot_content) = result.djot_content {
66
+ Ok(djot_content_to_djot(djot_content))
67
+ } else {
68
+ // Convert plain text to basic djot paragraphs
69
+ let mut output = String::new();
70
+
71
+ // Split content by double newlines to create paragraphs
72
+ let paragraphs: Vec<&str> = result.content.split("\n\n").collect();
73
+
74
+ for para in paragraphs {
75
+ let trimmed = para.trim();
76
+ if !trimmed.is_empty() {
77
+ output.push_str(trimmed);
78
+ output.push_str("\n\n");
79
+ }
80
+ }
81
+
82
+ Ok(output)
83
+ }
84
+ }
85
+
86
+ /// Render djot content to HTML.
87
+ ///
88
+ /// This function takes djot source text and renders it to HTML using jotdown's
89
+ /// built-in HTML renderer.
90
+ ///
91
+ /// # Arguments
92
+ ///
93
+ /// * `djot_source` - The djot markup text to render
94
+ ///
95
+ /// # Returns
96
+ ///
97
+ /// A `Result` containing the rendered HTML string
98
+ ///
99
+ /// # Example
100
+ ///
101
+ /// ```ignore
102
+ /// let djot = "# Hello\n\nThis is *bold* and _italic_.";
103
+ /// let html = djot_to_html(djot)?;
104
+ /// assert!(html.contains("<h1>"));
105
+ /// assert!(html.contains("<strong>"));
106
+ /// assert!(html.contains("<em>"));
107
+ /// ```
108
+ pub fn djot_to_html(djot_source: &str) -> crate::Result<String> {
109
+ let parser = Parser::new(djot_source);
110
+ let html = jotdown::html::render_to_string(parser);
111
+ Ok(html)
112
+ }
113
+
114
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{BlockType, DjotContent, ExtractionResult, FormattedBlock, InlineElement, InlineType, Metadata};

    /// Build a block of `block_type` holding one plain-text inline element.
    fn text_block(block_type: BlockType, text: &str) -> FormattedBlock {
        FormattedBlock {
            block_type,
            level: None,
            inline_content: vec![InlineElement {
                element_type: InlineType::Text,
                content: text.to_string(),
                attributes: None,
                metadata: None,
            }],
            attributes: None,
            language: None,
            code: None,
            children: vec![],
        }
    }

    /// Build a `DjotContent` around `blocks` with every other field empty/default.
    fn content_with(plain_text: &str, blocks: Vec<FormattedBlock>) -> DjotContent {
        DjotContent {
            plain_text: plain_text.to_string(),
            blocks,
            metadata: Metadata::default(),
            tables: vec![],
            images: vec![],
            links: vec![],
            footnotes: vec![],
            attributes: Default::default(),
        }
    }

    /// Build an `ExtractionResult` with the given text, MIME type and optional djot tree.
    fn result_with(content: &str, mime_type: &str, djot_content: Option<DjotContent>) -> ExtractionResult {
        ExtractionResult {
            content: content.to_string(),
            mime_type: mime_type.to_string(),
            metadata: Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
            djot_content,
            elements: None,
        }
    }

    #[test]
    fn test_djot_content_to_djot_heading() {
        let mut heading = text_block(BlockType::Heading, "Test Heading");
        heading.level = Some(1);
        let content = content_with("Test Heading", vec![heading]);

        let markup = djot_content_to_djot(&content);
        assert!(markup.contains("# Test Heading"));
    }

    #[test]
    fn test_extraction_result_to_djot_with_djot_content() {
        let paragraph = text_block(BlockType::Paragraph, "Test content");
        let result = result_with(
            "Test content",
            "text/djot",
            Some(content_with("Test content", vec![paragraph])),
        );

        let markup = extraction_result_to_djot(&result).expect("Should convert");
        assert!(markup.contains("Test content"));
    }

    #[test]
    fn test_extraction_result_to_djot_without_djot_content() {
        let result = result_with("Paragraph one\n\nParagraph two", "text/plain", None);

        let markup = extraction_result_to_djot(&result).expect("Should convert");
        assert!(markup.contains("Paragraph one"));
        assert!(markup.contains("Paragraph two"));
    }

    #[test]
    fn test_djot_to_html_heading() {
        let html = djot_to_html("# Hello").expect("Should render");
        assert!(html.contains("<h1>") || html.contains("<H1>"));
    }

    #[test]
    fn test_djot_to_html_formatting() {
        let html = djot_to_html("This is *bold* and _italic_.").expect("Should render");
        assert!(html.contains("<strong>") || html.contains("<em>"));
    }
}
@@ -0,0 +1,172 @@
1
+ //! Djot document extractor with plugin integration.
2
+ //!
3
+ //! Implements the DocumentExtractor and Plugin traits for Djot markup files.
4
+
5
+ use super::parsing::{extract_complete_djot_content, extract_tables_from_events, extract_text_from_events};
6
+ use crate::Result;
7
+ use crate::core::config::ExtractionConfig;
8
+ use crate::plugins::{DocumentExtractor, Plugin};
9
+ use crate::types::{ExtractionResult, Metadata};
10
+ use async_trait::async_trait;
11
+ use jotdown::{Event, Parser};
12
+
13
/// Extractor for documents written in Djot markup.
///
/// A Djot document may start with a YAML frontmatter block. Extraction
/// produces:
/// - metadata read from the YAML frontmatter
/// - the plain text content
/// - tables as structured data
/// - the document structure (headings, links, code blocks)
///
/// The type has no fields, so every instance is interchangeable.
#[derive(Clone, Debug)]
pub struct DjotExtractor;
22
+
23
+ impl DjotExtractor {
24
+ /// Create a new Djot extractor.
25
+ pub fn new() -> Self {
26
+ Self
27
+ }
28
+ }
29
+
30
+ impl Default for DjotExtractor {
31
+ fn default() -> Self {
32
+ Self::new()
33
+ }
34
+ }
35
+
36
+ impl Plugin for DjotExtractor {
37
+ fn name(&self) -> &str {
38
+ "djot-extractor"
39
+ }
40
+
41
+ fn version(&self) -> String {
42
+ env!("CARGO_PKG_VERSION").to_string()
43
+ }
44
+
45
+ fn initialize(&self) -> Result<()> {
46
+ Ok(())
47
+ }
48
+
49
+ fn shutdown(&self) -> Result<()> {
50
+ Ok(())
51
+ }
52
+
53
+ fn description(&self) -> &str {
54
+ "Extracts content from Djot markup files with YAML frontmatter and table support"
55
+ }
56
+
57
+ fn author(&self) -> &str {
58
+ "Kreuzberg Team"
59
+ }
60
+ }
61
+
62
+ #[async_trait]
63
+ impl DocumentExtractor for DjotExtractor {
64
+ #[cfg_attr(
65
+ feature = "otel",
66
+ tracing::instrument(
67
+ skip(self, content, _config),
68
+ fields(
69
+ extractor.name = self.name(),
70
+ content.size_bytes = content.len(),
71
+ )
72
+ )
73
+ )]
74
+ async fn extract_bytes(
75
+ &self,
76
+ content: &[u8],
77
+ mime_type: &str,
78
+ _config: &ExtractionConfig,
79
+ ) -> Result<ExtractionResult> {
80
+ let text = String::from_utf8_lossy(content).into_owned();
81
+
82
+ let (yaml, remaining_content) = crate::extractors::frontmatter_utils::extract_frontmatter(&text);
83
+
84
+ let mut metadata = if let Some(ref yaml_value) = yaml {
85
+ crate::extractors::frontmatter_utils::extract_metadata_from_yaml(yaml_value)
86
+ } else {
87
+ Metadata::default()
88
+ };
89
+
90
+ if !metadata.additional.contains_key("title")
91
+ && let Some(title) = crate::extractors::frontmatter_utils::extract_title_from_content(&remaining_content)
92
+ {
93
+ metadata.additional.insert("title".to_string(), title.into());
94
+ }
95
+
96
+ // Parse with jotdown and collect events once for extraction
97
+ let parser = Parser::new(&remaining_content);
98
+ let events: Vec<Event> = parser.collect();
99
+
100
+ let extracted_text = extract_text_from_events(&events);
101
+ let tables = extract_tables_from_events(&events);
102
+
103
+ // Extract complete djot content with all features
104
+ let djot_content = extract_complete_djot_content(&events, metadata.clone(), tables.clone());
105
+
106
+ Ok(ExtractionResult {
107
+ content: extracted_text,
108
+ mime_type: mime_type.to_string(),
109
+ metadata,
110
+ tables,
111
+ detected_languages: None,
112
+ chunks: None,
113
+ images: None,
114
+ pages: None,
115
+ djot_content: Some(djot_content),
116
+ elements: None,
117
+ })
118
+ }
119
+
120
+ fn supported_mime_types(&self) -> &[&str] {
121
+ &["text/djot", "text/x-djot"]
122
+ }
123
+
124
+ fn priority(&self) -> i32 {
125
+ 50
126
+ }
127
+ }
128
+
129
#[cfg(test)]
mod tests {
    use super::*;

    /// A newly constructed extractor reports its plugin name.
    #[test]
    fn test_djot_extractor_creation() {
        let djot = DjotExtractor::new();
        assert_eq!(djot.name(), "djot-extractor");
    }

    /// Both Djot MIME types are advertised as supported.
    #[test]
    fn test_can_extract_djot_mime_types() {
        let djot = DjotExtractor::new();
        let supported = djot.supported_mime_types();

        for mime in ["text/djot", "text/x-djot"] {
            assert!(supported.contains(&mime));
        }
    }

    /// The `Plugin` accessors return the expected author and non-empty
    /// version and description strings.
    #[test]
    fn test_plugin_interface() {
        let djot = DjotExtractor::new();

        assert_eq!(djot.author(), "Kreuzberg Team");
        assert!(!djot.version().is_empty());
        assert!(!djot.description().is_empty());
    }

    /// Extracting a small document keeps the heading text, the paragraph
    /// text and the words wrapped in inline formatting.
    #[tokio::test]
    async fn test_extract_simple_djot() {
        let source =
            b"# Header\n\nThis is a paragraph with *bold* and _italic_ text.\n\n## Subheading\n\nMore content here.";
        let djot = DjotExtractor::new();
        let config = ExtractionConfig::default();

        let outcome = djot.extract_bytes(source, "text/djot", &config).await;
        assert!(outcome.is_ok());

        let extracted = outcome.unwrap();
        for fragment in ["Header", "This is a paragraph", "bold", "italic"] {
            assert!(extracted.content.contains(fragment));
        }
    }
}
@@ -0,0 +1,24 @@
1
+ //! Djot markup format extractor and utilities.
2
+ //!
3
+ //! This module provides:
4
+ //! - Djot parsing using the jotdown crate
5
+ //! - YAML frontmatter metadata extraction (same as Markdown)
6
+ //! - Table extraction as structured data
7
+ //! - Heading structure preservation
8
+ //! - Code block and link extraction
9
+ //! - Djot content rendering and conversion APIs
10
+ //!
11
+ //! Djot is a modern markup language with simpler parsing rules than CommonMark.
12
+ //! See <https://djot.net> for the specification.
13
+ //!
14
+ //! Requires the `djot` feature.
15
+
16
+ pub mod attributes;
17
+ pub mod conversion;
18
+ pub mod extractor;
19
+ pub mod parsing;
20
+ pub mod rendering;
21
+
22
+ // Re-export public API
23
+ pub use conversion::{djot_content_to_djot, djot_to_html, extraction_result_to_djot};
24
+ pub use extractor::DjotExtractor;