kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,27 @@
1
+ //! Type definitions for transformation operations.
2
+
3
+ /// Metadata about a detected list item.
4
+ #[derive(Debug, Clone, PartialEq, Eq)]
5
+ pub struct ListItemMetadata {
6
+ /// Type of list (Bullet, Numbered, etc.)
7
+ pub list_type: ListType,
8
+ /// Starting byte offset in the content string
9
+ pub byte_start: usize,
10
+ /// Ending byte offset in the content string
11
+ pub byte_end: usize,
12
+ /// List item indent level
13
+ pub indent_level: u32,
14
+ }
15
+
16
/// The kind of list a detected item belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListType {
    /// Items introduced by a bullet marker (-, *, •, etc.).
    Bullet,
    /// Items introduced by a number (1., 2., etc.).
    Numbered,
    /// Items introduced by a letter (a., b., A., B., etc.).
    Lettered,
    /// Indented items.
    Indented,
}
@@ -78,6 +78,8 @@ fn build_archive_result(
78
78
  chunks: None,
79
79
  images: None,
80
80
  pages: None,
81
+ djot_content: None,
82
+ elements: None,
81
83
  }
82
84
  }
83
85
 
@@ -172,6 +172,8 @@ impl DocumentExtractor for BibtexExtractor {
172
172
  detected_languages: None,
173
173
  chunks: None,
174
174
  images: None,
175
+ djot_content: None,
176
+ elements: None,
175
177
  })
176
178
  }
177
179
 
@@ -0,0 +1,134 @@
1
+ //! Djot attribute parsing utilities.
2
+ //!
3
+ //! Handles parsing of Djot attributes from jotdown events and string syntax.
4
+
5
+ use std::collections::HashMap;
6
+
7
+ /// Parse jotdown attributes into our Attributes representation.
8
+ ///
9
+ /// Converts jotdown's internal attribute representation to Kreuzberg's
10
+ /// standardized Attributes struct, handling IDs, classes, and key-value pairs.
11
+ pub fn parse_jotdown_attributes(attrs: &jotdown::Attributes) -> crate::types::Attributes {
12
+ use crate::types::Attributes;
13
+ use jotdown::AttributeKind;
14
+
15
+ let mut id = None;
16
+ let mut classes = Vec::new();
17
+ let mut key_values = HashMap::new();
18
+
19
+ for (kind, value) in attrs.iter() {
20
+ match kind {
21
+ AttributeKind::Id => {
22
+ // Last ID wins if multiple are specified
23
+ id = Some(value.to_string());
24
+ }
25
+ AttributeKind::Class => {
26
+ classes.push(value.to_string());
27
+ }
28
+ AttributeKind::Pair { key } => {
29
+ key_values.insert(key.to_string(), value.to_string());
30
+ }
31
+ AttributeKind::Comment => {
32
+ // Comments are ignored in our representation
33
+ }
34
+ }
35
+ }
36
+
37
+ Attributes {
38
+ id,
39
+ classes,
40
+ key_values,
41
+ }
42
+ }
43
+
44
+ /// Parse djot attribute syntax from string: {.class #id key="value"}
45
+ #[allow(dead_code)]
46
+ pub fn parse_djot_attributes(attr_str: &str) -> crate::types::Attributes {
47
+ use crate::types::Attributes;
48
+
49
+ let mut attrs = Attributes {
50
+ id: None,
51
+ classes: Vec::new(),
52
+ key_values: HashMap::new(),
53
+ };
54
+
55
+ // Simple parser for attribute syntax
56
+ let tokens = attr_str.split_whitespace();
57
+
58
+ for token in tokens {
59
+ if let Some(class) = token.strip_prefix('.') {
60
+ // Class
61
+ attrs.classes.push(class.to_string());
62
+ } else if let Some(id) = token.strip_prefix('#') {
63
+ // ID
64
+ attrs.id = Some(id.to_string());
65
+ } else if token.contains('=') {
66
+ // Key-value pair
67
+ if let Some((key, value)) = token.split_once('=') {
68
+ let clean_value = value.trim_matches('"').trim_matches('\'');
69
+ attrs.key_values.insert(key.to_string(), clean_value.to_string());
70
+ }
71
+ }
72
+ }
73
+
74
+ attrs
75
+ }
76
+
77
+ /// Render attributes to djot attribute syntax.
78
+ ///
79
+ /// Converts Kreuzberg's Attributes struct back to djot attribute syntax:
80
+ /// {.class #id key="value"}
81
+ pub fn render_attributes(attrs: &crate::types::Attributes) -> String {
82
+ let mut parts = Vec::new();
83
+
84
+ if let Some(ref id) = attrs.id {
85
+ parts.push(format!("#{}", id));
86
+ }
87
+
88
+ for class in &attrs.classes {
89
+ parts.push(format!(".{}", class));
90
+ }
91
+
92
+ for (key, value) in &attrs.key_values {
93
+ parts.push(format!("{}=\"{}\"", key, value));
94
+ }
95
+
96
+ if parts.is_empty() {
97
+ String::new()
98
+ } else {
99
+ format!("{{{}}}", parts.join(" "))
100
+ }
101
+ }
102
+
103
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an `Attributes` fixture from borrowed string pieces.
    fn attributes(id: Option<&str>, classes: &[&str], pairs: &[(&str, &str)]) -> crate::types::Attributes {
        crate::types::Attributes {
            id: id.map(str::to_string),
            classes: classes.iter().map(|class| class.to_string()).collect(),
            key_values: pairs
                .iter()
                .map(|(key, value)| (key.to_string(), value.to_string()))
                .collect(),
        }
    }

    /// An id, two classes and one key-value pair must all show up in the output.
    #[test]
    fn test_render_attributes_with_all_parts() {
        let attrs = attributes(Some("my-id"), &["class1", "class2"], &[("data-test", "value")]);

        let rendered = render_attributes(&attrs);
        for expected in ["#my-id", ".class1", ".class2", "data-test"] {
            assert!(rendered.contains(expected));
        }
    }

    /// With nothing to render, no braces are produced at all.
    #[test]
    fn test_render_attributes_empty() {
        let rendered = render_attributes(&attributes(None, &[], &[]));
        assert_eq!(rendered, "");
    }
}
@@ -0,0 +1,223 @@
1
+ //! Djot content conversion and HTML rendering APIs.
2
+ //!
3
+ //! Provides public APIs for converting between different representations:
4
+ //! - DjotContent to djot markup
5
+ //! - ExtractionResult to djot markup
6
+ //! - Djot markup to HTML
7
+
8
+ use super::rendering::render_block_to_djot;
9
+ use jotdown::Parser;
10
+
11
+ /// Convert DjotContent back to djot markup.
12
+ ///
13
+ /// This function takes a `DjotContent` structure and generates valid djot markup
14
+ /// from it, preserving:
15
+ /// - Block structure (headings, code blocks, lists, blockquotes, etc.)
16
+ /// - Inline formatting (strong, emphasis, highlight, subscript, superscript, etc.)
17
+ /// - Attributes where present ({.class #id key="value"})
18
+ ///
19
+ /// # Arguments
20
+ ///
21
+ /// * `content` - The DjotContent to convert
22
+ ///
23
+ /// # Returns
24
+ ///
25
+ /// A String containing valid djot markup
26
+ ///
27
+ /// # Example
28
+ ///
29
+ /// ```ignore
30
+ /// let djot_content = // ... extract from some source
31
+ /// let markup = djot_content_to_djot(&djot_content);
32
+ /// println!("{}", markup);
33
+ /// ```
34
+ pub fn djot_content_to_djot(content: &crate::types::DjotContent) -> String {
35
+ let mut output = String::new();
36
+
37
+ for block in &content.blocks {
38
+ render_block_to_djot(&mut output, block, 0);
39
+ }
40
+
41
+ output
42
+ }
43
+
44
+ /// Convert any ExtractionResult to djot format.
45
+ ///
46
+ /// This function converts an `ExtractionResult` to djot markup:
47
+ /// - If `djot_content` is `Some`, uses `djot_content_to_djot` for full fidelity conversion
48
+ /// - Otherwise, wraps the plain text content in paragraphs
49
+ ///
50
+ /// # Arguments
51
+ ///
52
+ /// * `result` - The ExtractionResult to convert
53
+ ///
54
+ /// # Returns
55
+ ///
56
+ /// A `Result` containing the djot markup string
57
+ ///
58
+ /// # Example
59
+ ///
60
+ /// ```ignore
61
+ /// let result = extractor.extract_bytes(bytes, "text/plain", &config).await?;
62
+ /// let djot_markup = extraction_result_to_djot(&result)?;
63
+ /// ```
64
+ pub fn extraction_result_to_djot(result: &crate::types::ExtractionResult) -> crate::Result<String> {
65
+ if let Some(ref djot_content) = result.djot_content {
66
+ Ok(djot_content_to_djot(djot_content))
67
+ } else {
68
+ // Convert plain text to basic djot paragraphs
69
+ let mut output = String::new();
70
+
71
+ // Split content by double newlines to create paragraphs
72
+ let paragraphs: Vec<&str> = result.content.split("\n\n").collect();
73
+
74
+ for para in paragraphs {
75
+ let trimmed = para.trim();
76
+ if !trimmed.is_empty() {
77
+ output.push_str(trimmed);
78
+ output.push_str("\n\n");
79
+ }
80
+ }
81
+
82
+ Ok(output)
83
+ }
84
+ }
85
+
86
+ /// Render djot content to HTML.
87
+ ///
88
+ /// This function takes djot source text and renders it to HTML using jotdown's
89
+ /// built-in HTML renderer.
90
+ ///
91
+ /// # Arguments
92
+ ///
93
+ /// * `djot_source` - The djot markup text to render
94
+ ///
95
+ /// # Returns
96
+ ///
97
+ /// A `Result` containing the rendered HTML string
98
+ ///
99
+ /// # Example
100
+ ///
101
+ /// ```ignore
102
+ /// let djot = "# Hello\n\nThis is *bold* and _italic_.";
103
+ /// let html = djot_to_html(djot)?;
104
+ /// assert!(html.contains("<h1>"));
105
+ /// assert!(html.contains("<strong>"));
106
+ /// assert!(html.contains("<em>"));
107
+ /// ```
108
+ pub fn djot_to_html(djot_source: &str) -> crate::Result<String> {
109
+ let parser = Parser::new(djot_source);
110
+ let html = jotdown::html::render_to_string(parser);
111
+ Ok(html)
112
+ }
113
+
114
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{BlockType, DjotContent, ExtractionResult, FormattedBlock, InlineElement, InlineType, Metadata};

    /// A block of the given type holding one plain-text inline element.
    ///
    /// `level` is left as `None`; tests that need a heading level set it afterwards.
    fn plain_text_block(block_type: BlockType, text: &str) -> FormattedBlock {
        FormattedBlock {
            block_type,
            level: None,
            inline_content: vec![InlineElement {
                element_type: InlineType::Text,
                content: text.to_string(),
                attributes: None,
                metadata: None,
            }],
            attributes: None,
            language: None,
            code: None,
            children: vec![],
        }
    }

    /// A `DjotContent` with the given plain text and blocks and nothing else.
    fn djot_document(plain_text: &str, blocks: Vec<FormattedBlock>) -> DjotContent {
        DjotContent {
            plain_text: plain_text.to_string(),
            blocks,
            metadata: Metadata::default(),
            tables: vec![],
            images: vec![],
            links: vec![],
            footnotes: vec![],
            attributes: Default::default(),
        }
    }

    /// An `ExtractionResult` with only content, MIME type and optional djot content set.
    fn extraction_result(content: &str, mime_type: &str, djot_content: Option<DjotContent>) -> ExtractionResult {
        ExtractionResult {
            content: content.to_string(),
            mime_type: mime_type.to_string(),
            metadata: Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
            djot_content,
            elements: None,
        }
    }

    /// A level-1 heading block is rendered with a single leading `#`.
    #[test]
    fn test_djot_content_to_djot_heading() {
        let mut heading = plain_text_block(BlockType::Heading, "Test Heading");
        heading.level = Some(1);
        let content = djot_document("Test Heading", vec![heading]);

        let markup = djot_content_to_djot(&content);
        assert!(markup.contains("# Test Heading"));
    }

    /// When structured djot content is present, its text appears in the output.
    #[test]
    fn test_extraction_result_to_djot_with_djot_content() {
        let structured = djot_document(
            "Test content",
            vec![plain_text_block(BlockType::Paragraph, "Test content")],
        );
        let result = extraction_result("Test content", "text/djot", Some(structured));

        let markup = extraction_result_to_djot(&result).expect("Should convert");
        assert!(markup.contains("Test content"));
    }

    /// Without structured content, every plain-text paragraph is carried over.
    #[test]
    fn test_extraction_result_to_djot_without_djot_content() {
        let result = extraction_result("Paragraph one\n\nParagraph two", "text/plain", None);

        let markup = extraction_result_to_djot(&result).expect("Should convert");
        assert!(markup.contains("Paragraph one"));
        assert!(markup.contains("Paragraph two"));
    }

    /// A djot heading becomes an `<h1>` element (either letter case accepted).
    #[test]
    fn test_djot_to_html_heading() {
        let html = djot_to_html("# Hello").expect("Should render");
        assert!(html.contains("<h1>") || html.contains("<H1>"));
    }

    /// Inline emphasis markers produce at least one of `<strong>` / `<em>`.
    #[test]
    fn test_djot_to_html_formatting() {
        let html = djot_to_html("This is *bold* and _italic_.").expect("Should render");
        assert!(html.contains("<strong>") || html.contains("<em>"));
    }
}
@@ -0,0 +1,172 @@
1
+ //! Djot document extractor with plugin integration.
2
+ //!
3
+ //! Implements the DocumentExtractor and Plugin traits for Djot markup files.
4
+
5
+ use super::parsing::{extract_complete_djot_content, extract_tables_from_events, extract_text_from_events};
6
+ use crate::Result;
7
+ use crate::core::config::ExtractionConfig;
8
+ use crate::plugins::{DocumentExtractor, Plugin};
9
+ use crate::types::{ExtractionResult, Metadata};
10
+ use async_trait::async_trait;
11
+ use jotdown::{Event, Parser};
12
+
13
/// Extractor for documents written in Djot markup.
///
/// Handles Djot documents that may start with YAML frontmatter and yields:
/// - metadata read from the YAML frontmatter
/// - the plain text content
/// - tables as structured data
/// - document structure (headings, links, code blocks)
///
/// The type is a field-less unit struct, so every instance is interchangeable.
#[derive(Debug, Clone, Default)]
pub struct DjotExtractor;

impl DjotExtractor {
    /// Build a Djot extractor; yields the same value as [`DjotExtractor::default`].
    pub fn new() -> Self {
        DjotExtractor
    }
}
35
+
36
+ impl Plugin for DjotExtractor {
37
+ fn name(&self) -> &str {
38
+ "djot-extractor"
39
+ }
40
+
41
+ fn version(&self) -> String {
42
+ env!("CARGO_PKG_VERSION").to_string()
43
+ }
44
+
45
+ fn initialize(&self) -> Result<()> {
46
+ Ok(())
47
+ }
48
+
49
+ fn shutdown(&self) -> Result<()> {
50
+ Ok(())
51
+ }
52
+
53
+ fn description(&self) -> &str {
54
+ "Extracts content from Djot markup files with YAML frontmatter and table support"
55
+ }
56
+
57
+ fn author(&self) -> &str {
58
+ "Kreuzberg Team"
59
+ }
60
+ }
61
+
62
+ #[async_trait]
63
+ impl DocumentExtractor for DjotExtractor {
64
+ #[cfg_attr(
65
+ feature = "otel",
66
+ tracing::instrument(
67
+ skip(self, content, _config),
68
+ fields(
69
+ extractor.name = self.name(),
70
+ content.size_bytes = content.len(),
71
+ )
72
+ )
73
+ )]
74
+ async fn extract_bytes(
75
+ &self,
76
+ content: &[u8],
77
+ mime_type: &str,
78
+ _config: &ExtractionConfig,
79
+ ) -> Result<ExtractionResult> {
80
+ let text = String::from_utf8_lossy(content).into_owned();
81
+
82
+ let (yaml, remaining_content) = crate::extractors::frontmatter_utils::extract_frontmatter(&text);
83
+
84
+ let mut metadata = if let Some(ref yaml_value) = yaml {
85
+ crate::extractors::frontmatter_utils::extract_metadata_from_yaml(yaml_value)
86
+ } else {
87
+ Metadata::default()
88
+ };
89
+
90
+ if !metadata.additional.contains_key("title")
91
+ && let Some(title) = crate::extractors::frontmatter_utils::extract_title_from_content(&remaining_content)
92
+ {
93
+ metadata.additional.insert("title".to_string(), title.into());
94
+ }
95
+
96
+ // Parse with jotdown and collect events once for extraction
97
+ let parser = Parser::new(&remaining_content);
98
+ let events: Vec<Event> = parser.collect();
99
+
100
+ let extracted_text = extract_text_from_events(&events);
101
+ let tables = extract_tables_from_events(&events);
102
+
103
+ // Extract complete djot content with all features
104
+ let djot_content = extract_complete_djot_content(&events, metadata.clone(), tables.clone());
105
+
106
+ Ok(ExtractionResult {
107
+ content: extracted_text,
108
+ mime_type: mime_type.to_string(),
109
+ metadata,
110
+ tables,
111
+ detected_languages: None,
112
+ chunks: None,
113
+ images: None,
114
+ pages: None,
115
+ djot_content: Some(djot_content),
116
+ elements: None,
117
+ })
118
+ }
119
+
120
+ fn supported_mime_types(&self) -> &[&str] {
121
+ &["text/djot", "text/x-djot"]
122
+ }
123
+
124
+ fn priority(&self) -> i32 {
125
+ 50
126
+ }
127
+ }
128
+
129
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built extractor reports its plugin name.
    #[test]
    fn test_djot_extractor_creation() {
        let plugin = DjotExtractor::new();
        assert_eq!(plugin.name(), "djot-extractor");
    }

    /// Both Djot MIME types are advertised.
    #[test]
    fn test_can_extract_djot_mime_types() {
        let extractor = DjotExtractor::new();
        let advertised = extractor.supported_mime_types();

        for expected in ["text/djot", "text/x-djot"] {
            assert!(advertised.contains(&expected));
        }
    }

    /// Author, version and description are all populated.
    #[test]
    fn test_plugin_interface() {
        let plugin = DjotExtractor::new();

        assert_eq!(plugin.author(), "Kreuzberg Team");
        assert!(!plugin.version().is_empty());
        assert!(!plugin.description().is_empty());
    }

    /// Headings, paragraph text and emphasised words all end up in the plain
    /// text content of the result.
    #[tokio::test]
    async fn test_extract_simple_djot() {
        let source: &[u8] =
            b"# Header\n\nThis is a paragraph with *bold* and _italic_ text.\n\n## Subheading\n\nMore content here.";
        let extractor = DjotExtractor::new();
        let config = ExtractionConfig::default();

        let outcome = extractor.extract_bytes(source, "text/djot", &config).await;
        assert!(outcome.is_ok());

        let extracted = outcome.unwrap();
        for fragment in ["Header", "This is a paragraph", "bold", "italic"] {
            assert!(extracted.content.contains(fragment));
        }
    }
}
@@ -0,0 +1,24 @@
1
+ //! Djot markup format extractor and utilities.
2
+ //!
3
+ //! This module provides:
4
+ //! - Djot parsing using the jotdown crate
5
+ //! - YAML frontmatter metadata extraction (same as Markdown)
6
+ //! - Table extraction as structured data
7
+ //! - Heading structure preservation
8
+ //! - Code block and link extraction
9
+ //! - Djot content rendering and conversion APIs
10
+ //!
11
+ //! Djot is a modern markup language with simpler parsing rules than CommonMark.
12
+ //! See <https://djot.net> for the specification.
13
+ //!
14
+ //! Requires the `djot` feature.
15
+
16
+ pub mod attributes;
17
+ pub mod conversion;
18
+ pub mod extractor;
19
+ pub mod parsing;
20
+ pub mod rendering;
21
+
22
+ // Re-export public API
23
+ pub use conversion::{djot_content_to_djot, djot_to_html, extraction_result_to_djot};
24
+ pub use extractor::DjotExtractor;