kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,259 @@
1
+ //! Core RTF parsing logic.
2
+
3
+ use crate::extractors::rtf::encoding::{decode_windows_1252, parse_hex_byte, parse_rtf_control_word};
4
+ use crate::extractors::rtf::formatting::normalize_whitespace;
5
+ use crate::extractors::rtf::images::extract_image_metadata;
6
+ use crate::extractors::rtf::tables::TableState;
7
+ use crate::types::Table;
8
+
9
+ /// Extract text and image metadata from RTF document.
10
+ ///
11
+ /// This function extracts plain text from an RTF document by:
12
+ /// 1. Tokenizing control sequences and text
13
+ /// 2. Converting encoded characters to Unicode
14
+ /// 3. Extracting text while skipping formatting groups
15
+ /// 4. Detecting and extracting image metadata (\pict sections)
16
+ /// 5. Normalizing whitespace
17
+ pub fn extract_text_from_rtf(content: &str) -> (String, Vec<Table>) {
18
+ let mut result = String::new();
19
+ let mut chars = content.chars().peekable();
20
+ let mut tables: Vec<Table> = Vec::new();
21
+ let mut table_state: Option<TableState> = None;
22
+
23
+ let ensure_table = |table_state: &mut Option<TableState>| {
24
+ if table_state.is_none() {
25
+ *table_state = Some(TableState::new());
26
+ }
27
+ };
28
+
29
+ let finalize_table = |state_opt: &mut Option<TableState>, tables: &mut Vec<Table>| {
30
+ if let Some(state) = state_opt.take()
31
+ && let Some(table) = state.finalize()
32
+ {
33
+ tables.push(table);
34
+ }
35
+ };
36
+
37
+ while let Some(ch) = chars.next() {
38
+ match ch {
39
+ '\\' => {
40
+ if let Some(&next_ch) = chars.peek() {
41
+ match next_ch {
42
+ '\\' | '{' | '}' => {
43
+ chars.next();
44
+ result.push(next_ch);
45
+ }
46
+ '\'' => {
47
+ chars.next();
48
+ let hex1 = chars.next();
49
+ let hex2 = chars.next();
50
+ if let (Some(h1), Some(h2)) = (hex1, hex2)
51
+ && let Some(byte) = parse_hex_byte(h1, h2)
52
+ {
53
+ let decoded = decode_windows_1252(byte);
54
+ result.push(decoded);
55
+ if let Some(state) = table_state.as_mut()
56
+ && state.in_row
57
+ {
58
+ state.current_cell.push(decoded);
59
+ }
60
+ }
61
+ }
62
+ 'u' => {
63
+ chars.next();
64
+ let mut num_str = String::new();
65
+ while let Some(&c) = chars.peek() {
66
+ if c.is_ascii_digit() || c == '-' {
67
+ num_str.push(c);
68
+ chars.next();
69
+ } else {
70
+ break;
71
+ }
72
+ }
73
+ if let Ok(code_num) = num_str.parse::<i32>() {
74
+ let code_u = if code_num < 0 {
75
+ (code_num + 65536) as u32
76
+ } else {
77
+ code_num as u32
78
+ };
79
+ if let Some(c) = char::from_u32(code_u) {
80
+ result.push(c);
81
+ if let Some(state) = table_state.as_mut()
82
+ && state.in_row
83
+ {
84
+ state.current_cell.push(c);
85
+ }
86
+ }
87
+ }
88
+ }
89
+ _ => {
90
+ let (control_word, _) = parse_rtf_control_word(&mut chars);
91
+ handle_control_word(
92
+ &control_word,
93
+ &mut chars,
94
+ &mut result,
95
+ &mut table_state,
96
+ &mut tables,
97
+ &ensure_table,
98
+ &finalize_table,
99
+ );
100
+ }
101
+ }
102
+ }
103
+ }
104
+ '{' | '}' => {
105
+ if !result.is_empty() && !result.ends_with(' ') {
106
+ result.push(' ');
107
+ }
108
+ }
109
+ ' ' | '\t' | '\n' | '\r' => {
110
+ if !result.is_empty() && !result.ends_with(' ') {
111
+ result.push(' ');
112
+ }
113
+ if let Some(state) = table_state.as_mut()
114
+ && state.in_row
115
+ && !state.current_cell.ends_with(' ')
116
+ {
117
+ state.current_cell.push(' ');
118
+ }
119
+ }
120
+ _ => {
121
+ if let Some(state) = table_state.as_ref()
122
+ && !state.in_row
123
+ && !state.rows.is_empty()
124
+ {
125
+ finalize_table(&mut table_state, &mut tables);
126
+ }
127
+ result.push(ch);
128
+ if let Some(state) = table_state.as_mut()
129
+ && state.in_row
130
+ {
131
+ state.current_cell.push(ch);
132
+ }
133
+ }
134
+ }
135
+ }
136
+
137
+ if table_state.is_some() {
138
+ finalize_table(&mut table_state, &mut tables);
139
+ }
140
+
141
+ (normalize_whitespace(&result), tables)
142
+ }
143
+
144
+ /// Handle an RTF control word during parsing.
145
+ #[allow(clippy::too_many_arguments)]
146
+ fn handle_control_word(
147
+ control_word: &str,
148
+ chars: &mut std::iter::Peekable<std::str::Chars>,
149
+ result: &mut String,
150
+ table_state: &mut Option<TableState>,
151
+ tables: &mut Vec<Table>,
152
+ ensure_table: &dyn Fn(&mut Option<TableState>),
153
+ finalize_table: &dyn Fn(&mut Option<TableState>, &mut Vec<Table>),
154
+ ) {
155
+ match control_word {
156
+ "pict" => {
157
+ let image_metadata = extract_image_metadata(chars);
158
+ if !image_metadata.is_empty() {
159
+ result.push('!');
160
+ result.push('[');
161
+ result.push_str("image");
162
+ result.push(']');
163
+ result.push('(');
164
+ result.push_str(&image_metadata);
165
+ result.push(')');
166
+ result.push(' ');
167
+ if let Some(state) = table_state.as_mut()
168
+ && state.in_row
169
+ {
170
+ state.current_cell.push('!');
171
+ state.current_cell.push('[');
172
+ state.current_cell.push_str("image");
173
+ state.current_cell.push(']');
174
+ state.current_cell.push('(');
175
+ state.current_cell.push_str(&image_metadata);
176
+ state.current_cell.push(')');
177
+ state.current_cell.push(' ');
178
+ }
179
+ }
180
+ }
181
+ "par" => {
182
+ if table_state.is_some() {
183
+ finalize_table(table_state, tables);
184
+ }
185
+ if !result.is_empty() && !result.ends_with('\n') {
186
+ result.push('\n');
187
+ result.push('\n');
188
+ }
189
+ }
190
+ "tab" => {
191
+ result.push('\t');
192
+ if let Some(state) = table_state.as_mut()
193
+ && state.in_row
194
+ {
195
+ state.current_cell.push('\t');
196
+ }
197
+ }
198
+ "bullet" => {
199
+ result.push('•');
200
+ }
201
+ "lquote" => {
202
+ result.push('\u{2018}');
203
+ }
204
+ "rquote" => {
205
+ result.push('\u{2019}');
206
+ }
207
+ "ldblquote" => {
208
+ result.push('\u{201C}');
209
+ }
210
+ "rdblquote" => {
211
+ result.push('\u{201D}');
212
+ }
213
+ "endash" => {
214
+ result.push('\u{2013}');
215
+ }
216
+ "emdash" => {
217
+ result.push('\u{2014}');
218
+ }
219
+ "trowd" => {
220
+ ensure_table(table_state);
221
+ if let Some(state) = table_state.as_mut() {
222
+ state.start_row();
223
+ }
224
+ if !result.is_empty() && !result.ends_with('\n') {
225
+ result.push('\n');
226
+ }
227
+ if !result.ends_with('|') {
228
+ result.push('|');
229
+ result.push(' ');
230
+ }
231
+ }
232
+ "cell" => {
233
+ if !result.ends_with('|') {
234
+ if !result.ends_with(' ') && !result.is_empty() {
235
+ result.push(' ');
236
+ }
237
+ result.push('|');
238
+ }
239
+ if !result.ends_with(' ') {
240
+ result.push(' ');
241
+ }
242
+ }
243
+ "row" => {
244
+ ensure_table(table_state);
245
+ if let Some(state) = table_state.as_mut()
246
+ && (state.in_row || !state.current_cell.is_empty())
247
+ {
248
+ state.push_row();
249
+ }
250
+ if !result.ends_with('|') {
251
+ result.push('|');
252
+ }
253
+ if !result.ends_with('\n') {
254
+ result.push('\n');
255
+ }
256
+ }
257
+ _ => {}
258
+ }
259
+ }
@@ -0,0 +1,83 @@
1
+ //! Table extraction and state management for RTF documents.
2
+
3
+ use crate::extraction::cells_to_markdown;
4
+ use crate::types::Table;
5
+
6
+ /// State machine for tracking table construction during RTF parsing.
7
+ pub struct TableState {
8
+ pub rows: Vec<Vec<String>>,
9
+ pub current_row: Vec<String>,
10
+ pub current_cell: String,
11
+ pub in_row: bool,
12
+ }
13
+
14
+ impl TableState {
15
+ /// Create a new empty table state.
16
+ pub fn new() -> Self {
17
+ Self {
18
+ rows: Vec::new(),
19
+ current_row: Vec::new(),
20
+ current_cell: String::new(),
21
+ in_row: false,
22
+ }
23
+ }
24
+
25
+ /// Push the current cell content to the current row.
26
+ pub fn push_cell(&mut self) {
27
+ let cell = self.current_cell.trim().to_string();
28
+ self.current_row.push(cell);
29
+ self.current_cell.clear();
30
+ }
31
+
32
+ /// Push the current row to the rows collection.
33
+ pub fn push_row(&mut self) {
34
+ if self.in_row || !self.current_cell.is_empty() {
35
+ self.push_cell();
36
+ self.in_row = false;
37
+ }
38
+ if !self.current_row.is_empty() {
39
+ self.rows.push(self.current_row.clone());
40
+ self.current_row.clear();
41
+ }
42
+ }
43
+
44
+ /// Start a new table row.
45
+ pub fn start_row(&mut self) {
46
+ if self.in_row {
47
+ self.push_row();
48
+ }
49
+ self.in_row = true;
50
+ self.current_cell.clear();
51
+ self.current_row.clear();
52
+ }
53
+
54
+ /// Check if this table has any content.
55
+ #[allow(dead_code)]
56
+ pub fn is_empty(&self) -> bool {
57
+ self.rows.is_empty() && self.current_row.is_empty() && self.current_cell.is_empty()
58
+ }
59
+
60
+ /// Finalize the table and convert it to a Table struct.
61
+ pub fn finalize(mut self) -> Option<Table> {
62
+ if self.in_row || !self.current_cell.is_empty() || !self.current_row.is_empty() {
63
+ self.push_row();
64
+ }
65
+
66
+ if self.rows.is_empty() {
67
+ return None;
68
+ }
69
+
70
+ let markdown = cells_to_markdown(&self.rows);
71
+ Some(Table {
72
+ cells: self.rows,
73
+ markdown,
74
+ page_number: 1,
75
+ })
76
+ }
77
+ }
78
+
79
+ impl Default for TableState {
80
+ fn default() -> Self {
81
+ Self::new()
82
+ }
83
+ }
@@ -86,6 +86,8 @@ impl DocumentExtractor for StructuredExtractor {
86
86
  detected_languages: None,
87
87
  chunks: None,
88
88
  images: None,
89
+ elements: None,
90
+ djot_content: None,
89
91
  })
90
92
  }
91
93
 
@@ -91,6 +91,8 @@ impl DocumentExtractor for PlainTextExtractor {
91
91
  detected_languages: None,
92
92
  chunks: None,
93
93
  images: None,
94
+ elements: None,
95
+ djot_content: None,
94
96
  })
95
97
  }
96
98
 
@@ -184,6 +186,8 @@ impl DocumentExtractor for MarkdownExtractor {
184
186
  detected_languages: None,
185
187
  chunks: None,
186
188
  images: None,
189
+ elements: None,
190
+ djot_content: None,
187
191
  })
188
192
  }
189
193
 
@@ -112,7 +112,9 @@ impl DocumentExtractor for TypstExtractor {
112
112
  detected_languages: None,
113
113
  chunks: None,
114
114
  images: None,
115
+ djot_content: None,
115
116
  pages: None,
117
+ elements: None,
116
118
  })
117
119
  }
118
120
 
@@ -71,6 +71,8 @@ impl SyncExtractor for XmlExtractor {
71
71
  chunks: None,
72
72
  images: None,
73
73
  pages: None,
74
+ djot_content: None,
75
+ elements: None,
74
76
  })
75
77
  }
76
78
  }
@@ -114,6 +114,8 @@ machine learning that uses neural networks with multiple layers.
114
114
  chunks: None,
115
115
  images: None,
116
116
  pages: None,
117
+ elements: None,
118
+ djot_content: None,
117
119
  };
118
120
 
119
121
  processor.process(&mut result, &config).await.unwrap();
@@ -143,6 +145,8 @@ machine learning that uses neural networks with multiple layers.
143
145
  chunks: None,
144
146
  images: None,
145
147
  pages: None,
148
+ elements: None,
149
+ djot_content: None,
146
150
  };
147
151
 
148
152
  processor.process(&mut result, &config).await.unwrap();
@@ -168,6 +172,8 @@ machine learning that uses neural networks with multiple layers.
168
172
  chunks: None,
169
173
  images: None,
170
174
  pages: None,
175
+ elements: None,
176
+ djot_content: None,
171
177
  };
172
178
 
173
179
  processor.process(&mut result, &config).await.unwrap();
@@ -193,6 +199,8 @@ machine learning that uses neural networks with multiple layers.
193
199
  chunks: None,
194
200
  images: None,
195
201
  pages: None,
202
+ elements: None,
203
+ djot_content: None,
196
204
  };
197
205
 
198
206
  processor.process(&mut result, &config).await.unwrap();
@@ -229,6 +237,8 @@ machine learning that uses neural networks with multiple layers.
229
237
  chunks: None,
230
238
  images: None,
231
239
  pages: None,
240
+ elements: None,
241
+ djot_content: None,
232
242
  };
233
243
 
234
244
  let config_with_keywords = ExtractionConfig {
@@ -254,6 +264,8 @@ machine learning that uses neural networks with multiple layers.
254
264
  chunks: None,
255
265
  images: None,
256
266
  pages: None,
267
+ elements: None,
268
+ djot_content: None,
257
269
  };
258
270
 
259
271
  let long_result = ExtractionResult {
@@ -265,6 +277,8 @@ machine learning that uses neural networks with multiple layers.
265
277
  chunks: None,
266
278
  images: None,
267
279
  pages: None,
280
+ elements: None,
281
+ djot_content: None,
268
282
  };
269
283
 
270
284
  let short_duration = processor.estimated_duration_ms(&short_result);
@@ -108,7 +108,9 @@ mod tests {
108
108
  detected_languages: None,
109
109
  chunks: None,
110
110
  images: None,
111
+ djot_content: None,
111
112
  pages: None,
113
+ elements: None,
112
114
  };
113
115
 
114
116
  processor.process(&mut result, &config).await.unwrap();
@@ -132,7 +134,9 @@ mod tests {
132
134
  detected_languages: None,
133
135
  chunks: None,
134
136
  images: None,
137
+ djot_content: None,
135
138
  pages: None,
139
+ elements: None,
136
140
  };
137
141
 
138
142
  processor.process(&mut result, &config).await.unwrap();
@@ -167,7 +171,9 @@ mod tests {
167
171
  detected_languages: None,
168
172
  chunks: None,
169
173
  images: None,
174
+ djot_content: None,
170
175
  pages: None,
176
+ elements: None,
171
177
  };
172
178
 
173
179
  let config_with_lang = ExtractionConfig {
@@ -196,7 +202,9 @@ mod tests {
196
202
  detected_languages: None,
197
203
  chunks: None,
198
204
  images: None,
205
+ djot_content: None,
199
206
  pages: None,
207
+ elements: None,
200
208
  };
201
209
 
202
210
  let long_result = ExtractionResult {
@@ -207,7 +215,9 @@ mod tests {
207
215
  detected_languages: None,
208
216
  chunks: None,
209
217
  images: None,
218
+ djot_content: None,
210
219
  pages: None,
220
+ elements: None,
211
221
  };
212
222
 
213
223
  let short_duration = processor.estimated_duration_ms(&short_result);
@@ -89,14 +89,14 @@ pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
89
89
 
90
90
  pub use core::config::{
91
91
  ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
92
- LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
92
+ LanguageDetectionConfig, OcrConfig, OutputFormat, PageConfig, PostProcessorConfig, TokenReductionConfig,
93
93
  };
94
94
 
95
95
  #[cfg(feature = "api")]
96
96
  pub use core::server_config::ServerConfig;
97
97
 
98
98
  #[cfg(feature = "pdf")]
99
- pub use core::config::PdfConfig;
99
+ pub use core::config::{HierarchyConfig, PdfConfig};
100
100
 
101
101
  pub use core::mime::{
102
102
  DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,