kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,116 @@
1
+ //! Byte array extraction operations.
2
+ //!
3
+ //! This module handles extraction from in-memory byte arrays, including:
4
+ //! - MIME type validation
5
+ //! - Legacy format conversion (DOC, PPT)
6
+ //! - Extraction pipeline orchestration
7
+
8
+ #[cfg(not(feature = "office"))]
9
+ use crate::KreuzbergError;
10
+ use crate::Result;
11
+ use crate::core::config::ExtractionConfig;
12
+ use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
13
+ #[cfg(feature = "office")]
14
+ use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
15
+ use crate::types::ExtractionResult;
16
+
17
+ #[cfg(feature = "office")]
18
+ use super::file::apply_libreoffice_metadata;
19
+ use super::file::extract_bytes_with_extractor;
20
+ #[cfg(feature = "otel")]
21
+ use super::file::record_error;
22
+
23
+ /// Extract content from a byte array.
24
+ ///
25
+ /// This is the main entry point for in-memory extraction. It performs the following steps:
26
+ /// 1. Validate MIME type
27
+ /// 2. Handle legacy format conversion if needed
28
+ /// 3. Select appropriate extractor from registry
29
+ /// 4. Extract content
30
+ /// 5. Run post-processing pipeline
31
+ ///
32
+ /// # Arguments
33
+ ///
34
+ /// * `content` - The byte array to extract
35
+ /// * `mime_type` - MIME type of the content
36
+ /// * `config` - Extraction configuration
37
+ ///
38
+ /// # Returns
39
+ ///
40
+ /// An `ExtractionResult` containing the extracted content and metadata.
41
+ ///
42
+ /// # Errors
43
+ ///
44
+ /// Returns `KreuzbergError::Validation` if MIME type is invalid.
45
+ /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
46
+ ///
47
+ /// # Example
48
+ ///
49
+ /// ```rust,no_run
50
+ /// use kreuzberg::core::extractor::extract_bytes;
51
+ /// use kreuzberg::core::config::ExtractionConfig;
52
+ ///
53
+ /// # async fn example() -> kreuzberg::Result<()> {
54
+ /// let config = ExtractionConfig::default();
55
+ /// let bytes = b"Hello, world!";
56
+ /// let result = extract_bytes(bytes, "text/plain", &config).await?;
57
+ /// println!("Content: {}", result.content);
58
+ /// # Ok(())
59
+ /// # }
60
+ /// ```
61
+ #[cfg_attr(feature = "otel", tracing::instrument(
62
+ skip(config, content),
63
+ fields(
64
+ extraction.mime_type = mime_type,
65
+ extraction.size_bytes = content.len(),
66
+ )
67
+ ))]
68
+ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
69
+ use crate::core::mime;
70
+
71
+ let result = async {
72
+ let validated_mime = mime::validate_mime_type(mime_type)?;
73
+
74
+ match validated_mime.as_str() {
75
+ #[cfg(feature = "office")]
76
+ LEGACY_WORD_MIME_TYPE => {
77
+ let conversion = convert_doc_to_docx(content).await?;
78
+ let mut result =
79
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
80
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
81
+ return Ok(result);
82
+ }
83
+ #[cfg(not(feature = "office"))]
84
+ LEGACY_WORD_MIME_TYPE => {
85
+ return Err(KreuzbergError::UnsupportedFormat(
86
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
87
+ ));
88
+ }
89
+ #[cfg(feature = "office")]
90
+ LEGACY_POWERPOINT_MIME_TYPE => {
91
+ let conversion = convert_ppt_to_pptx(content).await?;
92
+ let mut result =
93
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
94
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
95
+ return Ok(result);
96
+ }
97
+ #[cfg(not(feature = "office"))]
98
+ LEGACY_POWERPOINT_MIME_TYPE => {
99
+ return Err(KreuzbergError::UnsupportedFormat(
100
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
101
+ ));
102
+ }
103
+ _ => {}
104
+ }
105
+
106
+ extract_bytes_with_extractor(content, &validated_mime, config).await
107
+ }
108
+ .await;
109
+
110
+ #[cfg(feature = "otel")]
111
+ if let Err(ref e) = result {
112
+ record_error(e);
113
+ }
114
+
115
+ result
116
+ }
@@ -0,0 +1,240 @@
1
+ //! File-based extraction operations.
2
+ //!
3
+ //! This module handles extraction from filesystem paths, including:
4
+ //! - MIME type detection and validation
5
+ //! - Legacy format conversion (DOC, PPT)
6
+ //! - File validation and reading
7
+ //! - Extraction pipeline orchestration
8
+
9
+ #[cfg(any(feature = "otel", not(feature = "office")))]
10
+ use crate::KreuzbergError;
11
+ use crate::Result;
12
+ use crate::core::config::ExtractionConfig;
13
+ use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
14
+ #[cfg(feature = "office")]
15
+ use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
16
+ use crate::types::ExtractionResult;
17
+ #[cfg(feature = "office")]
18
+ use crate::types::LibreOfficeConversionResult;
19
+ #[cfg(feature = "office")]
20
+ use serde_json::json;
21
+ use std::path::Path;
22
+
23
+ #[cfg(feature = "office")]
24
+ use super::helpers::pool_mime_type;
25
+
26
+ use super::helpers::get_extractor;
27
+
/// Reduce a path to its final component for use in telemetry.
///
/// Only the file name is kept so that directory names, which may contain
/// sensitive or personally identifiable information, never reach trace data.
///
/// # Arguments
///
/// * `path` - The path to sanitize
///
/// # Returns
///
/// The file name as an owned `String`. If the path has no final component, or
/// that component is not valid UTF-8, the literal `"unknown"` is returned.
///
/// # Example
///
/// ```rust,ignore
/// let path = Path::new("/home/user/documents/secret.pdf");
/// assert_eq!(sanitize_path(path), "secret.pdf");
/// ```
#[cfg(feature = "otel")]
pub(super) fn sanitize_path(path: &Path) -> String {
    match path.file_name().and_then(|component| component.to_str()) {
        Some(file_name) => file_name.to_owned(),
        None => String::from("unknown"),
    }
}
59
+
/// Attach error details to the currently active tracing span.
///
/// Only compiled when the `otel` feature is enabled. Three fields are recorded
/// on `tracing::Span::current()`:
///
/// * `otel.status_code` - the literal `"ERROR"`
/// * `error.type` - the `Debug` rendering of the error
/// * `error.message` - the `Display` rendering of the error
///
/// Note that `Span::record` has no effect for a field name that was not
/// declared when the span was created, so the calling span must declare these
/// fields (for example as `tracing::field::Empty`) for them to be captured.
///
/// # Arguments
///
/// * `error` - The error to record in the span
///
/// # Example
///
/// ```rust,ignore
/// let result = extract_file("doc.pdf", None, &config).await;
/// #[cfg(feature = "otel")]
/// if let Err(ref e) = result {
///     record_error(e);
/// }
/// result
/// ```
#[cfg(feature = "otel")]
pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
    let active_span = tracing::Span::current();

    active_span.record("otel.status_code", "ERROR");

    let error_type = format!("{:?}", error);
    active_span.record("error.type", error_type);

    let error_message = error.to_string();
    active_span.record("error.message", error_message);
}
86
+
87
+ /// Extract content from a file.
88
+ ///
89
+ /// This is the main entry point for file-based extraction. It performs the following steps:
90
+ /// 1. Check cache for existing result (if caching enabled)
91
+ /// 2. Detect or validate MIME type
92
+ /// 3. Select appropriate extractor from registry
93
+ /// 4. Extract content
94
+ /// 5. Run post-processing pipeline
95
+ /// 6. Store result in cache (if caching enabled)
96
+ ///
97
+ /// # Arguments
98
+ ///
99
+ /// * `path` - Path to the file to extract
100
+ /// * `mime_type` - Optional MIME type override. If None, will be auto-detected
101
+ /// * `config` - Extraction configuration
102
+ ///
103
+ /// # Returns
104
+ ///
105
+ /// An `ExtractionResult` containing the extracted content and metadata.
106
+ ///
107
+ /// # Errors
108
+ ///
109
+ /// Returns `KreuzbergError::Validation` if the file doesn't exist or path is invalid.
110
+ /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
111
+ /// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
112
+ ///
113
+ /// # Example
114
+ ///
115
+ /// ```rust,no_run
116
+ /// use kreuzberg::core::extractor::extract_file;
117
+ /// use kreuzberg::core::config::ExtractionConfig;
118
+ ///
119
+ /// # async fn example() -> kreuzberg::Result<()> {
120
+ /// let config = ExtractionConfig::default();
121
+ /// let result = extract_file("document.pdf", None, &config).await?;
122
+ /// println!("Content: {}", result.content);
123
+ /// # Ok(())
124
+ /// # }
125
+ /// ```
126
+ #[cfg_attr(feature = "otel", tracing::instrument(
127
+ skip(config, path),
128
+ fields(
129
+ extraction.filename = tracing::field::Empty,
130
+ )
131
+ ))]
132
+ pub async fn extract_file(
133
+ path: impl AsRef<Path>,
134
+ mime_type: Option<&str>,
135
+ config: &ExtractionConfig,
136
+ ) -> Result<ExtractionResult> {
137
+ use crate::core::{io, mime};
138
+
139
+ let path = path.as_ref();
140
+
141
+ #[cfg(feature = "otel")]
142
+ {
143
+ let span = tracing::Span::current();
144
+ span.record("extraction.filename", sanitize_path(path));
145
+ }
146
+
147
+ let result = async {
148
+ io::validate_file_exists(path)?;
149
+
150
+ let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
151
+
152
+ match detected_mime.as_str() {
153
+ #[cfg(feature = "office")]
154
+ LEGACY_WORD_MIME_TYPE => {
155
+ let original_bytes = tokio::fs::read(path).await?;
156
+ let conversion = convert_doc_to_docx(&original_bytes).await?;
157
+ let mut result =
158
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
159
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
160
+ return Ok(result);
161
+ }
162
+ #[cfg(not(feature = "office"))]
163
+ LEGACY_WORD_MIME_TYPE => {
164
+ return Err(KreuzbergError::UnsupportedFormat(
165
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
166
+ ));
167
+ }
168
+ #[cfg(feature = "office")]
169
+ LEGACY_POWERPOINT_MIME_TYPE => {
170
+ let original_bytes = tokio::fs::read(path).await?;
171
+ let conversion = convert_ppt_to_pptx(&original_bytes).await?;
172
+ let mut result =
173
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
174
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
175
+ return Ok(result);
176
+ }
177
+ #[cfg(not(feature = "office"))]
178
+ LEGACY_POWERPOINT_MIME_TYPE => {
179
+ return Err(KreuzbergError::UnsupportedFormat(
180
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
181
+ ));
182
+ }
183
+ _ => {}
184
+ }
185
+
186
+ extract_file_with_extractor(path, &detected_mime, config).await
187
+ }
188
+ .await;
189
+
190
+ #[cfg(feature = "otel")]
191
+ if let Err(ref e) = result {
192
+ record_error(e);
193
+ }
194
+
195
+ result
196
+ }
197
+
198
+ pub(in crate::core::extractor) async fn extract_file_with_extractor(
199
+ path: &Path,
200
+ mime_type: &str,
201
+ config: &ExtractionConfig,
202
+ ) -> Result<ExtractionResult> {
203
+ crate::extractors::ensure_initialized()?;
204
+
205
+ let extractor = get_extractor(mime_type)?;
206
+ let mut result = extractor.extract_file(path, mime_type, config).await?;
207
+ result = crate::core::pipeline::run_pipeline(result, config).await?;
208
+ Ok(result)
209
+ }
210
+
211
+ pub(in crate::core::extractor) async fn extract_bytes_with_extractor(
212
+ content: &[u8],
213
+ mime_type: &str,
214
+ config: &ExtractionConfig,
215
+ ) -> Result<ExtractionResult> {
216
+ crate::extractors::ensure_initialized()?;
217
+
218
+ let extractor = get_extractor(mime_type)?;
219
+ let mut result = extractor.extract_bytes(content, mime_type, config).await?;
220
+ result = crate::core::pipeline::run_pipeline(result, config).await?;
221
+ Ok(result)
222
+ }
223
+
/// Stamp an extraction result with details of the LibreOffice conversion that produced it.
///
/// Overwrites `result.mime_type` with the value returned by
/// `pool_mime_type(legacy_mime)` and stores a JSON object under the
/// `libreoffice_conversion` key of `result.metadata.additional`. The object
/// holds the converter name plus the original format, target format and target
/// MIME type taken from `conversion`.
///
/// # Arguments
///
/// * `result` - The extraction result to update in place
/// * `legacy_mime` - MIME type of the original, pre-conversion document
/// * `conversion` - Outcome of the LibreOffice conversion
#[cfg(feature = "office")]
pub(in crate::core::extractor) fn apply_libreoffice_metadata(
    result: &mut ExtractionResult,
    legacy_mime: &str,
    conversion: &LibreOfficeConversionResult,
) {
    result.mime_type = pool_mime_type(legacy_mime);

    let conversion_details = json!({
        "converter": "libreoffice",
        "original_format": conversion.original_format,
        "target_format": conversion.target_format,
        "target_mime": conversion.target_mime,
    });

    result
        .metadata
        .additional
        .insert("libreoffice_conversion".to_string(), conversion_details);
}
@@ -0,0 +1,71 @@
1
+ //! Helper functions and utilities for extraction operations.
2
+ //!
3
+ //! This module provides shared utilities used across extraction modules.
4
+
5
+ use crate::plugins::DocumentExtractor;
6
+ #[cfg(feature = "office")]
7
+ use crate::utils::intern_mime_type;
8
+ use crate::utils::{PoolSizeHint, estimate_pool_size};
9
+ use crate::{KreuzbergError, Result};
10
+ use std::sync::Arc;
11
+
12
+ /// Get an extractor from the registry.
13
+ ///
14
+ /// This function acquires the registry read lock and retrieves the appropriate
15
+ /// extractor for the given MIME type.
16
+ ///
17
+ /// # Performance
18
+ ///
19
+ /// RwLock read + HashMap lookup is ~100ns, fast enough without caching.
20
+ /// Removed thread-local cache to avoid Tokio work-stealing scheduler issues.
21
+ pub(in crate::core::extractor) fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
22
+ let registry = crate::plugins::registry::get_document_extractor_registry();
23
+ let registry_read = registry
24
+ .read()
25
+ .map_err(|e| KreuzbergError::Other(format!("Document extractor registry lock poisoned: {}", e)))?;
26
+ registry_read.get(mime_type)
27
+ }
28
+
29
+ /// Get optimal pool sizing hint for a document.
30
+ ///
31
+ /// This function calculates recommended pool sizes based on the document's
32
+ /// file size and MIME type. The hint can be used to create appropriately
33
+ /// sized thread pools for extraction, reducing memory waste from over-allocation.
34
+ ///
35
+ /// # Arguments
36
+ ///
37
+ /// * `file_size` - The size of the file in bytes
38
+ /// * `mime_type` - The MIME type of the document
39
+ ///
40
+ /// # Returns
41
+ ///
42
+ /// A `PoolSizeHint` with recommended pool configurations
43
+ ///
44
+ /// # Example
45
+ ///
46
+ /// ```rust,ignore
47
+ /// use kreuzberg::core::extractor::get_pool_sizing_hint;
48
+ ///
49
+ /// let hint = get_pool_sizing_hint(5_000_000, "application/pdf");
50
+ /// println!("Recommended string buffers: {}", hint.string_buffer_count);
51
+ /// ```
52
+ #[inline]
53
+ pub fn get_pool_sizing_hint(file_size: u64, mime_type: &str) -> PoolSizeHint {
54
+ estimate_pool_size(file_size, mime_type)
55
+ }
56
+
57
/// Return an owned `String` for a MIME type, routed through the MIME interning pool.
///
/// The MIME type is first passed to `intern_mime_type`, and the interned value
/// is then rendered with `to_string()` so that it fits the `String`-typed
/// `ExtractionResult::mime_type` field.
///
/// # Performance
///
/// Because the return type is an owned `String`, every call builds a new
/// `String` from the interned value, whatever the interning pool itself caches.
#[cfg(feature = "office")]
pub(in crate::core::extractor) fn pool_mime_type(mime_type: &str) -> String {
    let interned = intern_mime_type(mime_type);
    interned.to_string()
}
@@ -0,0 +1,62 @@
1
+ //! Legacy synchronous extraction for WASM compatibility.
2
+ //!
3
+ //! This module provides truly synchronous extraction implementations
4
+ //! for environments where Tokio runtime is not available (e.g., WASM).
5
+
6
+ /// Synchronous extraction implementation for WASM compatibility.
7
+ ///
8
+ /// This function performs extraction without requiring a tokio runtime.
9
+ /// It calls the sync extractor methods directly.
10
+ ///
11
+ /// # Arguments
12
+ ///
13
+ /// * `content` - The byte content to extract
14
+ /// * `mime_type` - Optional MIME type to validate/use
15
+ /// * `config` - Optional extraction configuration
16
+ ///
17
+ /// # Returns
18
+ ///
19
+ /// An `ExtractionResult` or a `KreuzbergError`
20
+ ///
21
+ /// # Implementation Notes
22
+ ///
23
+ /// This is called when the `tokio-runtime` feature is disabled.
24
+ /// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
25
+ #[cfg(not(feature = "tokio-runtime"))]
26
+ pub(super) fn extract_bytes_sync_impl(
27
+ content: Vec<u8>,
28
+ mime_type: Option<String>,
29
+ config: Option<crate::core::config::ExtractionConfig>,
30
+ ) -> crate::Result<crate::types::ExtractionResult> {
31
+ use crate::KreuzbergError;
32
+ use crate::core::extractor::helpers::get_extractor;
33
+ use crate::core::mime;
34
+
35
+ let config = config.unwrap_or_default();
36
+
37
+ let validated_mime = if let Some(mime) = mime_type {
38
+ mime::validate_mime_type(&mime)?
39
+ } else {
40
+ return Err(KreuzbergError::Validation {
41
+ message: "MIME type is required for synchronous extraction".to_string(),
42
+ source: None,
43
+ });
44
+ };
45
+
46
+ crate::extractors::ensure_initialized()?;
47
+
48
+ let extractor = get_extractor(&validated_mime)?;
49
+
50
+ let sync_extractor = extractor.as_sync_extractor().ok_or_else(|| {
51
+ KreuzbergError::UnsupportedFormat(format!(
52
+ "Extractor for '{}' does not support synchronous extraction",
53
+ validated_mime
54
+ ))
55
+ })?;
56
+
57
+ let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
58
+
59
+ result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
60
+
61
+ Ok(result)
62
+ }