kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,116 @@
1
+ //! Byte array extraction operations.
2
+ //!
3
+ //! This module handles extraction from in-memory byte arrays, including:
4
+ //! - MIME type validation
5
+ //! - Legacy format conversion (DOC, PPT)
6
+ //! - Extraction pipeline orchestration
7
+
8
+ #[cfg(not(feature = "office"))]
9
+ use crate::KreuzbergError;
10
+ use crate::Result;
11
+ use crate::core::config::ExtractionConfig;
12
+ use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
13
+ #[cfg(feature = "office")]
14
+ use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
15
+ use crate::types::ExtractionResult;
16
+
17
+ #[cfg(feature = "office")]
18
+ use super::file::apply_libreoffice_metadata;
19
+ use super::file::extract_bytes_with_extractor;
20
+ #[cfg(feature = "otel")]
21
+ use super::file::record_error;
22
+
23
+ /// Extract content from a byte array.
24
+ ///
25
+ /// This is the main entry point for in-memory extraction. It performs the following steps:
26
+ /// 1. Validate MIME type
27
+ /// 2. Handle legacy format conversion if needed
28
+ /// 3. Select appropriate extractor from registry
29
+ /// 4. Extract content
30
+ /// 5. Run post-processing pipeline
31
+ ///
32
+ /// # Arguments
33
+ ///
34
+ /// * `content` - The byte array to extract
35
+ /// * `mime_type` - MIME type of the content
36
+ /// * `config` - Extraction configuration
37
+ ///
38
+ /// # Returns
39
+ ///
40
+ /// An `ExtractionResult` containing the extracted content and metadata.
41
+ ///
42
+ /// # Errors
43
+ ///
44
+ /// Returns `KreuzbergError::Validation` if MIME type is invalid.
45
+ /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
46
+ ///
47
+ /// # Example
48
+ ///
49
+ /// ```rust,no_run
50
+ /// use kreuzberg::core::extractor::extract_bytes;
51
+ /// use kreuzberg::core::config::ExtractionConfig;
52
+ ///
53
+ /// # async fn example() -> kreuzberg::Result<()> {
54
+ /// let config = ExtractionConfig::default();
55
+ /// let bytes = b"Hello, world!";
56
+ /// let result = extract_bytes(bytes, "text/plain", &config).await?;
57
+ /// println!("Content: {}", result.content);
58
+ /// # Ok(())
59
+ /// # }
60
+ /// ```
61
+ #[cfg_attr(feature = "otel", tracing::instrument(
62
+ skip(config, content),
63
+ fields(
64
+ extraction.mime_type = mime_type,
65
+ extraction.size_bytes = content.len(),
66
+ )
67
+ ))]
68
+ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
69
+ use crate::core::mime;
70
+
71
+ let result = async {
72
+ let validated_mime = mime::validate_mime_type(mime_type)?;
73
+
74
+ match validated_mime.as_str() {
75
+ #[cfg(feature = "office")]
76
+ LEGACY_WORD_MIME_TYPE => {
77
+ let conversion = convert_doc_to_docx(content).await?;
78
+ let mut result =
79
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
80
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
81
+ return Ok(result);
82
+ }
83
+ #[cfg(not(feature = "office"))]
84
+ LEGACY_WORD_MIME_TYPE => {
85
+ return Err(KreuzbergError::UnsupportedFormat(
86
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
87
+ ));
88
+ }
89
+ #[cfg(feature = "office")]
90
+ LEGACY_POWERPOINT_MIME_TYPE => {
91
+ let conversion = convert_ppt_to_pptx(content).await?;
92
+ let mut result =
93
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
94
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
95
+ return Ok(result);
96
+ }
97
+ #[cfg(not(feature = "office"))]
98
+ LEGACY_POWERPOINT_MIME_TYPE => {
99
+ return Err(KreuzbergError::UnsupportedFormat(
100
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
101
+ ));
102
+ }
103
+ _ => {}
104
+ }
105
+
106
+ extract_bytes_with_extractor(content, &validated_mime, config).await
107
+ }
108
+ .await;
109
+
110
+ #[cfg(feature = "otel")]
111
+ if let Err(ref e) = result {
112
+ record_error(e);
113
+ }
114
+
115
+ result
116
+ }
@@ -0,0 +1,240 @@
1
+ //! File-based extraction operations.
2
+ //!
3
+ //! This module handles extraction from filesystem paths, including:
4
+ //! - MIME type detection and validation
5
+ //! - Legacy format conversion (DOC, PPT)
6
+ //! - File validation and reading
7
+ //! - Extraction pipeline orchestration
8
+
9
+ #[cfg(any(feature = "otel", not(feature = "office")))]
10
+ use crate::KreuzbergError;
11
+ use crate::Result;
12
+ use crate::core::config::ExtractionConfig;
13
+ use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
14
+ #[cfg(feature = "office")]
15
+ use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
16
+ use crate::types::ExtractionResult;
17
+ #[cfg(feature = "office")]
18
+ use crate::types::LibreOfficeConversionResult;
19
+ #[cfg(feature = "office")]
20
+ use serde_json::json;
21
+ use std::path::Path;
22
+
23
+ #[cfg(feature = "office")]
24
+ use super::helpers::pool_mime_type;
25
+
26
+ use super::helpers::get_extractor;
27
+
28
/// Reduce a path to its final component for use in telemetry.
///
/// Only the file name is kept so that directory components, which may carry
/// sensitive information, are not recorded in traces.
///
/// # Arguments
///
/// * `path` - The path to sanitize
///
/// # Returns
///
/// The file name as an owned `String`. The literal `"unknown"` is returned when
/// the path has no final component or when that component is not valid UTF-8.
///
/// # Example
///
/// ```rust,ignore
/// let path = Path::new("/home/user/documents/secret.pdf");
/// assert_eq!(sanitize_path(path), "secret.pdf");
/// ```
#[cfg(feature = "otel")]
pub(super) fn sanitize_path(path: &Path) -> String {
    let file_name = path.file_name().and_then(std::ffi::OsStr::to_str);
    match file_name {
        Some(name) => name.to_string(),
        None => "unknown".to_string(),
    }
}
59
+
60
/// Attach error details to the current `tracing` span.
///
/// Three fields are recorded on `tracing::Span::current()`:
/// * `otel.status_code` - the constant string `"ERROR"`
/// * `error.type` - the `Debug` rendering of the error
/// * `error.message` - the `Display` rendering of the error
///
/// NOTE(review): `Span::record` only takes effect for fields that were declared
/// when the span was created — confirm that every instrumented caller declares
/// these three fields.
///
/// # Arguments
///
/// * `error` - The error to record in the span
///
/// # Example
///
/// ```rust,ignore
/// let result = extract_file("doc.pdf", None, &config).await;
/// #[cfg(feature = "otel")]
/// if let Err(ref e) = result {
///     record_error(e);
/// }
/// result
/// ```
#[cfg(feature = "otel")]
pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
    let debug_repr = format!("{:?}", error);
    let message = error.to_string();

    tracing::Span::current()
        .record("otel.status_code", "ERROR")
        .record("error.type", debug_repr)
        .record("error.message", message);
}
86
+
87
+ /// Extract content from a file.
88
+ ///
89
+ /// This is the main entry point for file-based extraction. It performs the following steps:
90
+ /// 1. Check cache for existing result (if caching enabled)
91
+ /// 2. Detect or validate MIME type
92
+ /// 3. Select appropriate extractor from registry
93
+ /// 4. Extract content
94
+ /// 5. Run post-processing pipeline
95
+ /// 6. Store result in cache (if caching enabled)
96
+ ///
97
+ /// # Arguments
98
+ ///
99
+ /// * `path` - Path to the file to extract
100
+ /// * `mime_type` - Optional MIME type override. If None, will be auto-detected
101
+ /// * `config` - Extraction configuration
102
+ ///
103
+ /// # Returns
104
+ ///
105
+ /// An `ExtractionResult` containing the extracted content and metadata.
106
+ ///
107
+ /// # Errors
108
+ ///
109
+ /// Returns `KreuzbergError::Validation` if the file doesn't exist or path is invalid.
110
+ /// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
111
+ /// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
112
+ ///
113
+ /// # Example
114
+ ///
115
+ /// ```rust,no_run
116
+ /// use kreuzberg::core::extractor::extract_file;
117
+ /// use kreuzberg::core::config::ExtractionConfig;
118
+ ///
119
+ /// # async fn example() -> kreuzberg::Result<()> {
120
+ /// let config = ExtractionConfig::default();
121
+ /// let result = extract_file("document.pdf", None, &config).await?;
122
+ /// println!("Content: {}", result.content);
123
+ /// # Ok(())
124
+ /// # }
125
+ /// ```
126
+ #[cfg_attr(feature = "otel", tracing::instrument(
127
+ skip(config, path),
128
+ fields(
129
+ extraction.filename = tracing::field::Empty,
130
+ )
131
+ ))]
132
+ pub async fn extract_file(
133
+ path: impl AsRef<Path>,
134
+ mime_type: Option<&str>,
135
+ config: &ExtractionConfig,
136
+ ) -> Result<ExtractionResult> {
137
+ use crate::core::{io, mime};
138
+
139
+ let path = path.as_ref();
140
+
141
+ #[cfg(feature = "otel")]
142
+ {
143
+ let span = tracing::Span::current();
144
+ span.record("extraction.filename", sanitize_path(path));
145
+ }
146
+
147
+ let result = async {
148
+ io::validate_file_exists(path)?;
149
+
150
+ let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
151
+
152
+ match detected_mime.as_str() {
153
+ #[cfg(feature = "office")]
154
+ LEGACY_WORD_MIME_TYPE => {
155
+ let original_bytes = tokio::fs::read(path).await?;
156
+ let conversion = convert_doc_to_docx(&original_bytes).await?;
157
+ let mut result =
158
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
159
+ apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
160
+ return Ok(result);
161
+ }
162
+ #[cfg(not(feature = "office"))]
163
+ LEGACY_WORD_MIME_TYPE => {
164
+ return Err(KreuzbergError::UnsupportedFormat(
165
+ "Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
166
+ ));
167
+ }
168
+ #[cfg(feature = "office")]
169
+ LEGACY_POWERPOINT_MIME_TYPE => {
170
+ let original_bytes = tokio::fs::read(path).await?;
171
+ let conversion = convert_ppt_to_pptx(&original_bytes).await?;
172
+ let mut result =
173
+ extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
174
+ apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
175
+ return Ok(result);
176
+ }
177
+ #[cfg(not(feature = "office"))]
178
+ LEGACY_POWERPOINT_MIME_TYPE => {
179
+ return Err(KreuzbergError::UnsupportedFormat(
180
+ "Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
181
+ ));
182
+ }
183
+ _ => {}
184
+ }
185
+
186
+ extract_file_with_extractor(path, &detected_mime, config).await
187
+ }
188
+ .await;
189
+
190
+ #[cfg(feature = "otel")]
191
+ if let Err(ref e) = result {
192
+ record_error(e);
193
+ }
194
+
195
+ result
196
+ }
197
+
198
+ pub(in crate::core::extractor) async fn extract_file_with_extractor(
199
+ path: &Path,
200
+ mime_type: &str,
201
+ config: &ExtractionConfig,
202
+ ) -> Result<ExtractionResult> {
203
+ crate::extractors::ensure_initialized()?;
204
+
205
+ let extractor = get_extractor(mime_type)?;
206
+ let mut result = extractor.extract_file(path, mime_type, config).await?;
207
+ result = crate::core::pipeline::run_pipeline(result, config).await?;
208
+ Ok(result)
209
+ }
210
+
211
+ pub(in crate::core::extractor) async fn extract_bytes_with_extractor(
212
+ content: &[u8],
213
+ mime_type: &str,
214
+ config: &ExtractionConfig,
215
+ ) -> Result<ExtractionResult> {
216
+ crate::extractors::ensure_initialized()?;
217
+
218
+ let extractor = get_extractor(mime_type)?;
219
+ let mut result = extractor.extract_bytes(content, mime_type, config).await?;
220
+ result = crate::core::pipeline::run_pipeline(result, config).await?;
221
+ Ok(result)
222
+ }
223
+
224
/// Stamp an extraction result with details of the LibreOffice conversion that produced it.
///
/// The result's `mime_type` is replaced with the pooled form of `legacy_mime`, and a
/// `"libreoffice_conversion"` entry is inserted into `metadata.additional` holding the
/// converter name plus the conversion's original format, target format and target MIME type.
///
/// # Arguments
///
/// * `result` - Extraction result to update in place
/// * `legacy_mime` - MIME type of the document before conversion
/// * `conversion` - Outcome of the LibreOffice conversion
#[cfg(feature = "office")]
pub(in crate::core::extractor) fn apply_libreoffice_metadata(
    result: &mut ExtractionResult,
    legacy_mime: &str,
    conversion: &LibreOfficeConversionResult,
) {
    result.mime_type = pool_mime_type(legacy_mime);

    let conversion_details = json!({
        "converter": "libreoffice",
        "original_format": conversion.original_format,
        "target_format": conversion.target_format,
        "target_mime": conversion.target_mime,
    });
    result
        .metadata
        .additional
        .insert("libreoffice_conversion".to_string(), conversion_details);
}
@@ -0,0 +1,71 @@
1
+ //! Helper functions and utilities for extraction operations.
2
+ //!
3
+ //! This module provides shared utilities used across extraction modules.
4
+
5
+ use crate::plugins::DocumentExtractor;
6
+ #[cfg(feature = "office")]
7
+ use crate::utils::intern_mime_type;
8
+ use crate::utils::{PoolSizeHint, estimate_pool_size};
9
+ use crate::{KreuzbergError, Result};
10
+ use std::sync::Arc;
11
+
12
+ /// Get an extractor from the registry.
13
+ ///
14
+ /// This function acquires the registry read lock and retrieves the appropriate
15
+ /// extractor for the given MIME type.
16
+ ///
17
+ /// # Performance
18
+ ///
19
+ /// RwLock read + HashMap lookup is ~100ns, fast enough without caching.
20
+ /// Removed thread-local cache to avoid Tokio work-stealing scheduler issues.
21
+ pub(in crate::core::extractor) fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
22
+ let registry = crate::plugins::registry::get_document_extractor_registry();
23
+ let registry_read = registry
24
+ .read()
25
+ .map_err(|e| KreuzbergError::Other(format!("Document extractor registry lock poisoned: {}", e)))?;
26
+ registry_read.get(mime_type)
27
+ }
28
+
29
+ /// Get optimal pool sizing hint for a document.
30
+ ///
31
+ /// This function calculates recommended pool sizes based on the document's
32
+ /// file size and MIME type. The hint can be used to create appropriately
33
+ /// sized thread pools for extraction, reducing memory waste from over-allocation.
34
+ ///
35
+ /// # Arguments
36
+ ///
37
+ /// * `file_size` - The size of the file in bytes
38
+ /// * `mime_type` - The MIME type of the document
39
+ ///
40
+ /// # Returns
41
+ ///
42
+ /// A `PoolSizeHint` with recommended pool configurations
43
+ ///
44
+ /// # Example
45
+ ///
46
+ /// ```rust,ignore
47
+ /// use kreuzberg::core::extractor::get_pool_sizing_hint;
48
+ ///
49
+ /// let hint = get_pool_sizing_hint(5_000_000, "application/pdf");
50
+ /// println!("Recommended string buffers: {}", hint.string_buffer_count);
51
+ /// ```
52
+ #[inline]
53
+ pub fn get_pool_sizing_hint(file_size: u64, mime_type: &str) -> PoolSizeHint {
54
+ estimate_pool_size(file_size, mime_type)
55
+ }
56
+
57
/// Normalize a MIME type through the interning pool and return it as an owned `String`.
///
/// The value is first passed to `intern_mime_type` and the interned value is
/// then converted with `to_string()`, because `ExtractionResult::mime_type`
/// stores an owned `String`.
///
/// # Performance
///
/// Whatever the interning pool saves applies to the lookup only: the final
/// `to_string()` hands back a fresh owned `String` on every call.
///
/// Only compiled when the `office` feature is enabled.
#[cfg(feature = "office")]
pub(in crate::core::extractor) fn pool_mime_type(mime_type: &str) -> String {
    let interned = intern_mime_type(mime_type);
    interned.to_string()
}
@@ -0,0 +1,62 @@
1
+ //! Legacy synchronous extraction for WASM compatibility.
2
+ //!
3
+ //! This module provides truly synchronous extraction implementations
4
+ //! for environments where Tokio runtime is not available (e.g., WASM).
5
+
6
+ /// Synchronous extraction implementation for WASM compatibility.
7
+ ///
8
+ /// This function performs extraction without requiring a tokio runtime.
9
+ /// It calls the sync extractor methods directly.
10
+ ///
11
+ /// # Arguments
12
+ ///
13
+ /// * `content` - The byte content to extract
14
+ /// * `mime_type` - Optional MIME type to validate/use
15
+ /// * `config` - Optional extraction configuration
16
+ ///
17
+ /// # Returns
18
+ ///
19
+ /// An `ExtractionResult` or a `KreuzbergError`
20
+ ///
21
+ /// # Implementation Notes
22
+ ///
23
+ /// This is called when the `tokio-runtime` feature is disabled.
24
+ /// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
25
+ #[cfg(not(feature = "tokio-runtime"))]
26
+ pub(super) fn extract_bytes_sync_impl(
27
+ content: Vec<u8>,
28
+ mime_type: Option<String>,
29
+ config: Option<crate::core::config::ExtractionConfig>,
30
+ ) -> crate::Result<crate::types::ExtractionResult> {
31
+ use crate::KreuzbergError;
32
+ use crate::core::extractor::helpers::get_extractor;
33
+ use crate::core::mime;
34
+
35
+ let config = config.unwrap_or_default();
36
+
37
+ let validated_mime = if let Some(mime) = mime_type {
38
+ mime::validate_mime_type(&mime)?
39
+ } else {
40
+ return Err(KreuzbergError::Validation {
41
+ message: "MIME type is required for synchronous extraction".to_string(),
42
+ source: None,
43
+ });
44
+ };
45
+
46
+ crate::extractors::ensure_initialized()?;
47
+
48
+ let extractor = get_extractor(&validated_mime)?;
49
+
50
+ let sync_extractor = extractor.as_sync_extractor().ok_or_else(|| {
51
+ KreuzbergError::UnsupportedFormat(format!(
52
+ "Extractor for '{}' does not support synchronous extraction",
53
+ validated_mime
54
+ ))
55
+ })?;
56
+
57
+ let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
58
+
59
+ result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
60
+
61
+ Ok(result)
62
+ }