kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,313 @@
1
+ //! Integration tests for the /chunk API endpoint.
2
+
3
+ #![cfg(feature = "api")]
4
+
5
+ use axum::{
6
+ body::Body,
7
+ http::{Request, StatusCode},
8
+ };
9
+ use serde_json::json;
10
+ use tower::ServiceExt;
11
+
12
+ use kreuzberg::{ExtractionConfig, api::create_router};
13
+
14
+ #[tokio::test]
15
+ async fn test_chunk_basic() {
16
+ let app = create_router(ExtractionConfig::default());
17
+ let response = app
18
+ .oneshot(
19
+ Request::builder()
20
+ .uri("/chunk")
21
+ .method("POST")
22
+ .header("content-type", "application/json")
23
+ .body(Body::from(
24
+ json!({
25
+ "text": "Short text. More text here. Even more content to chunk."
26
+ })
27
+ .to_string(),
28
+ ))
29
+ .unwrap(),
30
+ )
31
+ .await
32
+ .unwrap();
33
+
34
+ assert_eq!(response.status(), StatusCode::OK);
35
+ }
36
+
37
+ #[tokio::test]
38
+ async fn test_chunk_empty_text_returns_400() {
39
+ let app = create_router(ExtractionConfig::default());
40
+ let response = app
41
+ .oneshot(
42
+ Request::builder()
43
+ .uri("/chunk")
44
+ .method("POST")
45
+ .header("content-type", "application/json")
46
+ .body(Body::from(json!({"text": ""}).to_string()))
47
+ .unwrap(),
48
+ )
49
+ .await
50
+ .unwrap();
51
+
52
+ assert_eq!(response.status(), StatusCode::BAD_REQUEST);
53
+ }
54
+
55
+ #[tokio::test]
56
+ async fn test_chunk_markdown_strategy() {
57
+ let app = create_router(ExtractionConfig::default());
58
+ let response = app
59
+ .oneshot(
60
+ Request::builder()
61
+ .uri("/chunk")
62
+ .method("POST")
63
+ .header("content-type", "application/json")
64
+ .body(Body::from(
65
+ json!({
66
+ "text": "# Heading\n\nParagraph text here.",
67
+ "chunker_type": "markdown"
68
+ })
69
+ .to_string(),
70
+ ))
71
+ .unwrap(),
72
+ )
73
+ .await
74
+ .unwrap();
75
+
76
+ assert_eq!(response.status(), StatusCode::OK);
77
+ }
78
+
79
+ #[tokio::test]
80
+ async fn test_chunk_response_structure() {
81
+ use kreuzberg::api::ChunkResponse;
82
+
83
+ let app = create_router(ExtractionConfig::default());
84
+ let response = app
85
+ .oneshot(
86
+ Request::builder()
87
+ .uri("/chunk")
88
+ .method("POST")
89
+ .header("content-type", "application/json")
90
+ .body(Body::from(
91
+ json!({
92
+ "text": "This is a test. Another sentence here. And one more sentence to ensure we get chunks.",
93
+ "config": {
94
+ "max_characters": 50,
95
+ "overlap": 10,
96
+ "trim": true
97
+ },
98
+ "chunker_type": "text"
99
+ })
100
+ .to_string(),
101
+ ))
102
+ .unwrap(),
103
+ )
104
+ .await
105
+ .unwrap();
106
+
107
+ assert_eq!(response.status(), StatusCode::OK);
108
+
109
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
110
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
111
+
112
+ // Verify response structure
113
+ assert!(chunk_response.chunk_count > 0);
114
+ assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
115
+ assert_eq!(chunk_response.chunker_type, "text");
116
+ assert_eq!(chunk_response.config.max_characters, 50);
117
+ assert_eq!(chunk_response.config.overlap, 10);
118
+ assert!(chunk_response.config.trim);
119
+ assert!(chunk_response.input_size_bytes > 0);
120
+
121
+ // Verify chunk metadata
122
+ for (idx, chunk) in chunk_response.chunks.iter().enumerate() {
123
+ assert!(!chunk.content.is_empty());
124
+ assert_eq!(chunk.chunk_index, idx);
125
+ assert_eq!(chunk.total_chunks, chunk_response.chunk_count);
126
+ assert!(chunk.byte_end > chunk.byte_start);
127
+ }
128
+ }
129
+
130
+ #[tokio::test]
131
+ async fn test_chunk_invalid_strategy_returns_400() {
132
+ let app = create_router(ExtractionConfig::default());
133
+ let response = app
134
+ .oneshot(
135
+ Request::builder()
136
+ .uri("/chunk")
137
+ .method("POST")
138
+ .header("content-type", "application/json")
139
+ .body(Body::from(
140
+ json!({
141
+ "text": "Test text",
142
+ "chunker_type": "invalid_type"
143
+ })
144
+ .to_string(),
145
+ ))
146
+ .unwrap(),
147
+ )
148
+ .await
149
+ .unwrap();
150
+
151
+ assert_eq!(response.status(), StatusCode::BAD_REQUEST);
152
+ }
153
+
154
+ #[tokio::test]
155
+ async fn test_chunk_with_defaults() {
156
+ use kreuzberg::api::ChunkResponse;
157
+
158
+ let app = create_router(ExtractionConfig::default());
159
+ let response = app
160
+ .oneshot(
161
+ Request::builder()
162
+ .uri("/chunk")
163
+ .method("POST")
164
+ .header("content-type", "application/json")
165
+ .body(Body::from(
166
+ json!({
167
+ "text": "This is a test sentence. Another sentence here."
168
+ })
169
+ .to_string(),
170
+ ))
171
+ .unwrap(),
172
+ )
173
+ .await
174
+ .unwrap();
175
+
176
+ assert_eq!(response.status(), StatusCode::OK);
177
+
178
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
179
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
180
+
181
+ // Verify defaults are applied
182
+ assert_eq!(chunk_response.config.max_characters, 2000);
183
+ assert_eq!(chunk_response.config.overlap, 100);
184
+ assert!(chunk_response.config.trim);
185
+ assert_eq!(chunk_response.chunker_type, "text");
186
+ }
187
+
188
+ #[tokio::test]
189
+ async fn test_chunk_malformed_json_returns_400() {
190
+ let app = create_router(ExtractionConfig::default());
191
+ let response = app
192
+ .oneshot(
193
+ Request::builder()
194
+ .uri("/chunk")
195
+ .method("POST")
196
+ .header("content-type", "application/json")
197
+ .body(Body::from("{invalid json}"))
198
+ .unwrap(),
199
+ )
200
+ .await
201
+ .unwrap();
202
+
203
+ assert_eq!(response.status(), StatusCode::BAD_REQUEST);
204
+ }
205
+
206
+ #[tokio::test]
207
+ async fn test_chunk_case_insensitive_chunker_type() {
208
+ use kreuzberg::api::ChunkResponse;
209
+
210
+ let app = create_router(ExtractionConfig::default());
211
+ let response = app
212
+ .oneshot(
213
+ Request::builder()
214
+ .uri("/chunk")
215
+ .method("POST")
216
+ .header("content-type", "application/json")
217
+ .body(Body::from(
218
+ json!({
219
+ "text": "# Title\n\nContent here.",
220
+ "chunker_type": "MARKDOWN"
221
+ })
222
+ .to_string(),
223
+ ))
224
+ .unwrap(),
225
+ )
226
+ .await
227
+ .unwrap();
228
+
229
+ assert_eq!(response.status(), StatusCode::OK);
230
+
231
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
232
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
233
+
234
+ // Verify it's normalized to lowercase
235
+ assert_eq!(chunk_response.chunker_type, "markdown");
236
+ }
237
+
238
+ #[tokio::test]
239
+ async fn test_chunk_long_text() {
240
+ use kreuzberg::api::ChunkResponse;
241
+
242
+ let app = create_router(ExtractionConfig::default());
243
+ let long_text = "Lorem ipsum dolor sit amet. ".repeat(200);
244
+
245
+ let response = app
246
+ .oneshot(
247
+ Request::builder()
248
+ .uri("/chunk")
249
+ .method("POST")
250
+ .header("content-type", "application/json")
251
+ .body(Body::from(
252
+ json!({
253
+ "text": long_text,
254
+ "config": {
255
+ "max_characters": 500,
256
+ "overlap": 50
257
+ }
258
+ })
259
+ .to_string(),
260
+ ))
261
+ .unwrap(),
262
+ )
263
+ .await
264
+ .unwrap();
265
+
266
+ assert_eq!(response.status(), StatusCode::OK);
267
+
268
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
269
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
270
+
271
+ // Should have multiple chunks
272
+ assert!(chunk_response.chunk_count > 1);
273
+ assert_eq!(chunk_response.chunks.len(), chunk_response.chunk_count);
274
+ }
275
+
276
+ #[tokio::test]
277
+ async fn test_chunk_custom_config() {
278
+ use kreuzberg::api::ChunkResponse;
279
+
280
+ let app = create_router(ExtractionConfig::default());
281
+ let response = app
282
+ .oneshot(
283
+ Request::builder()
284
+ .uri("/chunk")
285
+ .method("POST")
286
+ .header("content-type", "application/json")
287
+ .body(Body::from(
288
+ json!({
289
+ "text": "Test sentence one. Test sentence two. Test sentence three.",
290
+ "config": {
291
+ "max_characters": 30,
292
+ "overlap": 5,
293
+ "trim": false
294
+ },
295
+ "chunker_type": "text"
296
+ })
297
+ .to_string(),
298
+ ))
299
+ .unwrap(),
300
+ )
301
+ .await
302
+ .unwrap();
303
+
304
+ assert_eq!(response.status(), StatusCode::OK);
305
+
306
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
307
+ let chunk_response: ChunkResponse = serde_json::from_slice(&body).unwrap();
308
+
309
+ // Verify custom config was applied
310
+ assert_eq!(chunk_response.config.max_characters, 30);
311
+ assert_eq!(chunk_response.config.overlap, 5);
312
+ assert!(!chunk_response.config.trim);
313
+ }
@@ -84,9 +84,8 @@ async fn test_embed_with_custom_config() {
84
84
  "texts": ["Test embedding with custom config"],
85
85
  "config": {
86
86
  "model": {
87
- "preset": {
88
- "name": "fast"
89
- }
87
+ "type": "preset",
88
+ "name": "fast"
90
89
  },
91
90
  "batch_size": 32
92
91
  }
@@ -295,9 +294,8 @@ async fn test_embed_different_presets() {
295
294
  "texts": ["Test text"],
296
295
  "config": {
297
296
  "model": {
298
- "preset": {
299
- "name": "fast"
300
- }
297
+ "type": "preset",
298
+ "name": "fast"
301
299
  }
302
300
  }
303
301
  });
@@ -327,9 +325,8 @@ async fn test_embed_different_presets() {
327
325
  "texts": ["Test text"],
328
326
  "config": {
329
327
  "model": {
330
- "preset": {
331
- "name": "balanced"
332
- }
328
+ "type": "preset",
329
+ "name": "balanced"
333
330
  }
334
331
  }
335
332
  });
@@ -257,6 +257,7 @@ fn test_ocr_multipage_efficiency() {
257
257
  backend: "tesseract".to_string(),
258
258
  language: "eng".to_string(),
259
259
  tesseract_config: None,
260
+ output_format: None,
260
261
  }),
261
262
  force_ocr: false,
262
263
  use_cache: true,
@@ -193,6 +193,7 @@ async fn test_concurrent_ocr_processing() {
193
193
  backend: "tesseract".to_string(),
194
194
  language: "eng".to_string(),
195
195
  tesseract_config: None,
196
+ output_format: None,
196
197
  }),
197
198
  force_ocr: false,
198
199
  use_cache: true,
@@ -262,6 +263,7 @@ fn test_concurrent_ocr_cache_stress() {
262
263
  backend: "tesseract".to_string(),
263
264
  language: "eng".to_string(),
264
265
  tesseract_config: None,
266
+ output_format: None,
265
267
  }),
266
268
  force_ocr: false,
267
269
  use_cache: true,
@@ -313,7 +315,10 @@ fn test_concurrent_ocr_cache_stress() {
313
315
  /// - Pipeline can process multiple results in parallel
314
316
  /// - Processors don't interfere with each other
315
317
  /// - Registry reads are thread-safe
318
+ ///
319
+ /// Note: This test is flaky due to timing-dependent concurrent operations.
316
320
  #[tokio::test]
321
+ #[ignore = "flaky concurrency test - timing dependent on system load"]
317
322
  async fn test_concurrent_pipeline_processing() {
318
323
  struct ConcurrentTestProcessor;
319
324
 
@@ -378,6 +383,8 @@ async fn test_concurrent_pipeline_processing() {
378
383
  chunks: None,
379
384
  images: None,
380
385
  pages: None,
386
+ elements: None,
387
+ djot_content: None,
381
388
  };
382
389
 
383
390
  run_pipeline(result, &config).await
@@ -459,6 +459,7 @@ async fn test_extraction_with_ocr_config() {
459
459
  tesseract_config: None,
460
460
  backend: "tesseract".to_string(),
461
461
  language: "eng".to_string(),
462
+ output_format: None,
462
463
  }),
463
464
  force_ocr: true,
464
465
  ..Default::default()
@@ -120,3 +120,133 @@ async fn test_docx_minimal_metadata_extraction() {
120
120
 
121
121
  println!("✅ DOCX minimal metadata extraction test passed!");
122
122
  }
123
+
124
+ #[tokio::test]
125
+ async fn test_docx_keywords_extraction() {
126
+ // This test verifies that DOCX keywords metadata is properly parsed
127
+ // from comma-separated strings into Vec<String> in Metadata.keywords
128
+ //
129
+ // Addresses GitHub issue #309: DOCX keyword extraction was returning
130
+ // strings instead of parsed keyword lists, causing FunctionClauseError
131
+ // in the Elixir binding.
132
+
133
+ use std::io::Write;
134
+ use tempfile::NamedTempFile;
135
+ use zip::CompressionMethod;
136
+ use zip::write::{FileOptions, ZipWriter};
137
+
138
+ // Create a minimal DOCX with keywords metadata
139
+ let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
140
+
141
+ {
142
+ let mut zip = ZipWriter::new(&mut temp_file);
143
+ let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
144
+
145
+ // Add [Content_Types].xml
146
+ zip.start_file("[Content_Types].xml", options).unwrap();
147
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
148
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
149
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
150
+ <Default Extension="xml" ContentType="application/xml"/>
151
+ <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
152
+ <Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>
153
+ </Types>"#).unwrap();
154
+
155
+ // Add _rels/.rels
156
+ zip.start_file("_rels/.rels", options).unwrap();
157
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
158
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
159
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
160
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>
161
+ </Relationships>"#).unwrap();
162
+
163
+ // Add word/document.xml with simple content
164
+ zip.start_file("word/document.xml", options).unwrap();
165
+ zip.write_all(
166
+ br#"<?xml version="1.0" encoding="UTF-8"?>
167
+ <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
168
+ <w:body>
169
+ <w:p>
170
+ <w:r>
171
+ <w:t>Test document for keyword extraction</w:t>
172
+ </w:r>
173
+ </w:p>
174
+ </w:body>
175
+ </w:document>"#,
176
+ )
177
+ .unwrap();
178
+
179
+ // Add docProps/core.xml with keywords (comma-separated string)
180
+ zip.start_file("docProps/core.xml", options).unwrap();
181
+ zip.write_all(
182
+ br#"<?xml version="1.0" encoding="UTF-8"?>
183
+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
184
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
185
+ xmlns:dcterms="http://purl.org/dc/terms/">
186
+ <dc:title>Test Document</dc:title>
187
+ <dc:creator>Test Author</dc:creator>
188
+ <cp:keywords>rust, docx, extraction, metadata, test</cp:keywords>
189
+ <dc:subject>Testing keyword extraction</dc:subject>
190
+ </cp:coreProperties>"#,
191
+ )
192
+ .unwrap();
193
+
194
+ zip.finish().unwrap();
195
+ }
196
+
197
+ // Extract the DOCX file
198
+ let result = extract_file(
199
+ temp_file.path(),
200
+ Some("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
201
+ &ExtractionConfig::default(),
202
+ )
203
+ .await
204
+ .expect("Should extract DOCX with keywords successfully");
205
+
206
+ // Verify content was extracted
207
+ assert!(!result.content.is_empty(), "Content should not be empty");
208
+ assert!(
209
+ result.content.contains("Test document for keyword extraction"),
210
+ "Content should match document text"
211
+ );
212
+
213
+ // Verify keywords were parsed into Vec<String> in Metadata.keywords
214
+ assert!(
215
+ result.metadata.keywords.is_some(),
216
+ "Keywords should be present in metadata.keywords"
217
+ );
218
+
219
+ let keywords = result.metadata.keywords.as_ref().unwrap();
220
+ assert_eq!(
221
+ keywords.len(),
222
+ 5,
223
+ "Should have 5 keywords parsed from comma-separated string"
224
+ );
225
+
226
+ // Verify individual keywords were trimmed and parsed correctly
227
+ assert_eq!(keywords[0], "rust", "First keyword should be 'rust'");
228
+ assert_eq!(keywords[1], "docx", "Second keyword should be 'docx'");
229
+ assert_eq!(keywords[2], "extraction", "Third keyword should be 'extraction'");
230
+ assert_eq!(keywords[3], "metadata", "Fourth keyword should be 'metadata'");
231
+ assert_eq!(keywords[4], "test", "Fifth keyword should be 'test'");
232
+
233
+ // Verify other metadata was also extracted
234
+ assert_eq!(
235
+ result.metadata.additional.get("created_by").and_then(|v| v.as_str()),
236
+ Some("Test Author"),
237
+ "Should have correct creator"
238
+ );
239
+ assert_eq!(
240
+ result.metadata.additional.get("title").and_then(|v| v.as_str()),
241
+ Some("Test Document"),
242
+ "Should have correct title"
243
+ );
244
+ assert_eq!(
245
+ result.metadata.additional.get("subject").and_then(|v| v.as_str()),
246
+ Some("Testing keyword extraction"),
247
+ "Should have correct subject"
248
+ );
249
+
250
+ println!("✅ DOCX keywords extraction test passed!");
251
+ println!(" Extracted keywords: {:?}", keywords);
252
+ }
@@ -53,20 +53,14 @@ async fn test_native_epub_wasteland_extraction() {
53
53
  result.content.len()
54
54
  );
55
55
 
56
- assert!(
57
- result.metadata.additional.contains_key("title"),
58
- "Should extract title metadata"
59
- );
56
+ assert!(result.metadata.title.is_some(), "Should extract title metadata");
60
57
  assert_eq!(
61
- result.metadata.additional.get("title").and_then(|v| v.as_str()),
58
+ result.metadata.title.as_deref(),
62
59
  Some("The Waste Land"),
63
60
  "Should have correct title"
64
61
  );
65
62
 
66
- assert!(
67
- result.metadata.additional.contains_key("creator"),
68
- "Should extract creator metadata"
69
- );
63
+ assert!(result.metadata.authors.is_some(), "Should extract creator metadata");
70
64
 
71
65
  assert!(
72
66
  result.content.contains("April") || result.content.contains("cruellest"),
@@ -105,10 +99,7 @@ async fn test_native_epub_images_extraction() {
105
99
  result.content.len()
106
100
  );
107
101
 
108
- assert!(
109
- result.metadata.additional.contains_key("title"),
110
- "Should extract title metadata"
111
- );
102
+ assert!(result.metadata.title.is_some(), "Should extract title metadata");
112
103
 
113
104
  println!("✅ Images EPUB extraction test passed ({} bytes)", result.content.len());
114
105
  }
@@ -179,7 +170,7 @@ async fn test_native_epub2_cover_extraction() {
179
170
  );
180
171
 
181
172
  assert_eq!(
182
- result.metadata.additional.get("title").and_then(|v| v.as_str()),
173
+ result.metadata.title.as_deref(),
183
174
  Some("Pandoc EPUB Test"),
184
175
  "Should have correct title"
185
176
  );
@@ -100,6 +100,7 @@ async fn test_ocr_simple_english_image_async() {
100
100
  backend: "tesseract".to_string(),
101
101
  language: "eng".to_string(),
102
102
  tesseract_config: None,
103
+ output_format: None,
103
104
  }),
104
105
  force_ocr: true,
105
106
  ..Default::default()
@@ -142,6 +143,7 @@ async fn test_ocr_image_without_text_async() {
142
143
  backend: "tesseract".to_string(),
143
144
  language: "eng".to_string(),
144
145
  tesseract_config: None,
146
+ output_format: None,
145
147
  }),
146
148
  force_ocr: true,
147
149
  ..Default::default()
@@ -115,6 +115,7 @@ pub fn test_config_with_ocr() -> kreuzberg::core::config::ExtractionConfig {
115
115
  backend: "tesseract".to_string(),
116
116
  language: "eng".to_string(),
117
117
  tesseract_config: None,
118
+ output_format: None,
118
119
  }),
119
120
  force_ocr: false,
120
121
  ..Default::default()