kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,289 @@
1
+ //! Bounding box geometry for PDF text positioning.
2
+ //!
3
+ //! This module provides the BoundingBox type and geometric operations used
4
+ //! for spatial analysis of text elements in PDF documents.
5
+
6
// Axis weights applied by `BoundingBox::weighted_distance`: horizontal offsets
// between centers count five times as much as vertical ones.
const WEIGHTED_DISTANCE_X_WEIGHT: f32 = 5.0;
const WEIGHTED_DISTANCE_Y_WEIGHT: f32 = 1.0;

// Smallest absolute width/height accepted by `BoundingBox::new`.
const MIN_BOX_EXTENT: f32 = 1e-10;

/// An axis-aligned bounding box for text or elements.
///
/// All fields are public, so a value can be built directly or through
/// [`BoundingBox::new_unchecked`] without any validation. The area-based
/// helpers ([`BoundingBox::iou`], [`BoundingBox::intersection_ratio`]) treat a
/// box with `right < left` or `bottom < top` as having zero area.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct BoundingBox {
    /// Left x-coordinate
    pub left: f32,
    /// Top y-coordinate
    pub top: f32,
    /// Right x-coordinate
    pub right: f32,
    /// Bottom y-coordinate
    pub bottom: f32,
}

impl BoundingBox {
    /// Create a new bounding box with zero-size validation.
    ///
    /// Only the *magnitude* of the extents is validated. The coordinates are
    /// stored exactly as given and are not reordered, so a box passed with
    /// `right < left` (or `bottom < top`) is accepted and keeps that
    /// orientation.
    ///
    /// # Arguments
    ///
    /// * `left` - Left x-coordinate
    /// * `top` - Top y-coordinate
    /// * `right` - Right x-coordinate
    /// * `bottom` - Bottom y-coordinate
    ///
    /// # Returns
    ///
    /// `Ok(BoundingBox)` if both extents are at least 1e-10 in absolute value,
    /// otherwise `Err` with a human-readable message.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The width or height evaluates to NaN (a NaN coordinate, or two
    ///   infinities of the same sign on one axis)
    /// - Absolute width (`|right - left|`) is less than 1e-10 (near-zero)
    /// - Absolute height (`|bottom - top|`) is less than 1e-10 (near-zero)
    pub fn new(left: f32, top: f32, right: f32, bottom: f32) -> std::result::Result<BoundingBox, String> {
        let width = (right - left).abs();
        let height = (bottom - top).abs();

        // Every ordered comparison with NaN is false, so the near-zero check
        // below would silently accept NaN extents. Reject them explicitly.
        if width.is_nan() || height.is_nan() {
            return Err(format!(
                "BoundingBox has invalid (NaN) extent: width={}, height={}",
                width, height
            ));
        }

        if width < MIN_BOX_EXTENT || height < MIN_BOX_EXTENT {
            return Err(format!(
                "BoundingBox has zero or near-zero area: width={}, height={}",
                width, height
            ));
        }

        Ok(Self {
            left,
            top,
            right,
            bottom,
        })
    }

    /// Create a new bounding box without validation (unchecked).
    ///
    /// This is useful when you know the coordinates are valid or want to
    /// defer validation. Use with caution - invalid boxes may cause issues
    /// in calculations like area, width, and height.
    ///
    /// # Arguments
    ///
    /// * `left` - Left x-coordinate
    /// * `top` - Top y-coordinate
    /// * `right` - Right x-coordinate
    /// * `bottom` - Bottom y-coordinate
    ///
    /// # Returns
    ///
    /// A BoundingBox without any validation
    pub fn new_unchecked(left: f32, top: f32, right: f32, bottom: f32) -> BoundingBox {
        Self {
            left,
            top,
            right,
            bottom,
        }
    }

    /// Get the width of the bounding box.
    ///
    /// # Returns
    ///
    /// The signed width (`right - left`). No absolute value is taken, so the
    /// result is negative when `right < left`; none of the constructors
    /// reorder coordinates.
    pub fn width(&self) -> f32 {
        self.right - self.left
    }

    /// Get the height of the bounding box.
    ///
    /// # Returns
    ///
    /// The signed height (`bottom - top`). No absolute value is taken, so the
    /// result is negative when `bottom < top`; none of the constructors
    /// reorder coordinates.
    pub fn height(&self) -> f32 {
        self.bottom - self.top
    }

    /// Calculate the Intersection over Union (IOU) between this bounding box and another.
    ///
    /// IOU = intersection_area / union_area
    ///
    /// # Arguments
    ///
    /// * `other` - The other bounding box to compare with
    ///
    /// # Returns
    ///
    /// The IOU value between 0.0 and 1.0, or 0.0 when the union area is not
    /// positive (for example when both boxes are degenerate).
    pub fn iou(&self, other: &BoundingBox) -> f32 {
        let intersection_area = self.calculate_intersection_area(other);
        let union_area = self.calculate_area() + other.calculate_area() - intersection_area;

        // Guard the division: degenerate boxes yield a zero union.
        if union_area <= 0.0 {
            0.0
        } else {
            intersection_area / union_area
        }
    }

    /// Calculate the weighted distance between the centers of two bounding boxes.
    ///
    /// The distance is weighted with X-axis having weight 5.0 and Y-axis having weight 1.0,
    /// and is the sum of the weighted absolute offsets (not a Euclidean norm).
    ///
    /// # Arguments
    ///
    /// * `other` - The other bounding box to compare with
    ///
    /// # Returns
    ///
    /// The weighted distance value
    pub fn weighted_distance(&self, other: &BoundingBox) -> f32 {
        let (self_center_x, self_center_y) = self.center();
        let (other_center_x, other_center_y) = other.center();

        let dx = (self_center_x - other_center_x).abs();
        let dy = (self_center_y - other_center_y).abs();

        dx * WEIGHTED_DISTANCE_X_WEIGHT + dy * WEIGHTED_DISTANCE_Y_WEIGHT
    }

    /// Calculate the intersection ratio relative to this bounding box's area.
    ///
    /// intersection_ratio = intersection_area / self_area
    ///
    /// # Arguments
    ///
    /// * `other` - The other bounding box to compare with
    ///
    /// # Returns
    ///
    /// The intersection ratio between 0.0 and 1.0, or 0.0 when this box has
    /// no positive area.
    pub fn intersection_ratio(&self, other: &BoundingBox) -> f32 {
        let self_area = self.calculate_area();

        if self_area <= 0.0 {
            0.0
        } else {
            self.calculate_intersection_area(other) / self_area
        }
    }

    /// Check if this bounding box contains another bounding box.
    ///
    /// Edges are inclusive: a box contains itself.
    pub fn contains(&self, other: &BoundingBox) -> bool {
        other.left >= self.left && other.right <= self.right && other.top >= self.top && other.bottom <= self.bottom
    }

    /// Calculate the center coordinates of this bounding box as `(x, y)`.
    pub fn center(&self) -> (f32, f32) {
        ((self.left + self.right) / 2.0, (self.top + self.bottom) / 2.0)
    }

    /// Merge this bounding box with another, creating a box that contains both.
    ///
    /// Takes the minimum of the `left`/`top` edges and the maximum of the
    /// `right`/`bottom` edges.
    pub fn merge(&self, other: &BoundingBox) -> BoundingBox {
        Self {
            left: self.left.min(other.left),
            top: self.top.min(other.top),
            right: self.right.max(other.right),
            bottom: self.bottom.max(other.bottom),
        }
    }

    /// Calculate a relaxed IOU with an expansion factor.
    ///
    /// Each box is grown on every side by `relaxation` times its shorter
    /// extent before the regular [`BoundingBox::iou`] is computed, so boxes
    /// that nearly touch can still register an overlap.
    ///
    /// # Arguments
    ///
    /// * `other` - The other bounding box to compare with
    /// * `relaxation` - Fraction of the shorter side by which each box is expanded
    ///
    /// # Returns
    ///
    /// The IOU of the two expanded boxes
    pub fn relaxed_iou(&self, other: &BoundingBox, relaxation: f32) -> f32 {
        self.expanded(relaxation).iou(&other.expanded(relaxation))
    }

    /// Grow this box by `relaxation` times its shorter extent on every side.
    ///
    /// The expanded `left`/`top` edges are clamped at 0.0.
    /// NOTE(review): the clamp presumably reflects a page origin at (0, 0) — confirm.
    fn expanded(&self, relaxation: f32) -> BoundingBox {
        // A negative extent (inverted box) contributes no expansion.
        let expansion = relaxation * self.width().min(self.height()).max(0.0);

        Self {
            left: (self.left - expansion).max(0.0),
            top: (self.top - expansion).max(0.0),
            right: self.right + expansion,
            bottom: self.bottom + expansion,
        }
    }

    /// Calculate the area of this bounding box; negative extents count as zero.
    fn calculate_area(&self) -> f32 {
        let width = (self.right - self.left).max(0.0);
        let height = (self.bottom - self.top).max(0.0);
        width * height
    }

    /// Calculate the intersection area between this bounding box and another.
    ///
    /// Returns 0.0 when the boxes do not overlap.
    fn calculate_intersection_area(&self, other: &BoundingBox) -> f32 {
        let left = self.left.max(other.left);
        let top = self.top.max(other.top);
        let right = self.right.min(other.right);
        let bottom = self.bottom.min(other.bottom);

        let width = (right - left).max(0.0);
        let height = (bottom - top).max(0.0);
        width * height
    }
}
241
+
242
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared check: the constructor refused the box and said why.
    fn assert_rejected_as_degenerate(outcome: std::result::Result<BoundingBox, String>) {
        assert!(outcome.is_err());
        let message = outcome.unwrap_err();
        assert!(message.contains("zero or near-zero area"));
    }

    /// A box with positive extents on both axes is accepted.
    #[test]
    fn test_bounding_box_new_valid() {
        let outcome = BoundingBox::new(10.0, 20.0, 30.0, 40.0);
        assert!(outcome.is_ok());

        let accepted = outcome.unwrap();
        assert_eq!(accepted.width(), 20.0);
        assert_eq!(accepted.height(), 20.0);
    }

    /// Identical left and right edges are rejected.
    #[test]
    fn test_bounding_box_new_zero_width() {
        assert_rejected_as_degenerate(BoundingBox::new(10.0, 20.0, 10.0, 40.0));
    }

    /// Identical top and bottom edges are rejected.
    #[test]
    fn test_bounding_box_new_zero_height() {
        assert_rejected_as_degenerate(BoundingBox::new(10.0, 20.0, 30.0, 20.0));
    }

    /// The unchecked constructor stores the coordinates as given.
    #[test]
    fn test_bounding_box_new_unchecked() {
        let raw = BoundingBox::new_unchecked(10.0, 20.0, 30.0, 40.0);
        assert_eq!(raw.width(), 20.0);
        assert_eq!(raw.height(), 20.0);
    }

    /// Width and height are derived from the public fields.
    #[test]
    fn test_bounding_box_width_and_height() {
        let literal = BoundingBox {
            left: 5.0,
            top: 10.0,
            right: 25.0,
            bottom: 50.0,
        };
        assert_eq!(literal.width(), 20.0);
        assert_eq!(literal.height(), 40.0);
    }
}
@@ -0,0 +1,199 @@
1
+ //! Font size clustering for PDF hierarchy extraction.
2
+ //!
3
+ //! This module implements k-means clustering on font sizes to identify
4
+ //! document hierarchy levels (headings vs body text).
5
+
6
+ use super::extraction::TextBlock;
7
+ use crate::pdf::error::{PdfError, Result};
8
+
9
+ // K-means tuning: the cap on refinement passes, and the centroid movement below which every centroid counts as converged
10
+ const KMEANS_MAX_ITERATIONS: usize = 100;
11
+ const KMEANS_CONVERGENCE_THRESHOLD: f32 = 0.01;
12
+
13
+ /// A group of text blocks whose font sizes were assigned to the same k-means centroid.
14
+ #[derive(Debug, Clone)]
15
+ pub struct FontSizeCluster {
16
+ /// Font size at the center of this cluster, as left by the last k-means centroid update
17
+ pub centroid: f32,
18
+ /// The text blocks assigned to this centroid (owned copies of the input blocks)
19
+ pub members: Vec<TextBlock>,
20
+ }
21
+
22
+ /// Cluster text blocks by font size using k-means algorithm.
23
+ ///
24
+ /// Uses k-means clustering to group text blocks by their font size, which helps
25
+ /// identify document hierarchy levels (H1, H2, Body, etc.). The algorithm:
26
+ /// 1. Extracts font sizes from text blocks
27
+ /// 2. Applies k-means clustering to group similar font sizes
28
+ /// 3. Sorts clusters by centroid size in descending order (largest = H1)
29
+ /// 4. Returns clusters with their member blocks
30
+ ///
31
+ /// # Arguments
32
+ ///
33
+ /// * `blocks` - Slice of TextBlock objects to cluster
34
+ /// * `k` - Number of clusters to create
35
+ ///
36
+ /// # Returns
37
+ ///
38
+ /// Result with vector of FontSizeCluster ordered by size (descending),
39
+ /// or an error if clustering fails
40
+ ///
41
+ /// # Example
42
+ ///
43
+ /// ```rust,no_run
44
+ /// # #[cfg(feature = "pdf")]
45
+ /// # {
46
+ /// use kreuzberg::pdf::hierarchy::{TextBlock, BoundingBox, cluster_font_sizes};
47
+ ///
48
+ /// let blocks = vec![
49
+ /// TextBlock {
50
+ /// text: "Title".to_string(),
51
+ /// bbox: BoundingBox { left: 0.0, top: 0.0, right: 100.0, bottom: 24.0 },
52
+ /// font_size: 24.0,
53
+ /// },
54
+ /// TextBlock {
55
+ /// text: "Body".to_string(),
56
+ /// bbox: BoundingBox { left: 0.0, top: 30.0, right: 100.0, bottom: 42.0 },
57
+ /// font_size: 12.0,
58
+ /// },
59
+ /// ];
60
+ ///
61
+ /// let clusters = cluster_font_sizes(&blocks, 2).unwrap();
62
+ /// assert_eq!(clusters.len(), 2);
63
+ /// assert_eq!(clusters[0].centroid, 24.0); // Largest is first
64
+ /// # }
65
+ /// ```
66
+ pub fn cluster_font_sizes(blocks: &[TextBlock], k: usize) -> Result<Vec<FontSizeCluster>> {
67
+ if blocks.is_empty() {
68
+ return Ok(Vec::new());
69
+ }
70
+
71
+ if k == 0 {
72
+ return Err(PdfError::TextExtractionFailed("K must be greater than 0".to_string()));
73
+ }
74
+
75
+ let actual_k = k.min(blocks.len());
76
+
77
+ // Extract unique font sizes for initialization
78
+ let mut font_sizes: Vec<f32> = blocks.iter().map(|b| b.font_size).collect();
79
+ font_sizes.sort_by(|a, b| b.partial_cmp(a).expect("Failed to compare font sizes during sorting")); // Sort descending
80
+ font_sizes.dedup(); // Remove duplicates to get unique font sizes
81
+
82
+ // Initialize centroids using actual font sizes from the data
83
+ // This is more robust than dividing the range uniformly
84
+ let mut centroids: Vec<f32> = Vec::new();
85
+
86
+ if font_sizes.len() >= actual_k {
87
+ // If we have at least k unique font sizes, pick them evenly spaced
88
+ let step = font_sizes.len() / actual_k;
89
+ for i in 0..actual_k {
90
+ let idx = i * step;
91
+ centroids.push(font_sizes[idx.min(font_sizes.len() - 1)]);
92
+ }
93
+ } else {
94
+ // If we have fewer unique sizes than k, use all of them and fill with interpolated values
95
+ centroids = font_sizes.clone();
96
+
97
+ // Add interpolated centroids between existing ones to reach desired k
98
+ let min_font = font_sizes[font_sizes.len() - 1];
99
+ let max_font = font_sizes[0];
100
+ let range = max_font - min_font;
101
+
102
+ while centroids.len() < actual_k {
103
+ let t = centroids.len() as f32 / (actual_k - 1) as f32;
104
+ let interpolated = max_font - t * range;
105
+ centroids.push(interpolated);
106
+ }
107
+
108
+ centroids.sort_by(|a, b| b.partial_cmp(a).expect("Failed to compare centroids during sorting"));
109
+ // Keep sorted descending
110
+ }
111
+
112
+ // Run k-means clustering for a fixed number of iterations
113
+ for _ in 0..KMEANS_MAX_ITERATIONS {
114
+ // Assign blocks to nearest centroid
115
+ let clusters = assign_blocks_to_centroids(blocks, &centroids);
116
+
117
+ // Update centroids
118
+ let mut new_centroids = Vec::with_capacity(actual_k);
119
+ for (i, cluster) in clusters.iter().enumerate() {
120
+ if !cluster.is_empty() {
121
+ new_centroids.push(cluster.iter().map(|b| b.font_size).sum::<f32>() / cluster.len() as f32);
122
+ } else {
123
+ new_centroids.push(centroids[i]);
124
+ }
125
+ }
126
+
127
+ // Check for convergence
128
+ let converged = centroids
129
+ .iter()
130
+ .zip(new_centroids.iter())
131
+ .all(|(old, new)| (old - new).abs() < KMEANS_CONVERGENCE_THRESHOLD);
132
+
133
+ std::mem::swap(&mut centroids, &mut new_centroids);
134
+
135
+ if converged {
136
+ break;
137
+ }
138
+ }
139
+
140
+ // Final assignment to create result
141
+ let clusters = assign_blocks_to_centroids(blocks, &centroids);
142
+
143
+ // Create FontSizeCluster objects with centroids
144
+ let mut result: Vec<FontSizeCluster> = Vec::new();
145
+
146
+ for i in 0..actual_k {
147
+ if !clusters[i].is_empty() {
148
+ let centroid_value = centroids[i];
149
+ result.push(FontSizeCluster {
150
+ centroid: centroid_value,
151
+ members: clusters[i].clone(),
152
+ });
153
+ }
154
+ }
155
+
156
+ // Sort by centroid size in descending order (largest font = H1)
157
+ result.sort_by(|a, b| {
158
+ b.centroid
159
+ .partial_cmp(&a.centroid)
160
+ .expect("Failed to compare centroids during final sort")
161
+ });
162
+
163
+ Ok(result)
164
+ }
165
+
166
+ /// Helper function to assign blocks to their nearest centroid.
167
+ ///
168
+ /// Iterates through blocks and finds the closest centroid for each block,
169
+ /// grouping them into clusters. Used in k-means clustering iterations.
170
+ ///
171
+ /// # Arguments
172
+ ///
173
+ /// * `blocks` - Slice of TextBlock objects to assign
174
+ /// * `centroids` - Slice of centroid values (one per cluster)
175
+ ///
176
+ /// # Returns
177
+ ///
178
+ /// A vector of clusters, where each cluster contains the TextBlock objects
179
+ /// assigned to that centroid
180
+ fn assign_blocks_to_centroids(blocks: &[TextBlock], centroids: &[f32]) -> Vec<Vec<TextBlock>> {
181
+ let mut clusters: Vec<Vec<TextBlock>> = vec![Vec::new(); centroids.len()];
182
+
183
+ for block in blocks {
184
+ let mut min_distance = f32::INFINITY;
185
+ let mut best_cluster = 0;
186
+
187
+ for (i, &centroid) in centroids.iter().enumerate() {
188
+ let distance = (block.font_size - centroid).abs();
189
+ if distance < min_distance {
190
+ min_distance = distance;
191
+ best_cluster = i;
192
+ }
193
+ }
194
+
195
+ clusters[best_cluster].push(block.clone());
196
+ }
197
+
198
+ clusters
199
+ }