kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -1,173 +1,21 @@
1
- //! PDF text hierarchy extraction using pdfium character positions.
1
+ //! PDF text hierarchy extraction and text block analysis.
2
2
  //!
3
3
  //! This module provides functions for extracting character information from PDFs,
4
- //! preserving font size and position data for text hierarchy analysis.
5
- //!
6
- //! Note: Requires the "pdf" feature to be enabled.
4
+ //! merging characters into text blocks, and assigning hierarchy levels based on
5
+ //! font size analysis.
7
6
 
8
- use super::error::{PdfError, Result};
7
+ use super::bounding_box::BoundingBox;
8
+ use super::clustering::FontSizeCluster;
9
9
  use crate::core::config::ExtractionConfig;
10
+ use crate::pdf::error::{PdfError, Result};
10
11
  use pdfium_render::prelude::*;
11
12
 
12
13
  // Magic number constants
13
14
  const DEFAULT_FONT_SIZE: f32 = 12.0;
14
- const WEIGHTED_DISTANCE_X_WEIGHT: f32 = 5.0;
15
- const WEIGHTED_DISTANCE_Y_WEIGHT: f32 = 1.0;
16
- const KMEANS_MAX_ITERATIONS: usize = 100;
17
- const KMEANS_CONVERGENCE_THRESHOLD: f32 = 0.01;
18
15
  const MERGE_INTERSECTION_THRESHOLD: f32 = 0.05;
19
16
  const MERGE_X_THRESHOLD_MULTIPLIER: f32 = 2.0;
20
17
  const MERGE_Y_THRESHOLD_MULTIPLIER: f32 = 1.5;
21
18
 
22
- /// A bounding box for text or elements.
23
- #[derive(Debug, Clone, Copy, PartialEq)]
24
- pub struct BoundingBox {
25
- /// Left x-coordinate
26
- pub left: f32,
27
- /// Top y-coordinate
28
- pub top: f32,
29
- /// Right x-coordinate
30
- pub right: f32,
31
- /// Bottom y-coordinate
32
- pub bottom: f32,
33
- }
34
-
35
- impl BoundingBox {
36
- /// Calculate the Intersection over Union (IOU) between this bounding box and another.
37
- ///
38
- /// IOU = intersection_area / union_area
39
- ///
40
- /// # Arguments
41
- ///
42
- /// * `other` - The other bounding box to compare with
43
- ///
44
- /// # Returns
45
- ///
46
- /// The IOU value between 0.0 and 1.0
47
- pub fn iou(&self, other: &BoundingBox) -> f32 {
48
- let intersection_area = self.calculate_intersection_area(other);
49
- let self_area = self.calculate_area();
50
- let other_area = other.calculate_area();
51
- let union_area = self_area + other_area - intersection_area;
52
-
53
- if union_area <= 0.0 {
54
- 0.0
55
- } else {
56
- intersection_area / union_area
57
- }
58
- }
59
-
60
- /// Calculate the weighted distance between the centers of two bounding boxes.
61
- ///
62
- /// The distance is weighted with X-axis having weight 5.0 and Y-axis having weight 1.0.
63
- /// This reflects the greater importance of horizontal distance in text layout.
64
- ///
65
- /// # Arguments
66
- ///
67
- /// * `other` - The other bounding box to compare with
68
- ///
69
- /// # Returns
70
- ///
71
- /// The weighted distance value
72
- pub fn weighted_distance(&self, other: &BoundingBox) -> f32 {
73
- let (self_center_x, self_center_y) = self.center();
74
- let (other_center_x, other_center_y) = other.center();
75
-
76
- let dx = (self_center_x - other_center_x).abs();
77
- let dy = (self_center_y - other_center_y).abs();
78
-
79
- dx * WEIGHTED_DISTANCE_X_WEIGHT + dy * WEIGHTED_DISTANCE_Y_WEIGHT
80
- }
81
-
82
- /// Calculate the intersection ratio relative to this bounding box's area.
83
- ///
84
- /// intersection_ratio = intersection_area / self_area
85
- ///
86
- /// # Arguments
87
- ///
88
- /// * `other` - The other bounding box to compare with
89
- ///
90
- /// # Returns
91
- ///
92
- /// The intersection ratio between 0.0 and 1.0
93
- pub fn intersection_ratio(&self, other: &BoundingBox) -> f32 {
94
- let intersection_area = self.calculate_intersection_area(other);
95
- let self_area = self.calculate_area();
96
-
97
- if self_area <= 0.0 {
98
- 0.0
99
- } else {
100
- intersection_area / self_area
101
- }
102
- }
103
-
104
- /// Check if this bounding box contains another bounding box.
105
- pub fn contains(&self, other: &BoundingBox) -> bool {
106
- other.left >= self.left && other.right <= self.right && other.top >= self.top && other.bottom <= self.bottom
107
- }
108
-
109
- /// Calculate the center coordinates of this bounding box.
110
- pub fn center(&self) -> (f32, f32) {
111
- ((self.left + self.right) / 2.0, (self.top + self.bottom) / 2.0)
112
- }
113
-
114
- /// Merge this bounding box with another, creating a box that contains both.
115
- pub fn merge(&self, other: &BoundingBox) -> BoundingBox {
116
- BoundingBox {
117
- left: self.left.min(other.left),
118
- top: self.top.min(other.top),
119
- right: self.right.max(other.right),
120
- bottom: self.bottom.max(other.bottom),
121
- }
122
- }
123
-
124
- /// Calculate a relaxed IOU with an expansion factor.
125
- pub fn relaxed_iou(&self, other: &BoundingBox, relaxation: f32) -> f32 {
126
- let self_width = self.right - self.left;
127
- let self_height = self.bottom - self.top;
128
- let self_expansion = relaxation * self_width.min(self_height).max(0.0);
129
-
130
- let other_width = other.right - other.left;
131
- let other_height = other.bottom - other.top;
132
- let other_expansion = relaxation * other_width.min(other_height).max(0.0);
133
-
134
- let expanded_self = BoundingBox {
135
- left: (self.left - self_expansion).max(0.0),
136
- top: (self.top - self_expansion).max(0.0),
137
- right: self.right + self_expansion,
138
- bottom: self.bottom + self_expansion,
139
- };
140
-
141
- let expanded_other = BoundingBox {
142
- left: (other.left - other_expansion).max(0.0),
143
- top: (other.top - other_expansion).max(0.0),
144
- right: other.right + other_expansion,
145
- bottom: other.bottom + other_expansion,
146
- };
147
-
148
- expanded_self.iou(&expanded_other)
149
- }
150
-
151
- /// Calculate the area of this bounding box.
152
- fn calculate_area(&self) -> f32 {
153
- let width = (self.right - self.left).max(0.0);
154
- let height = (self.bottom - self.top).max(0.0);
155
- width * height
156
- }
157
-
158
- /// Calculate the intersection area between this bounding box and another.
159
- fn calculate_intersection_area(&self, other: &BoundingBox) -> f32 {
160
- let left = self.left.max(other.left);
161
- let top = self.top.max(other.top);
162
- let right = self.right.min(other.right);
163
- let bottom = self.bottom.min(other.bottom);
164
-
165
- let width = (right - left).max(0.0);
166
- let height = (bottom - top).max(0.0);
167
- width * height
168
- }
169
- }
170
-
171
19
  /// Character information extracted from PDF with font metrics.
172
20
  #[derive(Debug, Clone)]
173
21
  pub struct CharData {
@@ -196,15 +44,6 @@ pub struct TextBlock {
196
44
  pub font_size: f32,
197
45
  }
198
46
 
199
- /// A cluster of text blocks with the same font size characteristics.
200
- #[derive(Debug, Clone)]
201
- pub struct FontSizeCluster {
202
- /// The centroid (mean) font size of this cluster
203
- pub centroid: f32,
204
- /// The text blocks that belong to this cluster
205
- pub members: Vec<TextBlock>,
206
- }
207
-
208
47
  /// Result of KMeans clustering on font sizes.
209
48
  ///
210
49
  /// Contains cluster labels for each block, where cluster index indicates
@@ -401,185 +240,6 @@ pub fn assign_hierarchy_levels_from_clusters(
401
240
  result
402
241
  }
403
242
 
404
- /// Cluster text blocks by font size using k-means algorithm.
405
- ///
406
- /// Uses k-means clustering to group text blocks by their font size, which helps
407
- /// identify document hierarchy levels (H1, H2, Body, etc.). The algorithm:
408
- /// 1. Extracts font sizes from text blocks
409
- /// 2. Applies k-means clustering to group similar font sizes
410
- /// 3. Sorts clusters by centroid size in descending order (largest = H1)
411
- /// 4. Returns clusters with their member blocks
412
- ///
413
- /// # Arguments
414
- ///
415
- /// * `blocks` - Slice of TextBlock objects to cluster
416
- /// * `k` - Number of clusters to create
417
- ///
418
- /// # Returns
419
- ///
420
- /// Result with vector of FontSizeCluster ordered by size (descending),
421
- /// or an error if clustering fails
422
- ///
423
- /// # Example
424
- ///
425
- /// ```rust,no_run
426
- /// # #[cfg(feature = "pdf")]
427
- /// # {
428
- /// use kreuzberg::pdf::hierarchy::{TextBlock, BoundingBox, cluster_font_sizes};
429
- ///
430
- /// let blocks = vec![
431
- /// TextBlock {
432
- /// text: "Title".to_string(),
433
- /// bbox: BoundingBox { left: 0.0, top: 0.0, right: 100.0, bottom: 24.0 },
434
- /// font_size: 24.0,
435
- /// },
436
- /// TextBlock {
437
- /// text: "Body".to_string(),
438
- /// bbox: BoundingBox { left: 0.0, top: 30.0, right: 100.0, bottom: 42.0 },
439
- /// font_size: 12.0,
440
- /// },
441
- /// ];
442
- ///
443
- /// let clusters = cluster_font_sizes(&blocks, 2).unwrap();
444
- /// assert_eq!(clusters.len(), 2);
445
- /// assert_eq!(clusters[0].centroid, 24.0); // Largest is first
446
- /// # }
447
- /// ```
448
- /// Helper function to assign blocks to their nearest centroid.
449
- ///
450
- /// Iterates through blocks and finds the closest centroid for each block,
451
- /// grouping them into clusters. Used in k-means clustering iterations.
452
- ///
453
- /// # Arguments
454
- ///
455
- /// * `blocks` - Slice of TextBlock objects to assign
456
- /// * `centroids` - Slice of centroid values (one per cluster)
457
- ///
458
- /// # Returns
459
- ///
460
- /// A vector of clusters, where each cluster contains the TextBlock objects
461
- /// assigned to that centroid
462
- fn assign_blocks_to_centroids(blocks: &[TextBlock], centroids: &[f32]) -> Vec<Vec<TextBlock>> {
463
- let mut clusters: Vec<Vec<TextBlock>> = vec![Vec::new(); centroids.len()];
464
-
465
- for block in blocks {
466
- let mut min_distance = f32::INFINITY;
467
- let mut best_cluster = 0;
468
-
469
- for (i, &centroid) in centroids.iter().enumerate() {
470
- let distance = (block.font_size - centroid).abs();
471
- if distance < min_distance {
472
- min_distance = distance;
473
- best_cluster = i;
474
- }
475
- }
476
-
477
- clusters[best_cluster].push(block.clone());
478
- }
479
-
480
- clusters
481
- }
482
-
483
- pub fn cluster_font_sizes(blocks: &[TextBlock], k: usize) -> Result<Vec<FontSizeCluster>> {
484
- if blocks.is_empty() {
485
- return Ok(Vec::new());
486
- }
487
-
488
- if k == 0 {
489
- return Err(PdfError::TextExtractionFailed("K must be greater than 0".to_string()));
490
- }
491
-
492
- let actual_k = k.min(blocks.len());
493
-
494
- // Extract unique font sizes for initialization
495
- let mut font_sizes: Vec<f32> = blocks.iter().map(|b| b.font_size).collect();
496
- font_sizes.sort_by(|a, b| b.partial_cmp(a).expect("Failed to compare font sizes during sorting")); // Sort descending
497
- font_sizes.dedup(); // Remove duplicates to get unique font sizes
498
-
499
- // Initialize centroids using actual font sizes from the data
500
- // This is more robust than dividing the range uniformly
501
- let mut centroids: Vec<f32> = Vec::new();
502
-
503
- if font_sizes.len() >= actual_k {
504
- // If we have at least k unique font sizes, pick them evenly spaced
505
- let step = font_sizes.len() / actual_k;
506
- for i in 0..actual_k {
507
- let idx = i * step;
508
- centroids.push(font_sizes[idx.min(font_sizes.len() - 1)]);
509
- }
510
- } else {
511
- // If we have fewer unique sizes than k, use all of them and fill with interpolated values
512
- centroids = font_sizes.clone();
513
-
514
- // Add interpolated centroids between existing ones to reach desired k
515
- let min_font = font_sizes[font_sizes.len() - 1];
516
- let max_font = font_sizes[0];
517
- let range = max_font - min_font;
518
-
519
- while centroids.len() < actual_k {
520
- let t = centroids.len() as f32 / (actual_k - 1) as f32;
521
- let interpolated = max_font - t * range;
522
- centroids.push(interpolated);
523
- }
524
-
525
- centroids.sort_by(|a, b| b.partial_cmp(a).expect("Failed to compare centroids during sorting"));
526
- // Keep sorted descending
527
- }
528
-
529
- // Run k-means clustering for a fixed number of iterations
530
- for _ in 0..KMEANS_MAX_ITERATIONS {
531
- // Assign blocks to nearest centroid
532
- let clusters = assign_blocks_to_centroids(blocks, &centroids);
533
-
534
- // Update centroids
535
- let mut new_centroids = Vec::with_capacity(actual_k);
536
- for (i, cluster) in clusters.iter().enumerate() {
537
- if !cluster.is_empty() {
538
- new_centroids.push(cluster.iter().map(|b| b.font_size).sum::<f32>() / cluster.len() as f32);
539
- } else {
540
- new_centroids.push(centroids[i]);
541
- }
542
- }
543
-
544
- // Check for convergence
545
- let converged = centroids
546
- .iter()
547
- .zip(new_centroids.iter())
548
- .all(|(old, new)| (old - new).abs() < KMEANS_CONVERGENCE_THRESHOLD);
549
-
550
- std::mem::swap(&mut centroids, &mut new_centroids);
551
-
552
- if converged {
553
- break;
554
- }
555
- }
556
-
557
- // Final assignment to create result
558
- let clusters = assign_blocks_to_centroids(blocks, &centroids);
559
-
560
- // Create FontSizeCluster objects with centroids
561
- let mut result: Vec<FontSizeCluster> = Vec::new();
562
-
563
- for i in 0..actual_k {
564
- if !clusters[i].is_empty() {
565
- let centroid_value = centroids[i];
566
- result.push(FontSizeCluster {
567
- centroid: centroid_value,
568
- members: clusters[i].clone(),
569
- });
570
- }
571
- }
572
-
573
- // Sort by centroid size in descending order (largest font = H1)
574
- result.sort_by(|a, b| {
575
- b.centroid
576
- .partial_cmp(&a.centroid)
577
- .expect("Failed to compare centroids during final sort")
578
- });
579
-
580
- Ok(result)
581
- }
582
-
583
243
  /// Extract characters with fonts from a PDF page.
584
244
  ///
585
245
  /// Iterates through all characters on a page, extracting text, position,
@@ -0,0 +1,18 @@
1
+ //! PDF text hierarchy extraction using pdfium character positions.
2
+ //!
3
+ //! This module provides functions for extracting character information from PDFs,
4
+ //! preserving font size and position data for text hierarchy analysis.
5
+ //!
6
+ //! Note: Requires the "pdf" feature to be enabled.
7
+
8
+ mod bounding_box;
9
+ mod clustering;
10
+ mod extraction;
11
+
12
+ // Re-export all public types and functions for backward compatibility
13
+ pub use bounding_box::BoundingBox;
14
+ pub use clustering::{FontSizeCluster, cluster_font_sizes};
15
+ pub use extraction::{
16
+ CharData, HierarchyBlock, HierarchyLevel, KMeansResult, TextBlock, assign_hierarchy_levels,
17
+ assign_hierarchy_levels_from_clusters, extract_chars_with_fonts, merge_chars_into_blocks, should_trigger_ocr,
18
+ };