kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -1,233 +1,31 @@
1
1
  //! PDF document extractor.
2
+ //!
3
+ //! Provides extraction of text, metadata, tables, and images from PDF documents
4
+ //! using pypdfium2 and playa-pdf. Supports both native text extraction and OCR fallback.
5
+
6
+ mod extraction;
7
+ mod ocr;
8
+ mod pages;
2
9
 
3
10
  use crate::Result;
4
11
  use crate::core::config::ExtractionConfig;
5
12
  use crate::plugins::{DocumentExtractor, Plugin};
6
- use crate::types::{ExtractionResult, Metadata, PageContent};
13
+ use crate::types::{ExtractionResult, Metadata};
7
14
  use async_trait::async_trait;
8
15
  #[cfg(feature = "tokio-runtime")]
9
16
  use std::path::Path;
10
17
 
11
18
  #[cfg(feature = "pdf")]
12
19
  use crate::pdf::error::PdfError;
13
- #[cfg(feature = "ocr")]
14
- use crate::pdf::rendering::{PageRenderOptions, PdfRenderer};
15
- #[cfg(feature = "pdf")]
16
- use crate::types::Table;
17
- #[cfg(feature = "pdf")]
18
- use pdfium_render::prelude::*;
19
-
20
- #[cfg(feature = "pdf")]
21
- type PdfExtractionPhaseResult = (
22
- crate::pdf::metadata::PdfExtractionMetadata,
23
- String,
24
- Vec<Table>,
25
- Option<Vec<PageContent>>,
26
- );
27
20
 
21
+ // Re-export for backward compatibility
28
22
  #[cfg(feature = "ocr")]
29
- const MIN_TOTAL_NON_WHITESPACE: usize = 64;
30
- #[cfg(feature = "ocr")]
31
- const MIN_NON_WHITESPACE_PER_PAGE: f64 = 32.0;
32
- #[cfg(feature = "ocr")]
33
- const MIN_MEANINGFUL_WORD_LEN: usize = 4;
34
- #[cfg(feature = "ocr")]
35
- const MIN_MEANINGFUL_WORDS: usize = 3;
36
- #[cfg(feature = "ocr")]
37
- const MIN_ALNUM_RATIO: f64 = 0.3;
38
-
39
- #[cfg(feature = "ocr")]
40
- struct NativeTextStats {
41
- non_whitespace: usize,
42
- alnum: usize,
43
- meaningful_words: usize,
44
- alnum_ratio: f64,
45
- }
46
-
47
- #[cfg(feature = "ocr")]
48
- struct OcrFallbackDecision {
49
- stats: NativeTextStats,
50
- avg_non_whitespace: f64,
51
- avg_alnum: f64,
52
- fallback: bool,
53
- }
54
-
55
- #[cfg(feature = "ocr")]
56
- impl NativeTextStats {
57
- fn from(text: &str) -> Self {
58
- let mut non_whitespace = 0usize;
59
- let mut alnum = 0usize;
60
-
61
- for ch in text.chars() {
62
- if !ch.is_whitespace() {
63
- non_whitespace += 1;
64
- if ch.is_alphanumeric() {
65
- alnum += 1;
66
- }
67
- }
68
- }
69
-
70
- let meaningful_words = text
71
- .split_whitespace()
72
- .filter(|word| {
73
- word.chars()
74
- .filter(|c| c.is_alphanumeric())
75
- .take(MIN_MEANINGFUL_WORD_LEN)
76
- .count()
77
- >= MIN_MEANINGFUL_WORD_LEN
78
- })
79
- .take(MIN_MEANINGFUL_WORDS)
80
- .count();
81
-
82
- let alnum_ratio = if non_whitespace == 0 {
83
- 0.0
84
- } else {
85
- alnum as f64 / non_whitespace as f64
86
- };
87
-
88
- Self {
89
- non_whitespace,
90
- alnum,
91
- meaningful_words,
92
- alnum_ratio,
93
- }
94
- }
95
- }
23
+ pub use ocr::{NativeTextStats, OcrFallbackDecision, evaluate_native_text_for_ocr};
96
24
 
25
+ use extraction::extract_all_from_document;
97
26
  #[cfg(feature = "ocr")]
98
- fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>) -> OcrFallbackDecision {
99
- let trimmed = native_text.trim();
100
-
101
- if trimmed.is_empty() {
102
- let empty_stats = NativeTextStats {
103
- non_whitespace: 0,
104
- alnum: 0,
105
- meaningful_words: 0,
106
- alnum_ratio: 0.0,
107
- };
108
- return OcrFallbackDecision {
109
- stats: empty_stats,
110
- avg_non_whitespace: 0.0,
111
- avg_alnum: 0.0,
112
- fallback: true,
113
- };
114
- }
115
-
116
- let stats = NativeTextStats::from(trimmed);
117
- let pages = page_count.unwrap_or(1).max(1) as f64;
118
- let avg_non_whitespace = stats.non_whitespace as f64 / pages;
119
- let avg_alnum = stats.alnum as f64 / pages;
120
-
121
- let has_substantial_text = stats.non_whitespace >= MIN_TOTAL_NON_WHITESPACE
122
- && avg_non_whitespace >= MIN_NON_WHITESPACE_PER_PAGE
123
- && stats.meaningful_words >= MIN_MEANINGFUL_WORDS;
124
-
125
- let fallback = if stats.non_whitespace == 0 || stats.alnum == 0 {
126
- true
127
- } else if has_substantial_text {
128
- false
129
- } else if (stats.alnum_ratio < MIN_ALNUM_RATIO && avg_alnum < MIN_NON_WHITESPACE_PER_PAGE)
130
- || (stats.non_whitespace < MIN_TOTAL_NON_WHITESPACE && avg_non_whitespace < MIN_NON_WHITESPACE_PER_PAGE)
131
- {
132
- true
133
- } else {
134
- stats.meaningful_words == 0 && avg_non_whitespace < MIN_NON_WHITESPACE_PER_PAGE
135
- };
136
-
137
- OcrFallbackDecision {
138
- stats,
139
- avg_non_whitespace,
140
- avg_alnum,
141
- fallback,
142
- }
143
- }
144
-
145
- /// Extract tables from PDF document using native text positions.
146
- ///
147
- /// This function converts PDF character positions to HocrWord format,
148
- /// then uses the existing table reconstruction logic to detect tables.
149
- ///
150
- /// Uses the shared PdfDocument reference (wrapped in Arc<RwLock<>> for thread-safety).
151
- #[cfg(all(feature = "pdf", feature = "ocr"))]
152
- fn extract_tables_from_document(
153
- document: &PdfDocument,
154
- _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
155
- ) -> Result<Vec<Table>> {
156
- use crate::ocr::table::{reconstruct_table, table_to_markdown};
157
- use crate::pdf::table::extract_words_from_page;
158
-
159
- let mut all_tables = Vec::new();
160
-
161
- for (page_index, page) in document.pages().iter().enumerate() {
162
- let words = extract_words_from_page(&page, 0.0)?;
163
-
164
- if words.is_empty() {
165
- continue;
166
- }
167
-
168
- let column_threshold = 50;
169
- let row_threshold_ratio = 0.5;
170
-
171
- let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio);
172
-
173
- if !table_cells.is_empty() {
174
- let markdown = table_to_markdown(&table_cells);
175
-
176
- all_tables.push(Table {
177
- cells: table_cells,
178
- markdown,
179
- page_number: page_index + 1,
180
- });
181
- }
182
- }
183
-
184
- Ok(all_tables)
185
- }
186
-
187
- /// Fallback for when OCR feature is not enabled - returns empty tables.
188
- #[cfg(all(feature = "pdf", not(feature = "ocr")))]
189
- fn extract_tables_from_document(
190
- _document: &PdfDocument,
191
- _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
192
- ) -> Result<Vec<crate::types::Table>> {
193
- Ok(vec![])
194
- }
195
-
196
- /// Helper function to assign tables and images to pages.
197
- ///
198
- /// If page_contents is None, returns None (no per-page tracking enabled).
199
- /// Otherwise, iterates through tables and images, assigning them to pages based on page_number.
200
- ///
201
- /// # Performance
202
- ///
203
- /// Uses Arc::new to wrap tables and images, avoiding expensive copies.
204
- /// This reduces memory overhead by enabling zero-copy sharing of table/image data
205
- /// across multiple references (e.g., when the same table appears on multiple pages).
206
- fn assign_tables_and_images_to_pages(
207
- mut page_contents: Option<Vec<PageContent>>,
208
- tables: &[crate::types::Table],
209
- images: &[crate::types::ExtractedImage],
210
- ) -> Option<Vec<PageContent>> {
211
- let pages = page_contents.take()?;
212
-
213
- let mut updated_pages = pages;
214
-
215
- for table in tables {
216
- if let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == table.page_number) {
217
- page.tables.push(std::sync::Arc::new(table.clone()));
218
- }
219
- }
220
-
221
- for image in images {
222
- if let Some(page_num) = image.page_number
223
- && let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == page_num)
224
- {
225
- page.images.push(std::sync::Arc::new(image.clone()));
226
- }
227
- }
228
-
229
- Some(updated_pages)
230
- }
27
+ use ocr::extract_with_ocr;
28
+ use pages::assign_tables_and_images_to_pages;
231
29
 
232
30
  /// PDF document extractor using pypdfium2 and playa-pdf.
233
31
  pub struct PdfExtractor;
@@ -242,105 +40,6 @@ impl PdfExtractor {
242
40
  pub fn new() -> Self {
243
41
  Self
244
42
  }
245
-
246
- /// Extract text, metadata, and tables from a PDF document using a single shared instance.
247
- ///
248
- /// This method consolidates all PDF extraction phases (text, metadata, tables) into a single
249
- /// operation using a single PdfDocument instance. This avoids redundant document parsing
250
- /// and pdfium initialization overhead.
251
- ///
252
- /// # Performance
253
- ///
254
- /// By reusing a single document instance across all extraction phases, we eliminate:
255
- /// - Duplicate document parsing overhead (25-40ms saved)
256
- /// - Redundant pdfium bindings initialization
257
- /// - Multiple page tree traversals
258
- ///
259
- /// Expected improvement: 20-30% faster PDF processing.
260
- ///
261
- /// # Returns
262
- ///
263
- /// A tuple containing:
264
- /// - PDF metadata (title, authors, dates, page structure, etc.)
265
- /// - Native extracted text (or empty if using OCR)
266
- /// - Extracted tables (if OCR feature enabled)
267
- /// - Per-page content (if page extraction configured)
268
- #[cfg(feature = "pdf")]
269
- fn extract_all_from_document(
270
- document: &PdfDocument,
271
- config: &ExtractionConfig,
272
- ) -> Result<PdfExtractionPhaseResult> {
273
- let (native_text, _boundaries, page_contents, pdf_metadata) =
274
- crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
275
-
276
- let tables = extract_tables_from_document(document, &pdf_metadata)?;
277
-
278
- Ok((pdf_metadata, native_text, tables, page_contents))
279
- }
280
-
281
- /// Extract text from PDF using OCR.
282
- ///
283
- /// Renders all pages to images and processes them with OCR.
284
- #[cfg(feature = "ocr")]
285
- async fn extract_with_ocr(&self, content: &[u8], config: &ExtractionConfig) -> Result<String> {
286
- use crate::plugins::registry::get_ocr_backend_registry;
287
- use image::ImageEncoder;
288
- use image::codecs::png::PngEncoder;
289
- use std::io::Cursor;
290
-
291
- let ocr_config = config.ocr.as_ref().ok_or_else(|| crate::KreuzbergError::Parsing {
292
- message: "OCR config required for force_ocr".to_string(),
293
- source: None,
294
- })?;
295
-
296
- let backend = {
297
- let registry = get_ocr_backend_registry();
298
- let registry = registry.read().map_err(|e| crate::KreuzbergError::Plugin {
299
- message: format!("Failed to acquire read lock on OCR backend registry: {}", e),
300
- plugin_name: "ocr-registry".to_string(),
301
- })?;
302
- registry.get(&ocr_config.backend)?
303
- };
304
-
305
- let images = {
306
- let render_options = PageRenderOptions::default();
307
- let renderer = PdfRenderer::new().map_err(|e| crate::KreuzbergError::Parsing {
308
- message: format!("Failed to initialize PDF renderer: {}", e),
309
- source: None,
310
- })?;
311
-
312
- renderer
313
- .render_all_pages(content, &render_options)
314
- .map_err(|e| crate::KreuzbergError::Parsing {
315
- message: format!("Failed to render PDF pages: {}", e),
316
- source: None,
317
- })?
318
- };
319
-
320
- let mut page_texts = Vec::with_capacity(images.len());
321
-
322
- for image in images {
323
- let rgb_image = image.to_rgb8();
324
- let (width, height) = rgb_image.dimensions();
325
-
326
- let mut image_bytes = Cursor::new(Vec::new());
327
- let encoder = PngEncoder::new(&mut image_bytes);
328
- encoder
329
- .write_image(&rgb_image, width, height, image::ColorType::Rgb8.into())
330
- .map_err(|e| crate::KreuzbergError::Parsing {
331
- message: format!("Failed to encode image: {}", e),
332
- source: None,
333
- })?;
334
-
335
- let image_data = image_bytes.into_inner();
336
-
337
- let ocr_result = backend.process_image(&image_data, ocr_config).await?;
338
-
339
- page_texts.push(ocr_result.content);
340
- }
341
-
342
- Ok(page_texts.join("\n\n"))
343
- }
344
43
  }
345
44
 
346
45
  impl Plugin for PdfExtractor {
@@ -404,7 +103,7 @@ impl DocumentExtractor for PdfExtractor {
404
103
  }
405
104
  })?;
406
105
 
407
- Self::extract_all_from_document(&document, config)?
106
+ extract_all_from_document(&document, config)?
408
107
  }
409
108
  #[cfg(all(not(target_arch = "wasm32"), feature = "tokio-runtime"))]
410
109
  {
@@ -428,7 +127,7 @@ impl DocumentExtractor for PdfExtractor {
428
127
  })?;
429
128
 
430
129
  let (pdf_metadata, native_text, tables, page_contents) =
431
- Self::extract_all_from_document(&document, &config_owned)?;
130
+ extract_all_from_document(&document, &config_owned)?;
432
131
 
433
132
  if let Some(page_cfg) = config_owned.pages.as_ref()
434
133
  && page_cfg.extract_pages
@@ -458,7 +157,7 @@ impl DocumentExtractor for PdfExtractor {
458
157
  }
459
158
  })?;
460
159
 
461
- Self::extract_all_from_document(&document, config)?
160
+ extract_all_from_document(&document, config)?
462
161
  }
463
162
  }
464
163
  #[cfg(all(not(target_arch = "wasm32"), not(feature = "tokio-runtime")))]
@@ -475,19 +174,19 @@ impl DocumentExtractor for PdfExtractor {
475
174
  }
476
175
  })?;
477
176
 
478
- Self::extract_all_from_document(&document, config)?
177
+ extract_all_from_document(&document, config)?
479
178
  }
480
179
  };
481
180
 
482
181
  #[cfg(feature = "ocr")]
483
182
  let text = if config.force_ocr {
484
183
  if config.ocr.is_some() {
485
- self.extract_with_ocr(content, config).await?
184
+ extract_with_ocr(content, config).await?
486
185
  } else {
487
186
  native_text
488
187
  }
489
188
  } else if config.ocr.is_some() {
490
- let decision = evaluate_native_text_for_ocr(&native_text, None);
189
+ let decision = ocr::evaluate_native_text_for_ocr(&native_text, None);
491
190
 
492
191
  if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
493
192
  eprintln!(
@@ -504,7 +203,7 @@ impl DocumentExtractor for PdfExtractor {
504
203
  }
505
204
 
506
205
  if decision.fallback {
507
- self.extract_with_ocr(content, config).await?
206
+ extract_with_ocr(content, config).await?
508
207
  } else {
509
208
  native_text
510
209
  }
@@ -593,6 +292,8 @@ impl DocumentExtractor for PdfExtractor {
593
292
  detected_languages: None,
594
293
  chunks: None,
595
294
  images,
295
+ djot_content: None,
296
+ elements: None,
596
297
  })
597
298
  }
598
299
 
@@ -640,21 +341,21 @@ mod tests {
640
341
  #[cfg(feature = "ocr")]
641
342
  #[test]
642
343
  fn test_should_fallback_to_ocr_for_empty_text() {
643
- assert!(evaluate_native_text_for_ocr("", Some(1)).fallback);
344
+ assert!(ocr::evaluate_native_text_for_ocr("", Some(1)).fallback);
644
345
  }
645
346
 
646
347
  #[cfg(feature = "ocr")]
647
348
  #[test]
648
349
  fn test_should_not_fallback_for_meaningful_text() {
649
350
  let sample = "This page has searchable vector text and should avoid OCR.";
650
- assert!(!evaluate_native_text_for_ocr(sample, Some(1)).fallback);
351
+ assert!(!ocr::evaluate_native_text_for_ocr(sample, Some(1)).fallback);
651
352
  }
652
353
 
653
354
  #[cfg(feature = "ocr")]
654
355
  #[test]
655
356
  fn test_should_fallback_for_punctuation_only_text() {
656
357
  let sample = " . , ; : -- -- ";
657
- assert!(evaluate_native_text_for_ocr(sample, Some(2)).fallback);
358
+ assert!(ocr::evaluate_native_text_for_ocr(sample, Some(2)).fallback);
658
359
  }
659
360
 
660
361
  #[tokio::test]
@@ -0,0 +1,214 @@
1
+ //! OCR functionality for PDF extraction.
2
+ //!
3
+ //! Handles text quality evaluation, OCR fallback decision logic, and OCR processing.
4
+
5
+ #[cfg(feature = "ocr")]
6
+ use crate::core::config::ExtractionConfig;
7
+
8
/// Total number of non-whitespace characters native text needs before it can be
/// treated as substantial.
#[cfg(feature = "ocr")]
pub(crate) const MIN_TOTAL_NON_WHITESPACE: usize = 64;

/// Per-page average character count below which native text is considered sparse.
///
/// Compared against both the average non-whitespace count and the average
/// alphanumeric count.
#[cfg(feature = "ocr")]
pub(crate) const MIN_NON_WHITESPACE_PER_PAGE: f64 = 32.0;

/// Number of alphanumeric characters a whitespace-delimited word must contain to
/// count as meaningful.
#[cfg(feature = "ocr")]
pub(crate) const MIN_MEANINGFUL_WORD_LEN: usize = 4;

/// Number of meaningful words required for text to be treated as substantial.
///
/// Word counting stops once this many meaningful words have been found.
#[cfg(feature = "ocr")]
pub(crate) const MIN_MEANINGFUL_WORDS: usize = 3;

/// Alphanumeric-to-non-whitespace ratio below which text is treated as mostly symbols.
#[cfg(feature = "ocr")]
pub(crate) const MIN_ALNUM_RATIO: f64 = 0.3;
18
+
19
/// Character and word statistics gathered from natively extracted PDF text.
#[cfg(feature = "ocr")]
#[derive(Debug, Clone, PartialEq)]
pub struct NativeTextStats {
    /// Number of characters that are not whitespace.
    pub non_whitespace: usize,
    /// Number of alphanumeric characters.
    pub alnum: usize,
    /// Number of whitespace-delimited words holding at least
    /// `MIN_MEANINGFUL_WORD_LEN` alphanumeric characters; counting stops at
    /// `MIN_MEANINGFUL_WORDS`.
    pub meaningful_words: usize,
    /// `alnum / non_whitespace`, or `0.0` when there are no non-whitespace characters.
    pub alnum_ratio: f64,
}

/// Outcome of evaluating native text quality, together with the numbers behind it.
#[cfg(feature = "ocr")]
#[derive(Debug, Clone, PartialEq)]
pub struct OcrFallbackDecision {
    /// Statistics computed over the whole (trimmed) text.
    pub stats: NativeTextStats,
    /// Non-whitespace characters divided by the page count.
    pub avg_non_whitespace: f64,
    /// Alphanumeric characters divided by the page count.
    pub avg_alnum: f64,
    /// `true` when OCR should be used instead of the native text.
    pub fallback: bool,
}
34
+
35
#[cfg(feature = "ocr")]
impl NativeTextStats {
    /// Computes character and word statistics for `text`.
    ///
    /// Counts non-whitespace and alphanumeric characters, counts words that hold at
    /// least `MIN_MEANINGFUL_WORD_LEN` alphanumeric characters (stopping once
    /// `MIN_MEANINGFUL_WORDS` have been seen), and derives the alphanumeric ratio.
    pub fn from(text: &str) -> Self {
        // Single pass over the visible characters, tallying both counters together.
        let (non_whitespace, alnum) = text
            .chars()
            .filter(|c| !c.is_whitespace())
            .fold((0usize, 0usize), |(visible, alnum_seen), c| {
                (visible + 1, alnum_seen + usize::from(c.is_alphanumeric()))
            });

        // `take` bounds the scan, so long words are not walked to the end.
        let is_meaningful = |word: &&str| {
            let alnum_chars = word
                .chars()
                .filter(|c| c.is_alphanumeric())
                .take(MIN_MEANINGFUL_WORD_LEN)
                .count();
            alnum_chars >= MIN_MEANINGFUL_WORD_LEN
        };
        let meaningful_words = text
            .split_whitespace()
            .filter(is_meaningful)
            .take(MIN_MEANINGFUL_WORDS)
            .count();

        let alnum_ratio = match non_whitespace {
            0 => 0.0,
            total => alnum as f64 / total as f64,
        };

        Self {
            non_whitespace,
            alnum,
            meaningful_words,
            alnum_ratio,
        }
    }
}
76
+
77
/// Decides whether natively extracted PDF text is good enough or OCR should run.
///
/// The text is trimmed first; empty text always falls back to OCR. Otherwise the
/// decision is based on the total and per-page amount of non-whitespace text, the
/// share of alphanumeric characters, and the number of meaningful words.
///
/// # Arguments
///
/// * `native_text` - The text extracted from the PDF using native methods
/// * `page_count` - Optional page count for per-page averages; `None` and `0` are
///   both treated as a single page
///
/// # Returns
///
/// An `OcrFallbackDecision` holding the text statistics, the per-page averages,
/// and the `fallback` flag (`true` means OCR should be used).
#[cfg(feature = "ocr")]
pub fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>) -> OcrFallbackDecision {
    let text = native_text.trim();

    if text.is_empty() {
        return OcrFallbackDecision {
            stats: NativeTextStats {
                non_whitespace: 0,
                alnum: 0,
                meaningful_words: 0,
                alnum_ratio: 0.0,
            },
            avg_non_whitespace: 0.0,
            avg_alnum: 0.0,
            fallback: true,
        };
    }

    let stats = NativeTextStats::from(text);
    let page_divisor = page_count.map_or(1, |count| count.max(1)) as f64;
    let avg_non_whitespace = stats.non_whitespace as f64 / page_divisor;
    let avg_alnum = stats.alnum as f64 / page_divisor;

    // Nothing legible at all: OCR regardless of the other signals.
    let nothing_readable = stats.non_whitespace == 0 || stats.alnum == 0;

    // Enough text overall, per page, and in real words: trust the native text.
    let substantial = stats.non_whitespace >= MIN_TOTAL_NON_WHITESPACE
        && avg_non_whitespace >= MIN_NON_WHITESPACE_PER_PAGE
        && stats.meaningful_words >= MIN_MEANINGFUL_WORDS;

    let sparse_per_page = avg_non_whitespace < MIN_NON_WHITESPACE_PER_PAGE;
    let mostly_symbols = stats.alnum_ratio < MIN_ALNUM_RATIO && avg_alnum < MIN_NON_WHITESPACE_PER_PAGE;
    let too_little_text = stats.non_whitespace < MIN_TOTAL_NON_WHITESPACE && sparse_per_page;
    let no_words_and_sparse = stats.meaningful_words == 0 && sparse_per_page;

    let fallback =
        nothing_readable || (!substantial && (mostly_symbols || too_little_text || no_words_and_sparse));

    OcrFallbackDecision {
        stats,
        avg_non_whitespace,
        avg_alnum,
        fallback,
    }
}
141
+
142
/// Runs OCR over every page of a PDF and returns the recognized text.
///
/// All pages are rendered with the default `PageRenderOptions`, converted to RGB8,
/// PNG-encoded, and handed one at a time to the OCR backend named in the OCR config.
///
/// # Arguments
///
/// * `content` - Raw PDF bytes
/// * `config` - Extraction configuration; `config.ocr` must be set
///
/// # Returns
///
/// The text of all pages in order, joined by a blank line (`"\n\n"`).
///
/// # Errors
///
/// Returns a `Parsing` error when `config.ocr` is `None`, when the renderer cannot
/// be created, when rendering fails, or when a page cannot be PNG-encoded; a
/// `Plugin` error when the backend registry lock cannot be read. Errors from the
/// registry lookup and from the backend's `process_image` are propagated.
#[cfg(feature = "ocr")]
pub(crate) async fn extract_with_ocr(content: &[u8], config: &ExtractionConfig) -> crate::Result<String> {
    use crate::pdf::rendering::{PageRenderOptions, PdfRenderer};
    use crate::plugins::registry::get_ocr_backend_registry;
    use image::ImageEncoder;
    use image::codecs::png::PngEncoder;
    use std::io::Cursor;

    /// Builds the source-less parsing error used for every failure in this function.
    fn parsing_error(message: String) -> crate::KreuzbergError {
        crate::KreuzbergError::Parsing { message, source: None }
    }

    let ocr_config = config
        .ocr
        .as_ref()
        .ok_or_else(|| parsing_error("OCR config required for force_ocr".to_string()))?;

    // The registry read guard is confined to this block so it is released before any `.await`.
    let backend = {
        let registry = get_ocr_backend_registry();
        let guard = registry.read().map_err(|e| crate::KreuzbergError::Plugin {
            message: format!("Failed to acquire read lock on OCR backend registry: {}", e),
            plugin_name: "ocr-registry".to_string(),
        })?;
        guard.get(&ocr_config.backend)?
    };

    // The renderer is likewise scoped so it is dropped before the OCR loop awaits.
    let images = {
        let render_options = PageRenderOptions::default();
        let renderer = PdfRenderer::new()
            .map_err(|e| parsing_error(format!("Failed to initialize PDF renderer: {}", e)))?;

        renderer
            .render_all_pages(content, &render_options)
            .map_err(|e| parsing_error(format!("Failed to render PDF pages: {}", e)))?
    };

    let mut page_texts = Vec::with_capacity(images.len());

    for page_image in images {
        let rgb = page_image.to_rgb8();
        let (width, height) = rgb.dimensions();

        let mut png_buffer = Cursor::new(Vec::new());
        PngEncoder::new(&mut png_buffer)
            .write_image(&rgb, width, height, image::ColorType::Rgb8.into())
            .map_err(|e| parsing_error(format!("Failed to encode image: {}", e)))?;
        let png_bytes = png_buffer.into_inner();

        let ocr_result = backend.process_image(&png_bytes, ocr_config).await?;
        page_texts.push(ocr_result.content);
    }

    Ok(page_texts.join("\n\n"))
}
@@ -0,0 +1,51 @@
1
+ //! Page content management for PDF extraction.
2
+ //!
3
+ //! Handles assignment of tables and images to specific pages.
4
+
5
+ use crate::types::PageContent;
6
+
7
+ /// Helper function to assign tables and images to pages.
8
+ ///
9
+ /// If page_contents is None, returns None (no per-page tracking enabled).
10
+ /// Otherwise, iterates through tables and images, assigning them to pages based on page_number.
11
+ ///
12
+ /// # Performance
13
+ ///
14
+ /// Uses Arc::new to wrap tables and images, avoiding expensive copies.
15
+ /// This reduces memory overhead by enabling zero-copy sharing of table/image data
16
+ /// across multiple references (e.g., when the same table appears on multiple pages).
17
+ ///
18
+ /// # Arguments
19
+ ///
20
+ /// * `page_contents` - Optional vector of page contents to populate
21
+ /// * `tables` - Slice of tables to assign to pages
22
+ /// * `images` - Slice of images to assign to pages
23
+ ///
24
+ /// # Returns
25
+ ///
26
+ /// Updated page contents with tables and images assigned, or None if page tracking disabled
27
+ pub(crate) fn assign_tables_and_images_to_pages(
28
+ mut page_contents: Option<Vec<PageContent>>,
29
+ tables: &[crate::types::Table],
30
+ images: &[crate::types::ExtractedImage],
31
+ ) -> Option<Vec<PageContent>> {
32
+ let pages = page_contents.take()?;
33
+
34
+ let mut updated_pages = pages;
35
+
36
+ for table in tables {
37
+ if let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == table.page_number) {
38
+ page.tables.push(std::sync::Arc::new(table.clone()));
39
+ }
40
+ }
41
+
42
+ for image in images {
43
+ if let Some(page_num) = image.page_number
44
+ && let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == page_num)
45
+ {
46
+ page.images.push(std::sync::Arc::new(image.clone()));
47
+ }
48
+ }
49
+
50
+ Some(updated_pages)
51
+ }