kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,281 @@
1
+ //! Core types for document extraction.
2
+
3
+ // Module declarations
4
+ pub mod djot;
5
+ pub mod extraction;
6
+ pub mod formats;
7
+ pub mod metadata;
8
+ pub mod page;
9
+ pub mod serde_helpers;
10
+ pub mod tables;
11
+
12
+ // Re-export all types for backward compatibility
13
+ pub use djot::*;
14
+ pub use extraction::*;
15
+ pub use formats::*;
16
+ pub use metadata::*;
17
+ pub use page::*;
18
+ pub use tables::*;
19
+
20
+ #[cfg(test)]
21
+ mod tests {
22
+ use super::*;
23
+ use std::sync::Arc;
24
+
25
+ #[test] // Checks the JSON shape of `Metadata` when a text format and one extra entry are set.
26
+ fn test_metadata_serialization_with_format() {
27
+ let mut metadata = Metadata {
28
+ format: Some(FormatMetadata::Text(TextMetadata {
29
+ line_count: 1,
30
+ word_count: 2,
31
+ character_count: 13,
32
+ headers: None,
33
+ links: None,
34
+ code_blocks: None,
35
+ })),
36
+ ..Default::default()
37
+ };
38
+ // Add one free-form entry to the `additional` map.
39
+ metadata
40
+ .additional
41
+ .insert("quality_score".to_string(), serde_json::json!(1.0));
42
+
43
+ let json = serde_json::to_value(&metadata).unwrap();
44
+ println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
45
+ // The format variant is expected to show up as a top-level `format_type` tag.
46
+ assert!(
47
+ json.get("format_type").is_some(),
48
+ "format_type should be present in serialized JSON"
49
+ );
50
+ assert_eq!(json.get("format_type").unwrap(), "text");
51
+ // The text-format counters are read from the top level of the JSON object.
52
+ assert_eq!(json.get("line_count").unwrap(), 1);
53
+ assert_eq!(json.get("word_count").unwrap(), 2);
54
+ assert_eq!(json.get("character_count").unwrap(), 13);
55
+ // The `additional` entry is likewise read from the top level.
56
+ assert_eq!(json.get("quality_score").unwrap(), 1.0);
57
+ }
58
+
59
+ #[test] // Serializes a single `Table` and checks its three JSON fields.
60
+ fn test_arc_table_serialization_format() {
61
+ let table = Table {
62
+ cells: vec![vec!["A".to_string(), "B".to_string()]],
63
+ markdown: "| A | B |\n|---|---|\n".to_string(),
64
+ page_number: 1,
65
+ };
66
+ // NOTE(review): despite the test name, no `Arc` is involved here; a plain `Table` is serialized.
67
+ let json = serde_json::to_value(&table).unwrap();
68
+ // Each struct field should appear under its own key.
69
+ assert_eq!(json.get("cells").unwrap()[0][0], "A");
70
+ assert_eq!(json.get("markdown").unwrap(), "| A | B |\n|---|---|\n");
71
+ assert_eq!(json.get("page_number").unwrap(), 1);
72
+ }
73
+
74
+ #[test] // Round-trips a `Table` through a JSON string and compares every field.
75
+ fn test_arc_table_roundtrip() {
76
+ let original = Table {
77
+ cells: vec![
78
+ vec!["X".to_string(), "Y".to_string()],
79
+ vec!["1".to_string(), "2".to_string()],
80
+ ],
81
+ markdown: "| X | Y |\n|---|---|\n| 1 | 2 |\n".to_string(),
82
+ page_number: 5,
83
+ };
84
+ // Serialize to a string, then parse it back into a fresh `Table`.
85
+ let json = serde_json::to_string(&original).unwrap();
86
+ let deserialized: Table = serde_json::from_str(&json).unwrap();
87
+ // NOTE(review): the name mentions `Arc`, but the value under test is a plain `Table`.
88
+ assert_eq!(deserialized.cells, original.cells);
89
+ assert_eq!(deserialized.markdown, original.markdown);
90
+ assert_eq!(deserialized.page_number, original.page_number);
91
+ }
92
+
93
+ #[test] // Checks `Arc` reference counting and pointer identity without any serialization step.
94
+ fn test_arc_sharing_preserved_before_serialization() {
95
+ let shared_table = Arc::new(Table {
96
+ cells: vec![vec!["shared".to_string()]],
97
+ markdown: "| shared |".to_string(),
98
+ page_number: 1,
99
+ });
100
+ // The vector holds two handles and `shared_table` is the third, hence a strong count of 3.
101
+ let tables_before = [Arc::clone(&shared_table), Arc::clone(&shared_table)].to_vec();
102
+ assert_eq!(Arc::strong_count(&tables_before[0]), 3);
103
+ assert_eq!(Arc::strong_count(&tables_before[1]), 3);
104
+ assert!(Arc::ptr_eq(&tables_before[0], &tables_before[1]));
105
+ }
106
+
107
+ #[test] // Serializes a `Vec<Table>` and inspects the resulting JSON array.
108
+ fn test_vec_arc_table_serialization_format() {
109
+ let tables = vec![
110
+ Table {
111
+ cells: vec![vec!["A".to_string()]],
112
+ markdown: "| A |".to_string(),
113
+ page_number: 1,
114
+ },
115
+ Table {
116
+ cells: vec![vec!["B".to_string()]],
117
+ markdown: "| B |".to_string(),
118
+ page_number: 2,
119
+ },
120
+ ];
121
+ // NOTE(review): the vector holds plain `Table` values, not `Arc<Table>`, despite the test name.
122
+ let json = serde_json::to_string(&tables).unwrap();
123
+ let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
124
+ // Expect a two-element array that keeps the input order.
125
+ assert!(parsed.is_array());
126
+ assert_eq!(parsed.as_array().unwrap().len(), 2);
127
+ assert_eq!(parsed[0]["cells"][0][0], "A");
128
+ assert_eq!(parsed[1]["cells"][0][0], "B");
129
+ }
130
+
131
+ #[test] // Round-trips a `PageContent` whose `tables` are `Arc<Table>` values.
132
+ fn test_page_content_arc_tables_roundtrip() {
133
+ let page = PageContent {
134
+ page_number: 3,
135
+ content: "Page 3 content".to_string(),
136
+ tables: vec![
137
+ Arc::new(Table {
138
+ cells: vec![vec!["Table1".to_string()]],
139
+ markdown: "| Table1 |".to_string(),
140
+ page_number: 3,
141
+ }),
142
+ Arc::new(Table {
143
+ cells: vec![vec!["Table2".to_string()]],
144
+ markdown: "| Table2 |".to_string(),
145
+ page_number: 3,
146
+ }),
147
+ ],
148
+ images: Vec::new(),
149
+ hierarchy: None,
150
+ };
151
+ // Serialize to a JSON string and parse it back.
152
+ let json = serde_json::to_string(&page).unwrap();
153
+ let deserialized: PageContent = serde_json::from_str(&json).unwrap();
154
+ // Scalar fields and both tables (in order) must survive the round trip.
155
+ assert_eq!(deserialized.page_number, 3);
156
+ assert_eq!(deserialized.content, "Page 3 content");
157
+ assert_eq!(deserialized.tables.len(), 2);
158
+ assert_eq!(deserialized.tables[0].cells[0][0], "Table1");
159
+ assert_eq!(deserialized.tables[1].cells[0][0], "Table2");
160
+ }
161
+
162
+ #[test] // Round-trips a `PageContent` whose `images` are `Arc<ExtractedImage>` values.
163
+ fn test_page_content_arc_images_roundtrip() {
164
+ let image1 = Arc::new(ExtractedImage {
165
+ data: vec![0xFF, 0xD8, 0xFF],
166
+ format: "jpeg".to_string(),
167
+ image_index: 0,
168
+ page_number: Some(1),
169
+ width: Some(100),
170
+ height: Some(200),
171
+ colorspace: Some("RGB".to_string()),
172
+ bits_per_component: Some(8),
173
+ is_mask: false,
174
+ description: Some("Image 1".to_string()),
175
+ ocr_result: None,
176
+ });
177
+ // Second image: different format, dimensions and colorspace from the first.
178
+ let image2 = Arc::new(ExtractedImage {
179
+ data: vec![0x89, 0x50, 0x4E],
180
+ format: "png".to_string(),
181
+ image_index: 1,
182
+ page_number: Some(1),
183
+ width: Some(300),
184
+ height: Some(400),
185
+ colorspace: Some("RGBA".to_string()),
186
+ bits_per_component: Some(8),
187
+ is_mask: false,
188
+ description: Some("Image 2".to_string()),
189
+ ocr_result: None,
190
+ });
191
+ // Page that carries both images and no tables.
192
+ let page = PageContent {
193
+ page_number: 1,
194
+ content: "Page with images".to_string(),
195
+ tables: Vec::new(),
196
+ images: vec![image1, image2],
197
+ hierarchy: None,
198
+ };
199
+ // Serialize to a JSON string and parse it back.
200
+ let json = serde_json::to_string(&page).unwrap();
201
+ let deserialized: PageContent = serde_json::from_str(&json).unwrap();
202
+ // Only the count, `format`, and one dimension per image are asserted; `data` is not compared.
203
+ assert_eq!(deserialized.images.len(), 2);
204
+ assert_eq!(deserialized.images[0].format, "jpeg");
205
+ assert_eq!(deserialized.images[0].width, Some(100));
206
+ assert_eq!(deserialized.images[1].format, "png");
207
+ assert_eq!(deserialized.images[1].height, Some(400));
208
+ }
209
+
210
+ #[test] // Shows that `Arc` pointer identity between pages is not restored by a JSON round trip.
211
+ fn test_arc_sharing_loss_with_page_content() {
212
+ let shared_table = Arc::new(Table {
213
+ cells: vec![vec!["shared across pages".to_string()]],
214
+ markdown: "| shared across pages |".to_string(),
215
+ page_number: 0,
216
+ });
217
+ // Both pages reference the same allocation through `Arc::clone`.
218
+ let page1 = PageContent {
219
+ page_number: 1,
220
+ content: "Page 1".to_string(),
221
+ tables: vec![Arc::clone(&shared_table)],
222
+ images: Vec::new(),
223
+ hierarchy: None,
224
+ };
225
+
226
+ let page2 = PageContent {
227
+ page_number: 2,
228
+ content: "Page 2".to_string(),
229
+ tables: vec![Arc::clone(&shared_table)],
230
+ images: Vec::new(),
231
+ hierarchy: None,
232
+ };
233
+ // Before serialization the two handles point at the same `Table`.
234
+ assert!(Arc::ptr_eq(&page1.tables[0], &page2.tables[0]));
235
+ // Round-trip both pages together through a single JSON array.
236
+ let pages = vec![page1, page2];
237
+ let json = serde_json::to_string(&pages).unwrap();
238
+ let deserialized: Vec<PageContent> = serde_json::from_str(&json).unwrap();
239
+ // Contents still match, but the deserialized tables are separate allocations.
240
+ assert_eq!(deserialized.len(), 2);
241
+ assert_eq!(deserialized[0].tables[0].cells, deserialized[1].tables[0].cells);
242
+ assert!(!Arc::ptr_eq(&deserialized[0].tables[0], &deserialized[1].tables[0]));
243
+ }
244
+
245
+ #[test] // Round-trips a `PageContent` with empty `tables` and `images`.
246
+ fn test_empty_page_content_arcs() {
247
+ let page = PageContent {
248
+ page_number: 5,
249
+ content: "No tables or images".to_string(),
250
+ tables: Vec::new(),
251
+ images: Vec::new(),
252
+ hierarchy: None,
253
+ };
254
+ // Serialize to a JSON string and parse it back.
255
+ let json = serde_json::to_string(&page).unwrap();
256
+ let deserialized: PageContent = serde_json::from_str(&json).unwrap();
257
+ // Empty vectors are expected to deserialize as empty vectors.
258
+ assert_eq!(deserialized.page_number, 5);
259
+ assert_eq!(deserialized.tables.len(), 0);
260
+ assert_eq!(deserialized.images.len(), 0);
261
+ }
262
+
263
+ #[test] // Serializes two tables in a `Vec` and looks for their cell text in the JSON string.
264
+ fn test_serde_vec_arc_module_behavior() {
265
+ let table1 = Table {
266
+ cells: vec![vec!["A".to_string()]],
267
+ markdown: "| A |".to_string(),
268
+ page_number: 1,
269
+ };
270
+
271
+ let table2 = Table {
272
+ cells: vec![vec!["B".to_string()]],
273
+ markdown: "| B |".to_string(),
274
+ page_number: 2,
275
+ };
276
+ // NOTE(review): this serializes a plain `Vec<Table>`; nothing from `serde_vec_arc` is referenced here — confirm intent.
277
+ let json = serde_json::to_string(&vec![table1, table2]).unwrap();
278
+ assert!(json.contains("\"A\""));
279
+ assert!(json.contains("\"B\""));
280
+ }
281
+ }
@@ -0,0 +1,182 @@
1
+ //! Page structure types for documents.
2
+ //!
3
+ //! This module defines types for representing paginated document structures.
4
+
5
+ use serde::{Deserialize, Serialize};
6
+ use std::sync::Arc;
7
+
8
+ // Import serde helper and types from sibling modules
9
+ use super::extraction::ExtractedImage;
10
+ use super::serde_helpers::serde_vec_arc;
11
+ use super::tables::Table;
12
+
13
+ /// Unified page structure for documents.
14
+ ///
15
+ /// Supports different page types (PDF pages, PPTX slides, Excel sheets)
16
+ /// with byte offset boundaries for chunk-to-page mapping.
17
+ #[derive(Debug, Clone, Serialize, Deserialize)]
18
+ pub struct PageStructure {
19
+ /// Total number of pages/slides/sheets
20
+ pub total_count: usize,
21
+
22
+ /// Type of paginated unit
23
+ pub unit_type: PageUnitType,
24
+
25
+ /// Byte offset boundaries for each page (see `PageBoundary`)
26
+ ///
27
+ /// Maps byte ranges in the extracted content to page numbers.
28
+ /// Used for chunk page range calculation. Omitted from serialized output when `None`.
29
+ #[serde(skip_serializing_if = "Option::is_none")]
30
+ pub boundaries: Option<Vec<PageBoundary>>,
31
+
32
+ /// Detailed per-page metadata (optional, only when needed; omitted from serialized output when `None`)
33
+ #[serde(skip_serializing_if = "Option::is_none")]
34
+ pub pages: Option<Vec<PageInfo>>,
35
+ }
36
+
37
+ /// Type of paginated unit in a document.
38
+ ///
39
+ /// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets). Serialized in snake_case as `"page"`, `"slide"` or `"sheet"`.
40
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
41
+ #[serde(rename_all = "snake_case")]
42
+ pub enum PageUnitType {
43
+ /// Standard document pages (PDF, DOCX, images)
44
+ Page,
45
+ /// Presentation slides (PPTX, ODP)
46
+ Slide,
47
+ /// Spreadsheet sheets (XLSX, ODS)
48
+ Sheet,
49
+ }
50
+
51
+ /// Byte offset boundary for a page.
52
+ ///
53
+ /// Tracks where a specific page's content starts and ends in the main content string
54
+ /// (the half-open byte range `byte_start..byte_end`), enabling mapping from byte positions to page numbers.
55
+ /// The type itself does not validate offsets; they fall on valid UTF-8 character boundaries when derived from the content length while appending with standard String methods (push_str, push, etc.).
56
+ #[derive(Debug, Clone, Serialize, Deserialize)]
57
+ pub struct PageBoundary {
58
+ /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
59
+ pub byte_start: usize,
60
+ /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
61
+ pub byte_end: usize,
62
+ /// Page number (1-indexed)
63
+ pub page_number: usize,
64
+ }
65
+
66
+ /// Metadata for individual page/slide/sheet.
67
+ ///
68
+ /// Captures per-page information including dimensions, content counts,
69
+ /// and visibility state (for presentations). Optional fields that are `None` are omitted from serialized output.
70
+ #[derive(Debug, Clone, Serialize, Deserialize)]
71
+ pub struct PageInfo {
72
+ /// Page number (1-indexed)
73
+ pub number: usize,
74
+
75
+ /// Page title (usually for presentations)
76
+ #[serde(skip_serializing_if = "Option::is_none")]
77
+ pub title: Option<String>,
78
+
79
+ /// Dimensions in points (PDF) or pixels (images): (width, height)
80
+ #[serde(skip_serializing_if = "Option::is_none")]
81
+ pub dimensions: Option<(f64, f64)>,
82
+
83
+ /// Number of images on this page
84
+ #[serde(skip_serializing_if = "Option::is_none")]
85
+ pub image_count: Option<usize>,
86
+
87
+ /// Number of tables on this page
88
+ #[serde(skip_serializing_if = "Option::is_none")]
89
+ pub table_count: Option<usize>,
90
+
91
+ /// Whether this page is hidden (e.g., in presentations)
92
+ #[serde(skip_serializing_if = "Option::is_none")]
93
+ pub hidden: Option<bool>,
94
+ }
95
+
96
+ /// Content for a single page/slide.
97
+ ///
98
+ /// When page extraction is enabled, documents are split into per-page content
99
+ /// with associated tables and images mapped to each page.
100
+ ///
101
+ /// # Performance
102
+ ///
103
+ /// Uses Arc-wrapped tables and images for memory efficiency:
104
+ /// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
105
+ /// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
106
+ /// - Maintains exact JSON compatibility via the `serde_vec_arc` helper module
107
+ ///
108
+ /// This reduces memory overhead when the same table/image is referenced from several places in memory.
109
+ /// Sharing is in-memory only: after a serialize/deserialize round trip every element is wrapped in its own new `Arc`.
110
+ #[derive(Debug, Clone, Serialize, Deserialize)]
111
+ pub struct PageContent {
112
+ /// Page number (1-indexed)
113
+ pub page_number: usize,
114
+
115
+ /// Text content for this page
116
+ pub content: String,
117
+
118
+ /// Tables found on this page (uses Arc for memory efficiency)
119
+ ///
120
+ /// Serializes as Vec<Table> for JSON compatibility while maintaining
121
+ /// Arc semantics in-memory for zero-copy sharing. Omitted from output when empty; defaults to empty when absent from input.
122
+ #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
123
+ pub tables: Vec<Arc<Table>>,
124
+
125
+ /// Images found on this page (uses Arc for memory efficiency)
126
+ ///
127
+ /// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
128
+ /// Arc semantics in-memory for zero-copy sharing. Omitted from output when empty; defaults to empty when absent from input.
129
+ #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
130
+ pub images: Vec<Arc<ExtractedImage>>,
131
+
132
+ /// Hierarchy information for the page (when hierarchy extraction is enabled)
133
+ ///
134
+ /// Contains text hierarchy levels (H1-H6) extracted from the page content.
135
+ #[serde(skip_serializing_if = "Option::is_none")]
136
+ pub hierarchy: Option<PageHierarchy>,
137
+ }
138
+
139
+ /// Page hierarchy structure containing heading levels and block information.
140
+ ///
141
+ /// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
142
+ /// blocks with heading levels (H1-H6) for semantic document structure.
143
+ #[derive(Debug, Clone, Serialize, Deserialize)]
144
+ pub struct PageHierarchy {
145
+ /// Number of hierarchy blocks on this page
146
+ pub block_count: usize,
147
+
148
+ /// Hierarchical blocks with heading levels (omitted from serialized output when empty; defaults to empty when absent from input)
149
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
150
+ pub blocks: Vec<HierarchicalBlock>,
151
+ }
152
+
153
+ /// A text block with hierarchy level assignment.
154
+ ///
155
+ /// Represents a block of text with semantic heading information extracted from
156
+ /// font size clustering and hierarchical analysis.
157
+ #[derive(Debug, Clone, Serialize, Deserialize)]
158
+ pub struct HierarchicalBlock {
159
+ /// The text content of this block
160
+ pub text: String,
161
+
162
+ /// The font size of the text in this block
163
+ pub font_size: f32,
164
+
165
+ /// The hierarchy level of this block, stored as a lowercase string (`"h1"`-`"h6"` or `"body"`)
166
+ ///
167
+ /// Levels correspond to HTML heading tags:
168
+ /// - "h1": Top-level heading
169
+ /// - "h2": Secondary heading
170
+ /// - "h3": Tertiary heading
171
+ /// - "h4": Quaternary heading
172
+ /// - "h5": Quinary heading
173
+ /// - "h6": Senary heading
174
+ /// - "body": Body text (no heading level)
175
+ pub level: String,
176
+
177
+ /// Bounding box information for the block
178
+ ///
179
+ /// Contains coordinates as (left, top, right, bottom) in PDF units. Omitted from serialized output when `None`.
180
+ #[serde(skip_serializing_if = "Option::is_none")]
181
+ pub bbox: Option<(f32, f32, f32, f32)>,
182
+ }
@@ -0,0 +1,132 @@
1
+ //! Custom serde serialization helpers for Arc<T> and Vec<Arc<T>>.
2
+
3
+ /// Module providing transparent serde support for Arc<T>.
4
+ ///
5
+ /// Allows Arc-wrapped types to serialize/deserialize as if unwrapped,
6
+ /// maintaining exact JSON format while preserving memory efficiency benefits.
7
+ ///
8
+ /// # Arc Sharing Semantics
9
+ ///
10
+ /// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
11
+ /// When deserializing, each Arc is independently created with `Arc::new()`.
12
+ /// This means that if two Arcs referenced the same data before serialization,
13
+ /// they will be separate Arcs after deserialization.
14
+ ///
15
+ /// Example:
16
+ /// ```ignore
17
+ /// let shared = Arc::new(Table { /* ... */ });
18
+ /// let tables = vec![Arc::clone(&shared), Arc::clone(&shared)];
19
+ /// // Both in-memory Arcs point to the same Table
20
+ ///
21
+ /// let json = serde_json::to_string(&tables)?;
22
+ /// let deserialized: Vec<Arc<Table>> = serde_json::from_str(&json)?;
23
+ /// // deserialized[0] and deserialized[1] are now independent Arcs,
24
+ /// // even though they contain identical data
25
+ /// ```
26
+ ///
27
+ /// This design choice maintains:
28
+ /// - Exact JSON format compatibility (no sharing metadata in JSON)
29
+ /// - Predictable deserialization behavior
30
+ /// - Zero additional serialization overhead
31
+ ///
32
+ /// If in-memory sharing is required, callers must implement custom sharing logic
33
+ /// or use a different data structure (like a HashMap of deduplicated values).
34
+ #[allow(dead_code)]
35
+ pub mod serde_arc {
36
+ use serde::{Deserialize, Deserializer, Serializer};
37
+ use std::sync::Arc;
38
+
39
+ /// Serialize an Arc<T> by serializing the inner value directly.
40
+ ///
41
+ /// This makes Arc<T> serialize identically to T, maintaining API compatibility.
42
+ /// The outer Arc wrapper is transparent during serialization.
43
+ pub fn serialize<S, T>(arc_value: &Arc<T>, serializer: S) -> Result<S::Ok, S::Error>
44
+ where
45
+ S: Serializer,
46
+ T: serde::Serialize,
47
+ {
48
+ (**arc_value).serialize(serializer)
49
+ }
50
+
51
+ /// Deserialize a T and wrap it in Arc.
52
+ ///
53
+ /// This makes Arc<T> deserialize from the same format as T.
54
+ /// Each Arc is independently created during deserialization;
55
+ /// Arc sharing from before serialization is NOT preserved.
56
+ pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Arc<T>, D::Error>
57
+ where
58
+ D: Deserializer<'de>,
59
+ T: Deserialize<'de>,
60
+ {
61
+ T::deserialize(deserializer).map(Arc::new)
62
+ }
63
+ }
64
+
65
+ /// Module for serializing Vec<Arc<T>> with transparent Arc handling.
66
+ ///
67
+ /// Serializes a Vec<Arc<T>> as Vec<T> for compatibility, while preserving
68
+ /// Arc semantics for memory efficiency.
69
+ ///
70
+ /// # Arc Sharing Semantics
71
+ ///
72
+ /// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
73
+ /// When deserializing, each element's Arc is independently created with `Arc::new()`.
74
+ /// This is important for `PageContent` where tables/images may be shared across pages.
75
+ ///
76
+ /// Example with shared tables:
77
+ /// ```ignore
78
+ /// let shared_table = Arc::new(Table { /* ... */ });
79
+ /// let page_contents = vec![
80
+ /// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
81
+ /// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
82
+ /// ];
83
+ /// // In-memory: both pages' tables point to the same Arc
84
+ ///
85
+ /// let json = serde_json::to_string(&page_contents)?;
86
+ /// let deserialized = serde_json::from_str::<Vec<PageContent>>(&json)?;
87
+ /// // After deserialization: each page has independent Arc instances,
88
+ /// // even though the table data is identical
89
+ /// ```
90
+ ///
91
+ /// Design rationale:
92
+ /// - JSON has no mechanism to represent shared references
93
+ /// - Preserving sharing would require complex metadata and deduplication
94
+ /// - Current approach is simple, predictable, and maintains compatibility
95
+ /// - In-memory sharing (via Arc) is an implementation detail for the Rust side
96
+ ///
97
+ /// If in-memory sharing is required after deserialization, implement custom
98
+ /// deduplication logic using hashing or content comparison.
99
+ pub mod serde_vec_arc {
100
+ use serde::{Deserialize, Deserializer, Serializer};
101
+ use std::sync::Arc;
102
+
103
+ /// Serialize Vec<Arc<T>> by serializing each T directly.
104
+ ///
105
+ /// Each element is unwrapped from its Arc and serialized independently.
106
+ /// No sharing metadata is included in the serialized output.
107
+ pub fn serialize<S, T>(vec: &[Arc<T>], serializer: S) -> Result<S::Ok, S::Error>
108
+ where
109
+ S: Serializer,
110
+ T: serde::Serialize,
111
+ {
112
+ use serde::ser::SerializeSeq;
113
+ let mut seq = serializer.serialize_seq(Some(vec.len()))?;
114
+ for arc_item in vec {
115
+ seq.serialize_element(&**arc_item)?;
116
+ }
117
+ seq.end()
118
+ }
119
+
120
+ /// Deserialize Vec<T> and wrap each element in Arc.
121
+ ///
122
+ /// Each element is independently wrapped in a new Arc.
123
+ /// Sharing relationships from before serialization are lost.
124
+ pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Vec<Arc<T>>, D::Error>
125
+ where
126
+ D: Deserializer<'de>,
127
+ T: Deserialize<'de>,
128
+ {
129
+ let vec: Vec<T> = Deserialize::deserialize(deserializer)?;
130
+ Ok(vec.into_iter().map(Arc::new).collect())
131
+ }
132
+ }
@@ -0,0 +1,39 @@
1
+ //! Table-related types for document extraction.
2
+
3
+ use serde::{Deserialize, Serialize};
4
+
5
+ /// Extracted table structure.
6
+ ///
7
+ /// Represents a table detected and extracted from a document (PDF, image, etc.).
8
+ /// Tables are converted to both structured cell data and Markdown format.
9
+ #[derive(Debug, Clone, Serialize, Deserialize)]
10
+ pub struct Table {
11
+ /// Table cells as a 2D vector of rows, each row holding its column values (`cells[row][column]`)
12
+ pub cells: Vec<Vec<String>>,
13
+ /// Markdown representation of the table
14
+ pub markdown: String,
15
+ /// Page number where the table was found (1-indexed)
16
+ pub page_number: usize,
17
+ }
18
+
19
+ /// Individual table cell with content and optional styling.
20
+ ///
21
+ /// Future extension point for rich table support with cell-level metadata; `Table` currently stores its cells as plain strings rather than as `TableCell` values.
22
+ #[derive(Debug, Clone, Serialize, Deserialize)]
23
+ pub struct TableCell {
24
+ /// Cell content as text
25
+ pub content: String,
26
+ /// Row span (number of rows this cell spans); defaults to 1 when absent from input
27
+ #[serde(default = "default_span")]
28
+ pub row_span: usize,
29
+ /// Column span (number of columns this cell spans); defaults to 1 when absent from input
30
+ #[serde(default = "default_span")]
31
+ pub col_span: usize,
32
+ /// Whether this is a header cell; defaults to `false` when absent from input
33
+ #[serde(default)]
34
+ pub is_header: bool,
35
+ }
36
+
37
/// Serde default for the `row_span` / `col_span` fields of `TableCell`:
/// a cell covers a single row or column unless the input says otherwise.
fn default_span() -> usize {
    1_usize
}
@@ -0,0 +1,58 @@
1
+ //! Quality heuristics and text analysis
2
+ //!
3
+ //! This module provides heuristic checks for text quality. It currently
4
+ //! contains the structure analysis bonus (sentence, paragraph and punctuation signals).
5
+
6
+ use super::patterns::*;
7
+
8
+ // ============================================================================
9
+ // Structure Thresholds
10
+ // ============================================================================
11
+
12
+ const MIN_SENTENCE_WORDS: f64 = 10.0;
13
+ const MAX_SENTENCE_WORDS: f64 = 30.0;
14
+ const MIN_PARAGRAPH_WORDS: f64 = 50.0;
15
+ const MAX_PARAGRAPH_WORDS: f64 = 300.0;
16
+
17
+ // ============================================================================
18
+ // Structure Analysis
19
+ // ============================================================================
20
+
21
+ /// Calculate bonus based on text structure quality
22
+ #[inline]
23
+ pub(crate) fn calculate_structure_bonus(text: &str) -> f64 {
24
+ if text.is_empty() {
25
+ return 0.0;
26
+ }
27
+
28
+ let sentence_count = SENTENCE_DETECT.find_iter(text).count() as f64;
29
+ let paragraph_count = text.matches("\n\n").count() as f64 + 1.0;
30
+ let words = text.split_whitespace().count() as f64;
31
+
32
+ if words == 0.0 {
33
+ return 0.0;
34
+ }
35
+
36
+ let avg_words_per_sentence = words / sentence_count.max(1.0);
37
+ let avg_words_per_paragraph = words / paragraph_count.max(1.0);
38
+
39
+ let mut structure_score: f64 = 0.0;
40
+
41
+ if (MIN_SENTENCE_WORDS..=MAX_SENTENCE_WORDS).contains(&avg_words_per_sentence) {
42
+ structure_score += 0.3;
43
+ }
44
+
45
+ if (MIN_PARAGRAPH_WORDS..=MAX_PARAGRAPH_WORDS).contains(&avg_words_per_paragraph) {
46
+ structure_score += 0.3;
47
+ }
48
+
49
+ if paragraph_count > 1.0 {
50
+ structure_score += 0.2;
51
+ }
52
+
53
+ if PUNCTUATION_DETECT.is_match(text) {
54
+ structure_score += 0.2;
55
+ }
56
+
57
+ structure_score.min(1.0)
58
+ }