kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,281 @@
1
+ //! Core types for document extraction.
2
+
3
+ // Module declarations
4
+ pub mod djot;
5
+ pub mod extraction;
6
+ pub mod formats;
7
+ pub mod metadata;
8
+ pub mod page;
9
+ pub mod serde_helpers;
10
+ pub mod tables;
11
+
12
+ // Re-export the public types for backward compatibility (`serde_helpers` is not glob re-exported)
13
+ pub use djot::*;
14
+ pub use extraction::*;
15
+ pub use formats::*;
16
+ pub use metadata::*;
17
+ pub use page::*;
18
+ pub use tables::*;
19
+
20
+ #[cfg(test)]
21
+ mod tests {
22
+ use super::*;
23
+ use std::sync::Arc;
24
+
25
+ #[test]
26
+ fn test_metadata_serialization_with_format() {
27
+ let mut metadata = Metadata {
28
+ format: Some(FormatMetadata::Text(TextMetadata {
29
+ line_count: 1,
30
+ word_count: 2,
31
+ character_count: 13,
32
+ headers: None,
33
+ links: None,
34
+ code_blocks: None,
35
+ })),
36
+ ..Default::default()
37
+ };
38
+
39
+ metadata
40
+ .additional
41
+ .insert("quality_score".to_string(), serde_json::json!(1.0));
42
+
43
+ let json = serde_json::to_value(&metadata).unwrap();
44
+ println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
45
+
46
+ assert!(
47
+ json.get("format_type").is_some(),
48
+ "format_type should be present in serialized JSON"
49
+ );
50
+ assert_eq!(json.get("format_type").unwrap(), "text");
51
+
52
+ assert_eq!(json.get("line_count").unwrap(), 1);
53
+ assert_eq!(json.get("word_count").unwrap(), 2);
54
+ assert_eq!(json.get("character_count").unwrap(), 13);
55
+
56
+ assert_eq!(json.get("quality_score").unwrap(), 1.0);
57
+ }
58
+
59
+ #[test]
60
+ fn test_arc_table_serialization_format() {
61
+ let table = Table {
62
+ cells: vec![vec!["A".to_string(), "B".to_string()]],
63
+ markdown: "| A | B |\n|---|---|\n".to_string(),
64
+ page_number: 1,
65
+ };
66
+
67
+ let json = serde_json::to_value(&table).unwrap();
68
+
69
+ assert_eq!(json.get("cells").unwrap()[0][0], "A");
70
+ assert_eq!(json.get("markdown").unwrap(), "| A | B |\n|---|---|\n");
71
+ assert_eq!(json.get("page_number").unwrap(), 1);
72
+ }
73
+
74
+ #[test]
75
+ fn test_arc_table_roundtrip() {
76
+ let original = Table {
77
+ cells: vec![
78
+ vec!["X".to_string(), "Y".to_string()],
79
+ vec!["1".to_string(), "2".to_string()],
80
+ ],
81
+ markdown: "| X | Y |\n|---|---|\n| 1 | 2 |\n".to_string(),
82
+ page_number: 5,
83
+ };
84
+
85
+ let json = serde_json::to_string(&original).unwrap();
86
+ let deserialized: Table = serde_json::from_str(&json).unwrap();
87
+
88
+ assert_eq!(deserialized.cells, original.cells);
89
+ assert_eq!(deserialized.markdown, original.markdown);
90
+ assert_eq!(deserialized.page_number, original.page_number);
91
+ }
92
+
93
+ #[test]
94
+ fn test_arc_sharing_preserved_before_serialization() {
95
+ let shared_table = Arc::new(Table {
96
+ cells: vec![vec!["shared".to_string()]],
97
+ markdown: "| shared |".to_string(),
98
+ page_number: 1,
99
+ });
100
+
101
+ let tables_before = [Arc::clone(&shared_table), Arc::clone(&shared_table)].to_vec();
102
+ assert_eq!(Arc::strong_count(&tables_before[0]), 3);
103
+ assert_eq!(Arc::strong_count(&tables_before[1]), 3);
104
+ assert!(Arc::ptr_eq(&tables_before[0], &tables_before[1]));
105
+ }
106
+
107
+ #[test]
108
+ fn test_vec_arc_table_serialization_format() {
109
+ let tables = vec![
110
+ Table {
111
+ cells: vec![vec!["A".to_string()]],
112
+ markdown: "| A |".to_string(),
113
+ page_number: 1,
114
+ },
115
+ Table {
116
+ cells: vec![vec!["B".to_string()]],
117
+ markdown: "| B |".to_string(),
118
+ page_number: 2,
119
+ },
120
+ ];
121
+
122
+ let json = serde_json::to_string(&tables).unwrap();
123
+ let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
124
+
125
+ assert!(parsed.is_array());
126
+ assert_eq!(parsed.as_array().unwrap().len(), 2);
127
+ assert_eq!(parsed[0]["cells"][0][0], "A");
128
+ assert_eq!(parsed[1]["cells"][0][0], "B");
129
+ }
130
+
131
+ #[test]
132
+ fn test_page_content_arc_tables_roundtrip() {
133
+ let page = PageContent {
134
+ page_number: 3,
135
+ content: "Page 3 content".to_string(),
136
+ tables: vec![
137
+ Arc::new(Table {
138
+ cells: vec![vec!["Table1".to_string()]],
139
+ markdown: "| Table1 |".to_string(),
140
+ page_number: 3,
141
+ }),
142
+ Arc::new(Table {
143
+ cells: vec![vec!["Table2".to_string()]],
144
+ markdown: "| Table2 |".to_string(),
145
+ page_number: 3,
146
+ }),
147
+ ],
148
+ images: Vec::new(),
149
+ hierarchy: None,
150
+ };
151
+
152
+ let json = serde_json::to_string(&page).unwrap();
153
+ let deserialized: PageContent = serde_json::from_str(&json).unwrap();
154
+
155
+ assert_eq!(deserialized.page_number, 3);
156
+ assert_eq!(deserialized.content, "Page 3 content");
157
+ assert_eq!(deserialized.tables.len(), 2);
158
+ assert_eq!(deserialized.tables[0].cells[0][0], "Table1");
159
+ assert_eq!(deserialized.tables[1].cells[0][0], "Table2");
160
+ }
161
+
162
+ #[test]
163
+ fn test_page_content_arc_images_roundtrip() {
164
+ let image1 = Arc::new(ExtractedImage {
165
+ data: vec![0xFF, 0xD8, 0xFF],
166
+ format: "jpeg".to_string(),
167
+ image_index: 0,
168
+ page_number: Some(1),
169
+ width: Some(100),
170
+ height: Some(200),
171
+ colorspace: Some("RGB".to_string()),
172
+ bits_per_component: Some(8),
173
+ is_mask: false,
174
+ description: Some("Image 1".to_string()),
175
+ ocr_result: None,
176
+ });
177
+
178
+ let image2 = Arc::new(ExtractedImage {
179
+ data: vec![0x89, 0x50, 0x4E],
180
+ format: "png".to_string(),
181
+ image_index: 1,
182
+ page_number: Some(1),
183
+ width: Some(300),
184
+ height: Some(400),
185
+ colorspace: Some("RGBA".to_string()),
186
+ bits_per_component: Some(8),
187
+ is_mask: false,
188
+ description: Some("Image 2".to_string()),
189
+ ocr_result: None,
190
+ });
191
+
192
+ let page = PageContent {
193
+ page_number: 1,
194
+ content: "Page with images".to_string(),
195
+ tables: Vec::new(),
196
+ images: vec![image1, image2],
197
+ hierarchy: None,
198
+ };
199
+
200
+ let json = serde_json::to_string(&page).unwrap();
201
+ let deserialized: PageContent = serde_json::from_str(&json).unwrap();
202
+
203
+ assert_eq!(deserialized.images.len(), 2);
204
+ assert_eq!(deserialized.images[0].format, "jpeg");
205
+ assert_eq!(deserialized.images[0].width, Some(100));
206
+ assert_eq!(deserialized.images[1].format, "png");
207
+ assert_eq!(deserialized.images[1].height, Some(400));
208
+ }
209
+
210
+ #[test]
211
+ fn test_arc_sharing_loss_with_page_content() {
212
+ let shared_table = Arc::new(Table {
213
+ cells: vec![vec!["shared across pages".to_string()]],
214
+ markdown: "| shared across pages |".to_string(),
215
+ page_number: 0,
216
+ });
217
+
218
+ let page1 = PageContent {
219
+ page_number: 1,
220
+ content: "Page 1".to_string(),
221
+ tables: vec![Arc::clone(&shared_table)],
222
+ images: Vec::new(),
223
+ hierarchy: None,
224
+ };
225
+
226
+ let page2 = PageContent {
227
+ page_number: 2,
228
+ content: "Page 2".to_string(),
229
+ tables: vec![Arc::clone(&shared_table)],
230
+ images: Vec::new(),
231
+ hierarchy: None,
232
+ };
233
+
234
+ assert!(Arc::ptr_eq(&page1.tables[0], &page2.tables[0]));
235
+
236
+ let pages = vec![page1, page2];
237
+ let json = serde_json::to_string(&pages).unwrap();
238
+ let deserialized: Vec<PageContent> = serde_json::from_str(&json).unwrap();
239
+
240
+ assert_eq!(deserialized.len(), 2);
241
+ assert_eq!(deserialized[0].tables[0].cells, deserialized[1].tables[0].cells);
242
+ assert!(!Arc::ptr_eq(&deserialized[0].tables[0], &deserialized[1].tables[0]));
243
+ }
244
+
245
+ #[test]
246
+ fn test_empty_page_content_arcs() {
247
+ let page = PageContent {
248
+ page_number: 5,
249
+ content: "No tables or images".to_string(),
250
+ tables: Vec::new(),
251
+ images: Vec::new(),
252
+ hierarchy: None,
253
+ };
254
+
255
+ let json = serde_json::to_string(&page).unwrap();
256
+ let deserialized: PageContent = serde_json::from_str(&json).unwrap();
257
+
258
+ assert_eq!(deserialized.page_number, 5);
259
+ assert_eq!(deserialized.tables.len(), 0);
260
+ assert_eq!(deserialized.images.len(), 0);
261
+ }
262
+
263
+ #[test]
264
+ fn test_serde_vec_arc_module_behavior() {
265
+ let table1 = Table {
266
+ cells: vec![vec!["A".to_string()]],
267
+ markdown: "| A |".to_string(),
268
+ page_number: 1,
269
+ };
270
+
271
+ let table2 = Table {
272
+ cells: vec![vec!["B".to_string()]],
273
+ markdown: "| B |".to_string(),
274
+ page_number: 2,
275
+ };
276
+
277
+ let json = serde_json::to_string(&vec![table1, table2]).unwrap();
278
+ assert!(json.contains("\"A\""));
279
+ assert!(json.contains("\"B\""));
280
+ }
281
+ }
@@ -0,0 +1,182 @@
1
+ //! Page structure types for documents.
2
+ //!
3
+ //! This module defines types for representing paginated document structures.
4
+
5
+ use serde::{Deserialize, Serialize};
6
+ use std::sync::Arc;
7
+
8
+ // Import serde helper and types from sibling modules
9
+ use super::extraction::ExtractedImage;
10
+ use super::serde_helpers::serde_vec_arc;
11
+ use super::tables::Table;
12
+
13
+ /// Unified page structure for documents.
14
+ ///
15
+ /// Supports different page types (PDF pages, PPTX slides, Excel sheets)
16
+ /// with character offset boundaries for chunk-to-page mapping.
17
+ #[derive(Debug, Clone, Serialize, Deserialize)]
18
+ pub struct PageStructure {
19
+ /// Total number of pages/slides/sheets
20
+ pub total_count: usize,
21
+
22
+ /// Type of paginated unit
23
+ pub unit_type: PageUnitType,
24
+
25
+ /// Character offset boundaries for each page
26
+ ///
27
+ /// Maps character ranges in the extracted content to page numbers.
28
+ /// Used for chunk page range calculation.
29
+ #[serde(skip_serializing_if = "Option::is_none")]
30
+ pub boundaries: Option<Vec<PageBoundary>>,
31
+
32
+ /// Detailed per-page metadata (optional, only when needed)
33
+ #[serde(skip_serializing_if = "Option::is_none")]
34
+ pub pages: Option<Vec<PageInfo>>,
35
+ }
36
+
37
+ /// Type of paginated unit in a document.
38
+ ///
39
+ /// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
40
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
41
+ #[serde(rename_all = "snake_case")]
42
+ pub enum PageUnitType {
43
+ /// Standard document pages (PDF, DOCX, images)
44
+ Page,
45
+ /// Presentation slides (PPTX, ODP)
46
+ Slide,
47
+ /// Spreadsheet sheets (XLSX, ODS)
48
+ Sheet,
49
+ }
50
+
51
+ /// Byte offset boundary for a page.
52
+ ///
53
+ /// Tracks where a specific page's content starts and ends in the main content string,
54
+ /// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
55
+ /// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
56
+ #[derive(Debug, Clone, Serialize, Deserialize)]
57
+ pub struct PageBoundary {
58
+ /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
59
+ pub byte_start: usize,
60
+ /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
61
+ pub byte_end: usize,
62
+ /// Page number (1-indexed)
63
+ pub page_number: usize,
64
+ }
65
+
66
+ /// Metadata for individual page/slide/sheet.
67
+ ///
68
+ /// Captures per-page information including dimensions, content counts,
69
+ /// and visibility state (for presentations).
70
+ #[derive(Debug, Clone, Serialize, Deserialize)]
71
+ pub struct PageInfo {
72
+ /// Page number (1-indexed)
73
+ pub number: usize,
74
+
75
+ /// Page title (usually for presentations)
76
+ #[serde(skip_serializing_if = "Option::is_none")]
77
+ pub title: Option<String>,
78
+
79
+ /// Dimensions in points (PDF) or pixels (images): (width, height)
80
+ #[serde(skip_serializing_if = "Option::is_none")]
81
+ pub dimensions: Option<(f64, f64)>,
82
+
83
+ /// Number of images on this page
84
+ #[serde(skip_serializing_if = "Option::is_none")]
85
+ pub image_count: Option<usize>,
86
+
87
+ /// Number of tables on this page
88
+ #[serde(skip_serializing_if = "Option::is_none")]
89
+ pub table_count: Option<usize>,
90
+
91
+ /// Whether this page is hidden (e.g., in presentations)
92
+ #[serde(skip_serializing_if = "Option::is_none")]
93
+ pub hidden: Option<bool>,
94
+ }
95
+
96
+ /// Content for a single page/slide.
97
+ ///
98
+ /// When page extraction is enabled, documents are split into per-page content
99
+ /// with associated tables and images mapped to each page.
100
+ ///
101
+ /// # Performance
102
+ ///
103
+ /// Uses Arc-wrapped tables and images for memory efficiency:
104
+ /// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
105
+ /// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
106
+ /// - Maintains exact JSON compatibility via custom Serialize/Deserialize
107
+ ///
108
+ /// This reduces memory overhead for documents with shared tables/images
109
+ /// by avoiding redundant copies during serialization.
110
+ #[derive(Debug, Clone, Serialize, Deserialize)]
111
+ pub struct PageContent {
112
+ /// Page number (1-indexed)
113
+ pub page_number: usize,
114
+
115
+ /// Text content for this page
116
+ pub content: String,
117
+
118
+ /// Tables found on this page (uses Arc for memory efficiency)
119
+ ///
120
+ /// Serializes as Vec<Table> for JSON compatibility while maintaining
121
+ /// Arc semantics in-memory for zero-copy sharing.
122
+ #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
123
+ pub tables: Vec<Arc<Table>>,
124
+
125
+ /// Images found on this page (uses Arc for memory efficiency)
126
+ ///
127
+ /// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
128
+ /// Arc semantics in-memory for zero-copy sharing.
129
+ #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
130
+ pub images: Vec<Arc<ExtractedImage>>,
131
+
132
+ /// Hierarchy information for the page (when hierarchy extraction is enabled)
133
+ ///
134
+ /// Contains text hierarchy levels (H1-H6) extracted from the page content.
135
+ #[serde(skip_serializing_if = "Option::is_none")]
136
+ pub hierarchy: Option<PageHierarchy>,
137
+ }
138
+
139
+ /// Page hierarchy structure containing heading levels and block information.
140
+ ///
141
+ /// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
142
+ /// blocks with heading levels (H1-H6) for semantic document structure.
143
+ #[derive(Debug, Clone, Serialize, Deserialize)]
144
+ pub struct PageHierarchy {
145
+ /// Number of hierarchy blocks on this page
146
+ pub block_count: usize,
147
+
148
+ /// Hierarchical blocks with heading levels
149
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
150
+ pub blocks: Vec<HierarchicalBlock>,
151
+ }
152
+
153
+ /// A text block with hierarchy level assignment.
154
+ ///
155
+ /// Represents a block of text with semantic heading information extracted from
156
+ /// font size clustering and hierarchical analysis.
157
+ #[derive(Debug, Clone, Serialize, Deserialize)]
158
+ pub struct HierarchicalBlock {
159
+ /// The text content of this block
160
+ pub text: String,
161
+
162
+ /// The font size of the text in this block
163
+ pub font_size: f32,
164
+
165
+ /// The hierarchy level of this block (H1-H6 or Body)
166
+ ///
167
+ /// Levels correspond to HTML heading tags:
168
+ /// - "h1": Top-level heading
169
+ /// - "h2": Secondary heading
170
+ /// - "h3": Tertiary heading
171
+ /// - "h4": Quaternary heading
172
+ /// - "h5": Quinary heading
173
+ /// - "h6": Senary heading
174
+ /// - "body": Body text (no heading level)
175
+ pub level: String,
176
+
177
+ /// Bounding box information for the block
178
+ ///
179
+ /// Contains coordinates as (left, top, right, bottom) in PDF units.
180
+ #[serde(skip_serializing_if = "Option::is_none")]
181
+ pub bbox: Option<(f32, f32, f32, f32)>,
182
+ }
@@ -0,0 +1,132 @@
1
+ //! Custom serde serialization helpers for Arc<T> and Vec<Arc<T>>.
2
+
3
+ /// Module providing transparent serde support for Arc<T>.
4
+ ///
5
+ /// Allows Arc-wrapped types to serialize/deserialize as if unwrapped,
6
+ /// maintaining exact JSON format while preserving memory efficiency benefits.
7
+ ///
8
+ /// # Arc Sharing Semantics
9
+ ///
10
+ /// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
11
+ /// When deserializing, each Arc is independently created with `Arc::new()`.
12
+ /// This means that if two Arcs referenced the same data before serialization,
13
+ /// they will be separate Arcs after deserialization.
14
+ ///
15
+ /// Example:
16
+ /// ```ignore
17
+ /// let shared = Arc::new(Table { /* ... */ });
18
+ /// let tables = vec![Arc::clone(&shared), Arc::clone(&shared)];
19
+ /// // Both in-memory Arcs point to the same Table
20
+ ///
21
+ /// let json = serde_json::to_string(&tables)?;
22
+ /// let deserialized: Vec<Arc<Table>> = serde_json::from_str(&json)?;
23
+ /// // deserialized[0] and deserialized[1] are now independent Arcs,
24
+ /// // even though they contain identical data
25
+ /// ```
26
+ ///
27
+ /// This design choice maintains:
28
+ /// - Exact JSON format compatibility (no sharing metadata in JSON)
29
+ /// - Predictable deserialization behavior
30
+ /// - Zero additional serialization overhead
31
+ ///
32
+ /// If in-memory sharing is required, callers must implement custom sharing logic
33
+ /// or use a different data structure (like a HashMap of deduplicated values).
34
+ #[allow(dead_code)]
35
+ pub mod serde_arc {
36
+ use serde::{Deserialize, Deserializer, Serializer};
37
+ use std::sync::Arc;
38
+
39
+ /// Serialize an Arc<T> by serializing the inner value directly.
40
+ ///
41
+ /// This makes Arc<T> serialize identically to T, maintaining API compatibility.
42
+ /// The outer Arc wrapper is transparent during serialization.
43
+ pub fn serialize<S, T>(arc_value: &Arc<T>, serializer: S) -> Result<S::Ok, S::Error>
44
+ where
45
+ S: Serializer,
46
+ T: serde::Serialize,
47
+ {
48
+ (**arc_value).serialize(serializer)
49
+ }
50
+
51
+ /// Deserialize a T and wrap it in Arc.
52
+ ///
53
+ /// This makes Arc<T> deserialize from the same format as T.
54
+ /// Each Arc is independently created during deserialization;
55
+ /// Arc sharing from before serialization is NOT preserved.
56
+ pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Arc<T>, D::Error>
57
+ where
58
+ D: Deserializer<'de>,
59
+ T: Deserialize<'de>,
60
+ {
61
+ T::deserialize(deserializer).map(Arc::new)
62
+ }
63
+ }
64
+
65
+ /// Module for serializing Vec<Arc<T>> with transparent Arc handling.
66
+ ///
67
+ /// Serializes a Vec<Arc<T>> as Vec<T> for compatibility, while preserving
68
+ /// Arc semantics for memory efficiency.
69
+ ///
70
+ /// # Arc Sharing Semantics
71
+ ///
72
+ /// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
73
+ /// When deserializing, each element's Arc is independently created with `Arc::new()`.
74
+ /// This is important for `PageContent` where tables/images may be shared across pages.
75
+ ///
76
+ /// Example with shared tables:
77
+ /// ```ignore
78
+ /// let shared_table = Arc::new(Table { /* ... */ });
79
+ /// let page_contents = vec![
80
+ /// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
81
+ /// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
82
+ /// ];
83
+ /// // In-memory: both pages' tables point to the same Arc
84
+ ///
85
+ /// let json = serde_json::to_string(&page_contents)?;
86
+ /// let deserialized = serde_json::from_str::<Vec<PageContent>>(&json)?;
87
+ /// // After deserialization: each page has independent Arc instances,
88
+ /// // even though the table data is identical
89
+ /// ```
90
+ ///
91
+ /// Design rationale:
92
+ /// - JSON has no mechanism to represent shared references
93
+ /// - Preserving sharing would require complex metadata and deduplication
94
+ /// - Current approach is simple, predictable, and maintains compatibility
95
+ /// - In-memory sharing (via Arc) is an implementation detail for the Rust side
96
+ ///
97
+ /// If in-memory sharing is required after deserialization, implement custom
98
+ /// deduplication logic using hashing or content comparison.
99
+ pub mod serde_vec_arc {
100
+ use serde::{Deserialize, Deserializer, Serializer};
101
+ use std::sync::Arc;
102
+
103
+ /// Serialize Vec<Arc<T>> by serializing each T directly.
104
+ ///
105
+ /// Each element is unwrapped from its Arc and serialized independently.
106
+ /// No sharing metadata is included in the serialized output.
107
+ pub fn serialize<S, T>(vec: &[Arc<T>], serializer: S) -> Result<S::Ok, S::Error>
108
+ where
109
+ S: Serializer,
110
+ T: serde::Serialize,
111
+ {
112
+ use serde::ser::SerializeSeq;
113
+ let mut seq = serializer.serialize_seq(Some(vec.len()))?;
114
+ for arc_item in vec {
115
+ seq.serialize_element(&**arc_item)?;
116
+ }
117
+ seq.end()
118
+ }
119
+
120
+ /// Deserialize Vec<T> and wrap each element in Arc.
121
+ ///
122
+ /// Each element is independently wrapped in a new Arc.
123
+ /// Sharing relationships from before serialization are lost.
124
+ pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Vec<Arc<T>>, D::Error>
125
+ where
126
+ D: Deserializer<'de>,
127
+ T: Deserialize<'de>,
128
+ {
129
+ let vec: Vec<T> = Deserialize::deserialize(deserializer)?;
130
+ Ok(vec.into_iter().map(Arc::new).collect())
131
+ }
132
+ }
@@ -0,0 +1,39 @@
1
+ //! Table-related types for document extraction.
2
+
3
+ use serde::{Deserialize, Serialize};
4
+
5
+ /// Extracted table structure.
6
+ ///
7
+ /// Represents a table detected and extracted from a document (PDF, image, etc.).
8
+ /// Tables are converted to both structured cell data and Markdown format.
9
+ #[derive(Debug, Clone, Serialize, Deserialize)]
10
+ pub struct Table {
11
+ /// Table cells as a 2D vector (rows × columns)
12
+ pub cells: Vec<Vec<String>>,
13
+ /// Markdown representation of the table
14
+ pub markdown: String,
15
+ /// Page number where the table was found (1-indexed)
16
+ pub page_number: usize,
17
+ }
18
+
19
+ /// Individual table cell with content and optional styling.
20
+ ///
21
+ /// Future extension point for rich table support with cell-level metadata.
22
+ #[derive(Debug, Clone, Serialize, Deserialize)]
23
+ pub struct TableCell {
24
+ /// Cell content as text
25
+ pub content: String,
26
+ /// Row span (number of rows this cell spans)
27
+ #[serde(default = "default_span")]
28
+ pub row_span: usize,
29
+ /// Column span (number of columns this cell spans)
30
+ #[serde(default = "default_span")]
31
+ pub col_span: usize,
32
+ /// Whether this is a header cell
33
+ #[serde(default)]
34
+ pub is_header: bool,
35
+ }
36
+
37
/// Serde default for `TableCell::row_span` and `TableCell::col_span`:
/// a cell with no explicit span covers exactly one row/column.
fn default_span() -> usize {
    1_usize
}
@@ -0,0 +1,58 @@
1
+ //! Quality heuristics and text analysis
2
+ //!
3
+ //! This module provides heuristic checks for text quality, including
4
+ //! structure analysis and line-level checks.
5
+
6
+ use super::patterns::*;
7
+
8
+ // ============================================================================
9
+ // Structure Thresholds
10
+ // ============================================================================
11
+
12
/// Smallest average words-per-sentence (inclusive) that earns the sentence-length bonus.
const MIN_SENTENCE_WORDS: f64 = 10.0;
/// Largest average words-per-sentence (inclusive) that earns the sentence-length bonus.
const MAX_SENTENCE_WORDS: f64 = 30.0;
/// Smallest average words-per-paragraph (inclusive) that earns the paragraph-length bonus.
const MIN_PARAGRAPH_WORDS: f64 = 50.0;
/// Largest average words-per-paragraph (inclusive) that earns the paragraph-length bonus.
const MAX_PARAGRAPH_WORDS: f64 = 300.0;
16
+
17
+ // ============================================================================
18
+ // Structure Analysis
19
+ // ============================================================================
20
+
21
+ /// Calculate bonus based on text structure quality
22
+ #[inline]
23
+ pub(crate) fn calculate_structure_bonus(text: &str) -> f64 {
24
+ if text.is_empty() {
25
+ return 0.0;
26
+ }
27
+
28
+ let sentence_count = SENTENCE_DETECT.find_iter(text).count() as f64;
29
+ let paragraph_count = text.matches("\n\n").count() as f64 + 1.0;
30
+ let words = text.split_whitespace().count() as f64;
31
+
32
+ if words == 0.0 {
33
+ return 0.0;
34
+ }
35
+
36
+ let avg_words_per_sentence = words / sentence_count.max(1.0);
37
+ let avg_words_per_paragraph = words / paragraph_count.max(1.0);
38
+
39
+ let mut structure_score: f64 = 0.0;
40
+
41
+ if (MIN_SENTENCE_WORDS..=MAX_SENTENCE_WORDS).contains(&avg_words_per_sentence) {
42
+ structure_score += 0.3;
43
+ }
44
+
45
+ if (MIN_PARAGRAPH_WORDS..=MAX_PARAGRAPH_WORDS).contains(&avg_words_per_paragraph) {
46
+ structure_score += 0.3;
47
+ }
48
+
49
+ if paragraph_count > 1.0 {
50
+ structure_score += 0.2;
51
+ }
52
+
53
+ if PUNCTUATION_DETECT.is_match(text) {
54
+ structure_score += 0.2;
55
+ }
56
+
57
+ structure_score.min(1.0)
58
+ }