kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,6 +1,5 @@
1
1
  use serde::{Deserialize, Serialize};
2
- use std::collections::{BTreeMap, HashMap};
3
- use std::sync::Arc;
2
+ use std::collections::HashMap;
4
3
 
5
4
  #[cfg(feature = "pdf")]
6
5
  use crate::pdf::metadata::PdfMetadata;
@@ -8,137 +7,6 @@ use crate::pdf::metadata::PdfMetadata;
8
7
  // ============================================================================
9
8
  // ============================================================================
10
9
 
11
- /// Module providing transparent serde support for Arc<T>.
12
- ///
13
- /// Allows Arc-wrapped types to serialize/deserialize as if unwrapped,
14
- /// maintaining exact JSON format while preserving memory efficiency benefits.
15
- ///
16
- /// # Arc Sharing Semantics
17
- ///
18
- /// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
19
- /// When deserializing, each Arc is independently created with `Arc::new()`.
20
- /// This means that if two Arcs referenced the same data before serialization,
21
- /// they will be separate Arcs after deserialization.
22
- ///
23
- /// Example:
24
- /// ```ignore
25
- /// let shared = Arc::new(Table { /* ... */ });
26
- /// let tables = vec![Arc::clone(&shared), Arc::clone(&shared)];
27
- /// // Both in-memory Arcs point to the same Table
28
- ///
29
- /// let json = serde_json::to_string(&tables)?;
30
- /// let deserialized: Vec<Arc<Table>> = serde_json::from_str(&json)?;
31
- /// // deserialized[0] and deserialized[1] are now independent Arcs,
32
- /// // even though they contain identical data
33
- /// ```
34
- ///
35
- /// This design choice maintains:
36
- /// - Exact JSON format compatibility (no sharing metadata in JSON)
37
- /// - Predictable deserialization behavior
38
- /// - Zero additional serialization overhead
39
- ///
40
- /// If in-memory sharing is required, callers must implement custom sharing logic
41
- /// or use a different data structure (like a HashMap of deduplicated values).
42
- #[allow(dead_code)]
43
- mod serde_arc {
44
- use serde::{Deserialize, Deserializer, Serializer};
45
- use std::sync::Arc;
46
-
47
- /// Serialize an Arc<T> by serializing the inner value directly.
48
- ///
49
- /// This makes Arc<T> serialize identically to T, maintaining API compatibility.
50
- /// The outer Arc wrapper is transparent during serialization.
51
- pub fn serialize<S, T>(arc_value: &Arc<T>, serializer: S) -> Result<S::Ok, S::Error>
52
- where
53
- S: Serializer,
54
- T: serde::Serialize,
55
- {
56
- (**arc_value).serialize(serializer)
57
- }
58
-
59
- /// Deserialize a T and wrap it in Arc.
60
- ///
61
- /// This makes Arc<T> deserialize from the same format as T.
62
- /// Each Arc is independently created during deserialization;
63
- /// Arc sharing from before serialization is NOT preserved.
64
- pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Arc<T>, D::Error>
65
- where
66
- D: Deserializer<'de>,
67
- T: Deserialize<'de>,
68
- {
69
- T::deserialize(deserializer).map(Arc::new)
70
- }
71
- }
72
-
73
- /// Module for serializing Vec<Arc<T>> with transparent Arc handling.
74
- ///
75
- /// Serializes a Vec<Arc<T>> as Vec<T> for compatibility, while preserving
76
- /// Arc semantics for memory efficiency.
77
- ///
78
- /// # Arc Sharing Semantics
79
- ///
80
- /// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
81
- /// When deserializing, each element's Arc is independently created with `Arc::new()`.
82
- /// This is important for `PageContent` where tables/images may be shared across pages.
83
- ///
84
- /// Example with shared tables:
85
- /// ```ignore
86
- /// let shared_table = Arc::new(Table { /* ... */ });
87
- /// let page_contents = vec![
88
- /// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
89
- /// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
90
- /// ];
91
- /// // In-memory: both pages' tables point to the same Arc
92
- ///
93
- /// let json = serde_json::to_string(&page_contents)?;
94
- /// let deserialized = serde_json::from_str::<Vec<PageContent>>(&json)?;
95
- /// // After deserialization: each page has independent Arc instances,
96
- /// // even though the table data is identical
97
- /// ```
98
- ///
99
- /// Design rationale:
100
- /// - JSON has no mechanism to represent shared references
101
- /// - Preserving sharing would require complex metadata and deduplication
102
- /// - Current approach is simple, predictable, and maintains compatibility
103
- /// - In-memory sharing (via Arc) is an implementation detail for the Rust side
104
- ///
105
- /// If in-memory sharing is required after deserialization, implement custom
106
- /// deduplication logic using hashing or content comparison.
107
- mod serde_vec_arc {
108
- use serde::{Deserialize, Deserializer, Serializer};
109
- use std::sync::Arc;
110
-
111
- /// Serialize Vec<Arc<T>> by serializing each T directly.
112
- ///
113
- /// Each element is unwrapped from its Arc and serialized independently.
114
- /// No sharing metadata is included in the serialized output.
115
- pub fn serialize<S, T>(vec: &[Arc<T>], serializer: S) -> Result<S::Ok, S::Error>
116
- where
117
- S: Serializer,
118
- T: serde::Serialize,
119
- {
120
- use serde::ser::SerializeSeq;
121
- let mut seq = serializer.serialize_seq(Some(vec.len()))?;
122
- for arc_item in vec {
123
- seq.serialize_element(&**arc_item)?;
124
- }
125
- seq.end()
126
- }
127
-
128
- /// Deserialize Vec<T> and wrap each element in Arc.
129
- ///
130
- /// Each element is independently wrapped in a new Arc.
131
- /// Sharing relationships from before serialization are lost.
132
- pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Vec<Arc<T>>, D::Error>
133
- where
134
- D: Deserializer<'de>,
135
- T: Deserialize<'de>,
136
- {
137
- let vec: Vec<T> = Deserialize::deserialize(deserializer)?;
138
- Ok(vec.into_iter().map(Arc::new).collect())
139
- }
140
- }
141
-
142
10
  /// General extraction result used by the core extraction API.
143
11
  ///
144
12
  /// This is the main result type returned by all extraction functions.
@@ -166,13 +34,6 @@ pub struct ExtractionResult {
166
34
  /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
167
35
  #[serde(skip_serializing_if = "Option::is_none")]
168
36
  pub images: Option<Vec<ExtractedImage>>,
169
-
170
- /// Per-page content when page extraction is enabled.
171
- ///
172
- /// When page extraction is configured, the document is split into per-page content
173
- /// with tables and images mapped to their respective pages.
174
- #[serde(skip_serializing_if = "Option::is_none")]
175
- pub pages: Option<Vec<PageContent>>,
176
37
  }
177
38
 
178
39
  /// Format-specific metadata (discriminated union).
@@ -201,45 +62,17 @@ pub enum FormatMetadata {
201
62
  /// via a discriminated union, and additional custom fields from postprocessors.
202
63
  #[derive(Debug, Clone, Serialize, Deserialize, Default)]
203
64
  pub struct Metadata {
204
- /// Document title
205
- #[serde(skip_serializing_if = "Option::is_none")]
206
- pub title: Option<String>,
207
-
208
- /// Document subject or description
209
- #[serde(skip_serializing_if = "Option::is_none")]
210
- pub subject: Option<String>,
211
-
212
- /// Primary author(s) - always Vec for consistency
213
- #[serde(skip_serializing_if = "Option::is_none")]
214
- pub authors: Option<Vec<String>>,
215
-
216
- /// Keywords/tags - always Vec for consistency
217
- #[serde(skip_serializing_if = "Option::is_none")]
218
- pub keywords: Option<Vec<String>>,
219
-
220
- /// Primary language (ISO 639 code)
65
+ /// Language of the document (ISO 639 code)
221
66
  #[serde(skip_serializing_if = "Option::is_none")]
222
67
  pub language: Option<String>,
223
68
 
224
- /// Creation timestamp (ISO 8601 format)
225
- #[serde(skip_serializing_if = "Option::is_none")]
226
- pub created_at: Option<String>,
227
-
228
- /// Last modification timestamp (ISO 8601 format)
69
+ /// Document date (format varies by source)
229
70
  #[serde(skip_serializing_if = "Option::is_none")]
230
- pub modified_at: Option<String>,
231
-
232
- /// User who created the document
233
- #[serde(skip_serializing_if = "Option::is_none")]
234
- pub created_by: Option<String>,
235
-
236
- /// User who last modified the document
237
- #[serde(skip_serializing_if = "Option::is_none")]
238
- pub modified_by: Option<String>,
71
+ pub date: Option<String>,
239
72
 
240
- /// Page/slide/sheet structure with boundaries
73
+ /// Document subject/description
241
74
  #[serde(skip_serializing_if = "Option::is_none")]
242
- pub pages: Option<PageStructure>,
75
+ pub subject: Option<String>,
243
76
 
244
77
  /// Format-specific metadata (discriminated union)
245
78
  ///
@@ -269,177 +102,6 @@ pub struct Metadata {
269
102
  pub additional: HashMap<String, serde_json::Value>,
270
103
  }
271
104
 
272
- /// Unified page structure for documents.
273
- ///
274
- /// Supports different page types (PDF pages, PPTX slides, Excel sheets)
275
- /// with character offset boundaries for chunk-to-page mapping.
276
- #[derive(Debug, Clone, Serialize, Deserialize)]
277
- pub struct PageStructure {
278
- /// Total number of pages/slides/sheets
279
- pub total_count: usize,
280
-
281
- /// Type of paginated unit
282
- pub unit_type: PageUnitType,
283
-
284
- /// Character offset boundaries for each page
285
- ///
286
- /// Maps character ranges in the extracted content to page numbers.
287
- /// Used for chunk page range calculation.
288
- #[serde(skip_serializing_if = "Option::is_none")]
289
- pub boundaries: Option<Vec<PageBoundary>>,
290
-
291
- /// Detailed per-page metadata (optional, only when needed)
292
- #[serde(skip_serializing_if = "Option::is_none")]
293
- pub pages: Option<Vec<PageInfo>>,
294
- }
295
-
296
- /// Type of paginated unit in a document.
297
- ///
298
- /// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
299
- #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
300
- #[serde(rename_all = "snake_case")]
301
- pub enum PageUnitType {
302
- /// Standard document pages (PDF, DOCX, images)
303
- Page,
304
- /// Presentation slides (PPTX, ODP)
305
- Slide,
306
- /// Spreadsheet sheets (XLSX, ODS)
307
- Sheet,
308
- }
309
-
310
- /// Byte offset boundary for a page.
311
- ///
312
- /// Tracks where a specific page's content starts and ends in the main content string,
313
- /// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
314
- /// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
315
- #[derive(Debug, Clone, Serialize, Deserialize)]
316
- pub struct PageBoundary {
317
- /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
318
- pub byte_start: usize,
319
- /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
320
- pub byte_end: usize,
321
- /// Page number (1-indexed)
322
- pub page_number: usize,
323
- }
324
-
325
- /// Metadata for individual page/slide/sheet.
326
- ///
327
- /// Captures per-page information including dimensions, content counts,
328
- /// and visibility state (for presentations).
329
- #[derive(Debug, Clone, Serialize, Deserialize)]
330
- pub struct PageInfo {
331
- /// Page number (1-indexed)
332
- pub number: usize,
333
-
334
- /// Page title (usually for presentations)
335
- #[serde(skip_serializing_if = "Option::is_none")]
336
- pub title: Option<String>,
337
-
338
- /// Dimensions in points (PDF) or pixels (images): (width, height)
339
- #[serde(skip_serializing_if = "Option::is_none")]
340
- pub dimensions: Option<(f64, f64)>,
341
-
342
- /// Number of images on this page
343
- #[serde(skip_serializing_if = "Option::is_none")]
344
- pub image_count: Option<usize>,
345
-
346
- /// Number of tables on this page
347
- #[serde(skip_serializing_if = "Option::is_none")]
348
- pub table_count: Option<usize>,
349
-
350
- /// Whether this page is hidden (e.g., in presentations)
351
- #[serde(skip_serializing_if = "Option::is_none")]
352
- pub hidden: Option<bool>,
353
- }
354
-
355
- /// Content for a single page/slide.
356
- ///
357
- /// When page extraction is enabled, documents are split into per-page content
358
- /// with associated tables and images mapped to each page.
359
- ///
360
- /// # Performance
361
- ///
362
- /// Uses Arc-wrapped tables and images for memory efficiency:
363
- /// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
364
- /// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
365
- /// - Maintains exact JSON compatibility via custom Serialize/Deserialize
366
- ///
367
- /// This reduces memory overhead for documents with shared tables/images
368
- /// by avoiding redundant copies during serialization.
369
- #[derive(Debug, Clone, Serialize, Deserialize)]
370
- pub struct PageContent {
371
- /// Page number (1-indexed)
372
- pub page_number: usize,
373
-
374
- /// Text content for this page
375
- pub content: String,
376
-
377
- /// Tables found on this page (uses Arc for memory efficiency)
378
- ///
379
- /// Serializes as Vec<Table> for JSON compatibility while maintaining
380
- /// Arc semantics in-memory for zero-copy sharing.
381
- #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
382
- pub tables: Vec<Arc<Table>>,
383
-
384
- /// Images found on this page (uses Arc for memory efficiency)
385
- ///
386
- /// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
387
- /// Arc semantics in-memory for zero-copy sharing.
388
- #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
389
- pub images: Vec<Arc<ExtractedImage>>,
390
-
391
- /// Hierarchy information for the page (when hierarchy extraction is enabled)
392
- ///
393
- /// Contains text hierarchy levels (H1-H6) extracted from the page content.
394
- #[serde(skip_serializing_if = "Option::is_none")]
395
- pub hierarchy: Option<PageHierarchy>,
396
- }
397
-
398
- /// Page hierarchy structure containing heading levels and block information.
399
- ///
400
- /// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
401
- /// blocks with heading levels (H1-H6) for semantic document structure.
402
- #[derive(Debug, Clone, Serialize, Deserialize)]
403
- pub struct PageHierarchy {
404
- /// Number of hierarchy blocks on this page
405
- pub block_count: usize,
406
-
407
- /// Hierarchical blocks with heading levels
408
- #[serde(skip_serializing_if = "Vec::is_empty", default)]
409
- pub blocks: Vec<HierarchicalBlock>,
410
- }
411
-
412
- /// A text block with hierarchy level assignment.
413
- ///
414
- /// Represents a block of text with semantic heading information extracted from
415
- /// font size clustering and hierarchical analysis.
416
- #[derive(Debug, Clone, Serialize, Deserialize)]
417
- pub struct HierarchicalBlock {
418
- /// The text content of this block
419
- pub text: String,
420
-
421
- /// The font size of the text in this block
422
- pub font_size: f32,
423
-
424
- /// The hierarchy level of this block (H1-H6 or Body)
425
- ///
426
- /// Levels correspond to HTML heading tags:
427
- /// - "h1": Top-level heading
428
- /// - "h2": Secondary heading
429
- /// - "h3": Tertiary heading
430
- /// - "h4": Quaternary heading
431
- /// - "h5": Quinary heading
432
- /// - "h6": Senary heading
433
- /// - "body": Body text (no heading level)
434
- pub level: String,
435
-
436
- /// Bounding box information for the block
437
- ///
438
- /// Contains coordinates as (left, top, right, bottom) in PDF units.
439
- #[serde(skip_serializing_if = "Option::is_none")]
440
- pub bbox: Option<(f32, f32, f32, f32)>,
441
- }
442
-
443
105
  /// Excel/spreadsheet metadata.
444
106
  ///
445
107
  /// Contains information about sheets in Excel, LibreOffice Calc, and other
@@ -551,308 +213,73 @@ pub struct TextMetadata {
551
213
  pub code_blocks: Option<Vec<(String, String)>>,
552
214
  }
553
215
 
554
- /// Text direction enumeration for HTML documents.
555
- #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
556
- #[serde(rename_all = "lowercase")]
557
- pub enum TextDirection {
558
- /// Left-to-right text direction
559
- #[serde(rename = "ltr")]
560
- LeftToRight,
561
- /// Right-to-left text direction
562
- #[serde(rename = "rtl")]
563
- RightToLeft,
564
- /// Automatic text direction detection
565
- #[serde(rename = "auto")]
566
- Auto,
567
- }
568
-
569
- /// Header/heading element metadata.
570
- #[derive(Debug, Clone, Serialize, Deserialize)]
571
- pub struct HeaderMetadata {
572
- /// Header level: 1 (h1) through 6 (h6)
573
- pub level: u8,
574
- /// Normalized text content of the header
575
- pub text: String,
576
- /// HTML id attribute if present
577
- #[serde(skip_serializing_if = "Option::is_none")]
578
- pub id: Option<String>,
579
- /// Document tree depth at the header element
580
- pub depth: usize,
581
- /// Byte offset in original HTML document
582
- pub html_offset: usize,
583
- }
584
-
585
- /// Link element metadata.
586
- #[derive(Debug, Clone, Serialize, Deserialize)]
587
- pub struct LinkMetadata {
588
- /// The href URL value
589
- pub href: String,
590
- /// Link text content (normalized)
591
- pub text: String,
592
- /// Optional title attribute
593
- #[serde(skip_serializing_if = "Option::is_none")]
594
- pub title: Option<String>,
595
- /// Link type classification
596
- pub link_type: LinkType,
597
- /// Rel attribute values
598
- pub rel: Vec<String>,
599
- /// Additional attributes as key-value pairs
600
- pub attributes: HashMap<String, String>,
601
- }
602
-
603
- /// Link type classification.
604
- #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
605
- #[serde(rename_all = "lowercase")]
606
- pub enum LinkType {
607
- /// Anchor link (#section)
608
- Anchor,
609
- /// Internal link (same domain)
610
- Internal,
611
- /// External link (different domain)
612
- External,
613
- /// Email link (mailto:)
614
- Email,
615
- /// Phone link (tel:)
616
- Phone,
617
- /// Other link type
618
- Other,
619
- }
620
-
621
- /// Image element metadata.
622
- #[derive(Debug, Clone, Serialize, Deserialize)]
623
- pub struct ImageMetadataType {
624
- /// Image source (URL, data URI, or SVG content)
625
- pub src: String,
626
- /// Alternative text from alt attribute
627
- #[serde(skip_serializing_if = "Option::is_none")]
628
- pub alt: Option<String>,
629
- /// Title attribute
630
- #[serde(skip_serializing_if = "Option::is_none")]
631
- pub title: Option<String>,
632
- /// Image dimensions as (width, height) if available
633
- pub dimensions: Option<(u32, u32)>,
634
- /// Image type classification
635
- pub image_type: ImageType,
636
- /// Additional attributes as key-value pairs
637
- pub attributes: HashMap<String, String>,
638
- }
639
-
640
- /// Image type classification.
641
- #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
642
- #[serde(rename_all = "lowercase")]
643
- pub enum ImageType {
644
- /// Data URI image
645
- #[serde(rename = "data-uri")]
646
- DataUri,
647
- /// Inline SVG
648
- #[serde(rename = "inline-svg")]
649
- InlineSvg,
650
- /// External image URL
651
- External,
652
- /// Relative path image
653
- Relative,
654
- }
655
-
656
- /// Structured data (Schema.org, microdata, RDFa) block.
657
- #[derive(Debug, Clone, Serialize, Deserialize)]
658
- pub struct StructuredData {
659
- /// Type of structured data
660
- pub data_type: StructuredDataType,
661
- /// Raw JSON string representation
662
- pub raw_json: String,
663
- /// Schema type if detectable (e.g., "Article", "Event", "Product")
664
- #[serde(skip_serializing_if = "Option::is_none")]
665
- pub schema_type: Option<String>,
666
- }
667
-
668
- /// Structured data type classification.
669
- #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
670
- #[serde(rename_all = "lowercase")]
671
- pub enum StructuredDataType {
672
- /// JSON-LD structured data
673
- #[serde(rename = "json-ld")]
674
- JsonLd,
675
- /// Microdata
676
- Microdata,
677
- /// RDFa
678
- #[serde(rename = "rdfa")]
679
- RDFa,
680
- }
681
-
682
216
  /// HTML metadata extracted from HTML documents.
683
217
  ///
684
- /// Includes document-level metadata, Open Graph data, Twitter Card metadata,
685
- /// and extracted structural elements (headers, links, images, structured data).
218
+ /// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations.
686
219
  #[derive(Debug, Clone, Serialize, Deserialize, Default)]
687
220
  pub struct HtmlMetadata {
688
- /// Document title from `<title>` tag
689
221
  #[serde(skip_serializing_if = "Option::is_none")]
690
222
  pub title: Option<String>,
691
223
 
692
- /// Document description from `<meta name="description">` tag
693
224
  #[serde(skip_serializing_if = "Option::is_none")]
694
225
  pub description: Option<String>,
695
226
 
696
- /// Document keywords from `<meta name="keywords">` tag, split on commas
697
- #[serde(default)]
698
- pub keywords: Vec<String>,
227
+ #[serde(skip_serializing_if = "Option::is_none")]
228
+ pub keywords: Option<String>,
699
229
 
700
- /// Document author from `<meta name="author">` tag
701
230
  #[serde(skip_serializing_if = "Option::is_none")]
702
231
  pub author: Option<String>,
703
232
 
704
- /// Canonical URL from `<link rel="canonical">` tag
705
233
  #[serde(skip_serializing_if = "Option::is_none")]
706
- pub canonical_url: Option<String>,
234
+ pub canonical: Option<String>,
707
235
 
708
- /// Base URL from `<base href="">` tag for resolving relative URLs
709
236
  #[serde(skip_serializing_if = "Option::is_none")]
710
237
  pub base_href: Option<String>,
711
238
 
712
- /// Document language from `lang` attribute
713
239
  #[serde(skip_serializing_if = "Option::is_none")]
714
- pub language: Option<String>,
240
+ pub og_title: Option<String>,
715
241
 
716
- /// Document text direction from `dir` attribute
717
242
  #[serde(skip_serializing_if = "Option::is_none")]
718
- pub text_direction: Option<TextDirection>,
243
+ pub og_description: Option<String>,
719
244
 
720
- /// Open Graph metadata (og:* properties) for social media
721
- /// Keys like "title", "description", "image", "url", etc.
722
- #[serde(default)]
723
- pub open_graph: BTreeMap<String, String>,
245
+ #[serde(skip_serializing_if = "Option::is_none")]
246
+ pub og_image: Option<String>,
724
247
 
725
- /// Twitter Card metadata (twitter:* properties)
726
- /// Keys like "card", "site", "creator", "title", "description", "image", etc.
727
- #[serde(default)]
728
- pub twitter_card: BTreeMap<String, String>,
248
+ #[serde(skip_serializing_if = "Option::is_none")]
249
+ pub og_url: Option<String>,
729
250
 
730
- /// Additional meta tags not covered by specific fields
731
- /// Keys are meta name/property attributes, values are content
732
- #[serde(default)]
733
- pub meta_tags: BTreeMap<String, String>,
251
+ #[serde(skip_serializing_if = "Option::is_none")]
252
+ pub og_type: Option<String>,
734
253
 
735
- /// Extracted header elements with hierarchy
736
- #[serde(default)]
737
- pub headers: Vec<HeaderMetadata>,
254
+ #[serde(skip_serializing_if = "Option::is_none")]
255
+ pub og_site_name: Option<String>,
738
256
 
739
- /// Extracted hyperlinks with type classification
740
- #[serde(default)]
741
- pub links: Vec<LinkMetadata>,
257
+ #[serde(skip_serializing_if = "Option::is_none")]
258
+ pub twitter_card: Option<String>,
742
259
 
743
- /// Extracted images with source and dimensions
744
- #[serde(default)]
745
- pub images: Vec<ImageMetadataType>,
260
+ #[serde(skip_serializing_if = "Option::is_none")]
261
+ pub twitter_title: Option<String>,
746
262
 
747
- /// Extracted structured data blocks
748
- #[serde(default)]
749
- pub structured_data: Vec<StructuredData>,
750
- }
263
+ #[serde(skip_serializing_if = "Option::is_none")]
264
+ pub twitter_description: Option<String>,
751
265
 
752
- impl HtmlMetadata {
753
- /// Check if metadata is empty (no meaningful content extracted).
754
- pub fn is_empty(&self) -> bool {
755
- self.title.is_none()
756
- && self.description.is_none()
757
- && self.keywords.is_empty()
758
- && self.author.is_none()
759
- && self.canonical_url.is_none()
760
- && self.base_href.is_none()
761
- && self.language.is_none()
762
- && self.text_direction.is_none()
763
- && self.open_graph.is_empty()
764
- && self.twitter_card.is_empty()
765
- && self.meta_tags.is_empty()
766
- && self.headers.is_empty()
767
- && self.links.is_empty()
768
- && self.images.is_empty()
769
- && self.structured_data.is_empty()
770
- }
771
- }
266
+ #[serde(skip_serializing_if = "Option::is_none")]
267
+ pub twitter_image: Option<String>,
772
268
 
773
- #[cfg(feature = "html")]
774
- impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
775
- fn from(metadata: html_to_markdown_rs::ExtendedMetadata) -> Self {
776
- let text_dir = metadata.document.text_direction.map(|td| match td {
777
- html_to_markdown_rs::TextDirection::LeftToRight => TextDirection::LeftToRight,
778
- html_to_markdown_rs::TextDirection::RightToLeft => TextDirection::RightToLeft,
779
- html_to_markdown_rs::TextDirection::Auto => TextDirection::Auto,
780
- });
781
-
782
- HtmlMetadata {
783
- title: metadata.document.title,
784
- description: metadata.document.description,
785
- keywords: metadata.document.keywords,
786
- author: metadata.document.author,
787
- canonical_url: metadata.document.canonical_url,
788
- base_href: metadata.document.base_href,
789
- language: metadata.document.language,
790
- text_direction: text_dir,
791
- open_graph: metadata.document.open_graph,
792
- twitter_card: metadata.document.twitter_card,
793
- meta_tags: metadata.document.meta_tags,
794
- headers: metadata
795
- .headers
796
- .into_iter()
797
- .map(|h| HeaderMetadata {
798
- level: h.level,
799
- text: h.text,
800
- id: h.id,
801
- depth: h.depth,
802
- html_offset: h.html_offset,
803
- })
804
- .collect(),
805
- links: metadata
806
- .links
807
- .into_iter()
808
- .map(|l| LinkMetadata {
809
- href: l.href,
810
- text: l.text,
811
- title: l.title,
812
- link_type: match l.link_type {
813
- html_to_markdown_rs::LinkType::Anchor => LinkType::Anchor,
814
- html_to_markdown_rs::LinkType::Internal => LinkType::Internal,
815
- html_to_markdown_rs::LinkType::External => LinkType::External,
816
- html_to_markdown_rs::LinkType::Email => LinkType::Email,
817
- html_to_markdown_rs::LinkType::Phone => LinkType::Phone,
818
- html_to_markdown_rs::LinkType::Other => LinkType::Other,
819
- },
820
- rel: l.rel,
821
- attributes: l.attributes.into_iter().collect(),
822
- })
823
- .collect(),
824
- images: metadata
825
- .images
826
- .into_iter()
827
- .map(|img| ImageMetadataType {
828
- src: img.src,
829
- alt: img.alt,
830
- title: img.title,
831
- dimensions: img.dimensions,
832
- image_type: match img.image_type {
833
- html_to_markdown_rs::ImageType::DataUri => ImageType::DataUri,
834
- html_to_markdown_rs::ImageType::InlineSvg => ImageType::InlineSvg,
835
- html_to_markdown_rs::ImageType::External => ImageType::External,
836
- html_to_markdown_rs::ImageType::Relative => ImageType::Relative,
837
- },
838
- attributes: img.attributes.into_iter().collect(),
839
- })
840
- .collect(),
841
- structured_data: metadata
842
- .structured_data
843
- .into_iter()
844
- .map(|sd| StructuredData {
845
- data_type: match sd.data_type {
846
- html_to_markdown_rs::StructuredDataType::JsonLd => StructuredDataType::JsonLd,
847
- html_to_markdown_rs::StructuredDataType::Microdata => StructuredDataType::Microdata,
848
- html_to_markdown_rs::StructuredDataType::RDFa => StructuredDataType::RDFa,
849
- },
850
- raw_json: sd.raw_json,
851
- schema_type: sd.schema_type,
852
- })
853
- .collect(),
854
- }
855
- }
269
+ #[serde(skip_serializing_if = "Option::is_none")]
270
+ pub twitter_site: Option<String>,
271
+
272
+ #[serde(skip_serializing_if = "Option::is_none")]
273
+ pub twitter_creator: Option<String>,
274
+
275
+ #[serde(skip_serializing_if = "Option::is_none")]
276
+ pub link_author: Option<String>,
277
+
278
+ #[serde(skip_serializing_if = "Option::is_none")]
279
+ pub link_license: Option<String>,
280
+
281
+ #[serde(skip_serializing_if = "Option::is_none")]
282
+ pub link_alternate: Option<String>,
856
283
  }
857
284
 
858
285
  /// OCR processing metadata.
@@ -921,11 +348,11 @@ pub struct Chunk {
921
348
  /// Metadata about a chunk's position in the original document.
922
349
  #[derive(Debug, Clone, Serialize, Deserialize)]
923
350
  pub struct ChunkMetadata {
924
- /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
925
- pub byte_start: usize,
351
+ /// Character offset where this chunk starts in the original text.
352
+ pub char_start: usize,
926
353
 
927
- /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
928
- pub byte_end: usize,
354
+ /// Character offset where this chunk ends in the original text.
355
+ pub char_end: usize,
929
356
 
930
357
  /// Number of tokens in this chunk (if available).
931
358
  ///
@@ -938,18 +365,6 @@ pub struct ChunkMetadata {
938
365
 
939
366
  /// Total number of chunks in the document.
940
367
  pub total_chunks: usize,
941
-
942
- /// First page number this chunk spans (1-indexed).
943
- ///
944
- /// Only populated when page tracking is enabled in extraction configuration.
945
- #[serde(skip_serializing_if = "Option::is_none")]
946
- pub first_page: Option<usize>,
947
-
948
- /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
949
- ///
950
- /// Only populated when page tracking is enabled in extraction configuration.
951
- #[serde(skip_serializing_if = "Option::is_none")]
952
- pub last_page: Option<usize>,
953
368
  }
954
369
 
955
370
  /// Extracted image from a document.
@@ -1032,11 +447,6 @@ pub struct ExcelSheet {
1032
447
  pub col_count: usize,
1033
448
  /// Total number of non-empty cells
1034
449
  pub cell_count: usize,
1035
- /// Pre-extracted table cells (2D vector of cell values)
1036
- /// Populated during markdown generation to avoid re-parsing markdown.
1037
- /// None for empty sheets.
1038
- #[serde(skip)]
1039
- pub table_cells: Option<Vec<Vec<String>>>,
1040
450
  }
1041
451
 
1042
452
  /// XML extraction result.
@@ -1095,22 +505,22 @@ pub struct PptxExtractionResult {
1095
505
  pub table_count: usize,
1096
506
  /// Extracted images from the presentation
1097
507
  pub images: Vec<ExtractedImage>,
1098
- /// Slide structure with boundaries (when page tracking is enabled)
1099
- #[serde(skip_serializing_if = "Option::is_none")]
1100
- pub page_structure: Option<PageStructure>,
1101
- /// Per-slide content (when page tracking is enabled)
1102
- #[serde(skip_serializing_if = "Option::is_none")]
1103
- pub page_contents: Option<Vec<PageContent>>,
1104
508
  }
1105
509
 
1106
510
  /// PowerPoint presentation metadata.
1107
511
  ///
1108
- /// Contains PPTX-specific metadata. Common fields like title, author, and description
1109
- /// are now in the base `Metadata` struct.
512
+ /// Contains document-level metadata extracted from the PPTX file.
1110
513
  #[derive(Debug, Clone, Serialize, Deserialize)]
1111
514
  pub struct PptxMetadata {
515
+ /// Presentation title
516
+ pub title: Option<String>,
517
+ /// Author name
518
+ pub author: Option<String>,
519
+ /// Description/comments
520
+ pub description: Option<String>,
521
+ /// Summary text
522
+ pub summary: Option<String>,
1112
523
  /// List of fonts used in the presentation
1113
- #[serde(skip_serializing_if = "Vec::is_empty", default)]
1114
524
  pub fonts: Vec<String>,
1115
525
  }
1116
526
 
@@ -1434,6 +844,18 @@ pub struct CacheStats {
1434
844
  pub newest_file_age_days: f64,
1435
845
  }
1436
846
 
847
+ /// Pandoc extraction result.
848
+ ///
849
+ /// Result of extracting content from a document using Pandoc,
850
+ /// including text and any metadata Pandoc was able to extract.
851
+ #[derive(Debug, Clone, Serialize, Deserialize)]
852
+ pub struct PandocExtractionResult {
853
+ /// Extracted text content
854
+ pub content: String,
855
+ /// Metadata extracted by Pandoc (varies by format)
856
+ pub metadata: HashMap<String, serde_json::Value>,
857
+ }
858
+
1437
859
  /// LibreOffice conversion result.
1438
860
  ///
1439
861
  /// Result of converting a legacy office document (e.g., .doc, .ppt)
@@ -1449,265 +871,3 @@ pub struct LibreOfficeConversionResult {
1449
871
  /// Target MIME type after conversion
1450
872
  pub target_mime: String,
1451
873
  }
1452
-
1453
- #[cfg(test)]
1454
- mod tests {
1455
- use super::*;
1456
-
1457
- #[test]
1458
- fn test_metadata_serialization_with_format() {
1459
- let mut metadata = Metadata {
1460
- format: Some(FormatMetadata::Text(TextMetadata {
1461
- line_count: 1,
1462
- word_count: 2,
1463
- character_count: 13,
1464
- headers: None,
1465
- links: None,
1466
- code_blocks: None,
1467
- })),
1468
- ..Default::default()
1469
- };
1470
-
1471
- metadata
1472
- .additional
1473
- .insert("quality_score".to_string(), serde_json::json!(1.0));
1474
-
1475
- let json = serde_json::to_value(&metadata).unwrap();
1476
- println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
1477
-
1478
- assert!(
1479
- json.get("format_type").is_some(),
1480
- "format_type should be present in serialized JSON"
1481
- );
1482
- assert_eq!(json.get("format_type").unwrap(), "text");
1483
-
1484
- assert_eq!(json.get("line_count").unwrap(), 1);
1485
- assert_eq!(json.get("word_count").unwrap(), 2);
1486
- assert_eq!(json.get("character_count").unwrap(), 13);
1487
-
1488
- assert_eq!(json.get("quality_score").unwrap(), 1.0);
1489
- }
1490
-
1491
- #[test]
1492
- fn test_arc_table_serialization_format() {
1493
- let table = Table {
1494
- cells: vec![vec!["A".to_string(), "B".to_string()]],
1495
- markdown: "| A | B |\n|---|---|\n".to_string(),
1496
- page_number: 1,
1497
- };
1498
-
1499
- let json = serde_json::to_value(&table).unwrap();
1500
-
1501
- assert_eq!(json.get("cells").unwrap()[0][0], "A");
1502
- assert_eq!(json.get("markdown").unwrap(), "| A | B |\n|---|---|\n");
1503
- assert_eq!(json.get("page_number").unwrap(), 1);
1504
- }
1505
-
1506
- #[test]
1507
- fn test_arc_table_roundtrip() {
1508
- let original = Table {
1509
- cells: vec![
1510
- vec!["X".to_string(), "Y".to_string()],
1511
- vec!["1".to_string(), "2".to_string()],
1512
- ],
1513
- markdown: "| X | Y |\n|---|---|\n| 1 | 2 |\n".to_string(),
1514
- page_number: 5,
1515
- };
1516
-
1517
- let json = serde_json::to_string(&original).unwrap();
1518
- let deserialized: Table = serde_json::from_str(&json).unwrap();
1519
-
1520
- assert_eq!(deserialized.cells, original.cells);
1521
- assert_eq!(deserialized.markdown, original.markdown);
1522
- assert_eq!(deserialized.page_number, original.page_number);
1523
- }
1524
-
1525
- #[test]
1526
- fn test_arc_sharing_preserved_before_serialization() {
1527
- let shared_table = Arc::new(Table {
1528
- cells: vec![vec!["shared".to_string()]],
1529
- markdown: "| shared |".to_string(),
1530
- page_number: 1,
1531
- });
1532
-
1533
- let tables_before = [Arc::clone(&shared_table), Arc::clone(&shared_table)].to_vec();
1534
- assert_eq!(Arc::strong_count(&tables_before[0]), 3);
1535
- assert_eq!(Arc::strong_count(&tables_before[1]), 3);
1536
- assert!(Arc::ptr_eq(&tables_before[0], &tables_before[1]));
1537
- }
1538
-
1539
- #[test]
1540
- fn test_vec_arc_table_serialization_format() {
1541
- let tables = vec![
1542
- Table {
1543
- cells: vec![vec!["A".to_string()]],
1544
- markdown: "| A |".to_string(),
1545
- page_number: 1,
1546
- },
1547
- Table {
1548
- cells: vec![vec!["B".to_string()]],
1549
- markdown: "| B |".to_string(),
1550
- page_number: 2,
1551
- },
1552
- ];
1553
-
1554
- let json = serde_json::to_string(&tables).unwrap();
1555
- let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
1556
-
1557
- assert!(parsed.is_array());
1558
- assert_eq!(parsed.as_array().unwrap().len(), 2);
1559
- assert_eq!(parsed[0]["cells"][0][0], "A");
1560
- assert_eq!(parsed[1]["cells"][0][0], "B");
1561
- }
1562
-
1563
- #[test]
1564
- fn test_page_content_arc_tables_roundtrip() {
1565
- let page = PageContent {
1566
- page_number: 3,
1567
- content: "Page 3 content".to_string(),
1568
- tables: vec![
1569
- Arc::new(Table {
1570
- cells: vec![vec!["Table1".to_string()]],
1571
- markdown: "| Table1 |".to_string(),
1572
- page_number: 3,
1573
- }),
1574
- Arc::new(Table {
1575
- cells: vec![vec!["Table2".to_string()]],
1576
- markdown: "| Table2 |".to_string(),
1577
- page_number: 3,
1578
- }),
1579
- ],
1580
- images: Vec::new(),
1581
- hierarchy: None,
1582
- };
1583
-
1584
- let json = serde_json::to_string(&page).unwrap();
1585
- let deserialized: PageContent = serde_json::from_str(&json).unwrap();
1586
-
1587
- assert_eq!(deserialized.page_number, 3);
1588
- assert_eq!(deserialized.content, "Page 3 content");
1589
- assert_eq!(deserialized.tables.len(), 2);
1590
- assert_eq!(deserialized.tables[0].cells[0][0], "Table1");
1591
- assert_eq!(deserialized.tables[1].cells[0][0], "Table2");
1592
- }
1593
-
1594
- #[test]
1595
- fn test_page_content_arc_images_roundtrip() {
1596
- let image1 = Arc::new(ExtractedImage {
1597
- data: vec![0xFF, 0xD8, 0xFF],
1598
- format: "jpeg".to_string(),
1599
- image_index: 0,
1600
- page_number: Some(1),
1601
- width: Some(100),
1602
- height: Some(200),
1603
- colorspace: Some("RGB".to_string()),
1604
- bits_per_component: Some(8),
1605
- is_mask: false,
1606
- description: Some("Image 1".to_string()),
1607
- ocr_result: None,
1608
- });
1609
-
1610
- let image2 = Arc::new(ExtractedImage {
1611
- data: vec![0x89, 0x50, 0x4E],
1612
- format: "png".to_string(),
1613
- image_index: 1,
1614
- page_number: Some(1),
1615
- width: Some(300),
1616
- height: Some(400),
1617
- colorspace: Some("RGBA".to_string()),
1618
- bits_per_component: Some(8),
1619
- is_mask: false,
1620
- description: Some("Image 2".to_string()),
1621
- ocr_result: None,
1622
- });
1623
-
1624
- let page = PageContent {
1625
- page_number: 1,
1626
- content: "Page with images".to_string(),
1627
- tables: Vec::new(),
1628
- images: vec![image1, image2],
1629
- hierarchy: None,
1630
- };
1631
-
1632
- let json = serde_json::to_string(&page).unwrap();
1633
- let deserialized: PageContent = serde_json::from_str(&json).unwrap();
1634
-
1635
- assert_eq!(deserialized.images.len(), 2);
1636
- assert_eq!(deserialized.images[0].format, "jpeg");
1637
- assert_eq!(deserialized.images[0].width, Some(100));
1638
- assert_eq!(deserialized.images[1].format, "png");
1639
- assert_eq!(deserialized.images[1].height, Some(400));
1640
- }
1641
-
1642
- #[test]
1643
- fn test_arc_sharing_loss_with_page_content() {
1644
- let shared_table = Arc::new(Table {
1645
- cells: vec![vec!["shared across pages".to_string()]],
1646
- markdown: "| shared across pages |".to_string(),
1647
- page_number: 0,
1648
- });
1649
-
1650
- let page1 = PageContent {
1651
- page_number: 1,
1652
- content: "Page 1".to_string(),
1653
- tables: vec![Arc::clone(&shared_table)],
1654
- images: Vec::new(),
1655
- hierarchy: None,
1656
- };
1657
-
1658
- let page2 = PageContent {
1659
- page_number: 2,
1660
- content: "Page 2".to_string(),
1661
- tables: vec![Arc::clone(&shared_table)],
1662
- images: Vec::new(),
1663
- hierarchy: None,
1664
- };
1665
-
1666
- assert!(Arc::ptr_eq(&page1.tables[0], &page2.tables[0]));
1667
-
1668
- let pages = vec![page1, page2];
1669
- let json = serde_json::to_string(&pages).unwrap();
1670
- let deserialized: Vec<PageContent> = serde_json::from_str(&json).unwrap();
1671
-
1672
- assert_eq!(deserialized.len(), 2);
1673
- assert_eq!(deserialized[0].tables[0].cells, deserialized[1].tables[0].cells);
1674
- assert!(!Arc::ptr_eq(&deserialized[0].tables[0], &deserialized[1].tables[0]));
1675
- }
1676
-
1677
- #[test]
1678
- fn test_empty_page_content_arcs() {
1679
- let page = PageContent {
1680
- page_number: 5,
1681
- content: "No tables or images".to_string(),
1682
- tables: Vec::new(),
1683
- images: Vec::new(),
1684
- hierarchy: None,
1685
- };
1686
-
1687
- let json = serde_json::to_string(&page).unwrap();
1688
- let deserialized: PageContent = serde_json::from_str(&json).unwrap();
1689
-
1690
- assert_eq!(deserialized.page_number, 5);
1691
- assert_eq!(deserialized.tables.len(), 0);
1692
- assert_eq!(deserialized.images.len(), 0);
1693
- }
1694
-
1695
- #[test]
1696
- fn test_serde_vec_arc_module_behavior() {
1697
- let table1 = Table {
1698
- cells: vec![vec!["A".to_string()]],
1699
- markdown: "| A |".to_string(),
1700
- page_number: 1,
1701
- };
1702
-
1703
- let table2 = Table {
1704
- cells: vec![vec!["B".to_string()]],
1705
- markdown: "| B |".to_string(),
1706
- page_number: 2,
1707
- };
1708
-
1709
- let json = serde_json::to_string(&vec![table1, table2]).unwrap();
1710
- assert!(json.contains("\"A\""));
1711
- assert!(json.contains("\"B\""));
1712
- }
1713
- }