kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +538 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +2 -105
  8. data/README.md +454 -454
  9. data/Rakefile +25 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -341
  12. data/ext/kreuzberg_rb/extconf.rb +45 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
  14. data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
  15. data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
  23. data/extconf.rb +28 -28
  24. data/kreuzberg.gemspec +214 -214
  25. data/lib/kreuzberg/api_proxy.rb +142 -142
  26. data/lib/kreuzberg/cache_api.rb +81 -81
  27. data/lib/kreuzberg/cli.rb +55 -55
  28. data/lib/kreuzberg/cli_proxy.rb +127 -127
  29. data/lib/kreuzberg/config.rb +724 -724
  30. data/lib/kreuzberg/error_context.rb +80 -80
  31. data/lib/kreuzberg/errors.rb +118 -118
  32. data/lib/kreuzberg/extraction_api.rb +340 -340
  33. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  36. data/lib/kreuzberg/result.rb +279 -279
  37. data/lib/kreuzberg/setup_lib_path.rb +80 -80
  38. data/lib/kreuzberg/validator_protocol.rb +89 -89
  39. data/lib/kreuzberg/version.rb +5 -5
  40. data/lib/kreuzberg.rb +109 -109
  41. data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
  42. data/sig/kreuzberg/internal.rbs +184 -184
  43. data/sig/kreuzberg.rbs +546 -546
  44. data/spec/binding/cache_spec.rb +227 -227
  45. data/spec/binding/cli_proxy_spec.rb +85 -85
  46. data/spec/binding/cli_spec.rb +55 -55
  47. data/spec/binding/config_spec.rb +345 -345
  48. data/spec/binding/config_validation_spec.rb +283 -283
  49. data/spec/binding/error_handling_spec.rb +213 -213
  50. data/spec/binding/errors_spec.rb +66 -66
  51. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  52. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  53. data/spec/binding/plugins/validator_spec.rb +274 -274
  54. data/spec/fixtures/config.toml +39 -39
  55. data/spec/fixtures/config.yaml +41 -41
  56. data/spec/fixtures/invalid_config.toml +4 -4
  57. data/spec/smoke/package_spec.rb +178 -178
  58. data/spec/spec_helper.rb +42 -42
  59. data/vendor/Cargo.toml +2 -1
  60. data/vendor/kreuzberg/Cargo.toml +2 -2
  61. data/vendor/kreuzberg/README.md +230 -230
  62. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
  63. data/vendor/kreuzberg/build.rs +843 -843
  64. data/vendor/kreuzberg/src/api/error.rs +81 -81
  65. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  66. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  67. data/vendor/kreuzberg/src/api/server.rs +353 -353
  68. data/vendor/kreuzberg/src/api/types.rs +170 -170
  69. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  70. data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
  71. data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
  72. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  73. data/vendor/kreuzberg/src/core/config.rs +1080 -1080
  74. data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
  75. data/vendor/kreuzberg/src/core/io.rs +329 -329
  76. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  77. data/vendor/kreuzberg/src/core/mod.rs +47 -47
  78. data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
  79. data/vendor/kreuzberg/src/embeddings.rs +500 -500
  80. data/vendor/kreuzberg/src/error.rs +431 -431
  81. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  82. data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
  83. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  84. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  85. data/vendor/kreuzberg/src/extraction/html.rs +601 -601
  86. data/vendor/kreuzberg/src/extraction/image.rs +491 -491
  87. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
  88. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
  89. data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
  90. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  91. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  92. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  93. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  94. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
  95. data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
  96. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  97. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  98. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  99. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  100. data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
  101. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
  102. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
  103. data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
  104. data/vendor/kreuzberg/src/extractors/email.rs +157 -157
  105. data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
  106. data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
  107. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
  108. data/vendor/kreuzberg/src/extractors/html.rs +407 -407
  109. data/vendor/kreuzberg/src/extractors/image.rs +219 -219
  110. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
  112. data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
  114. data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
  115. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  116. data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
  117. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
  118. data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
  119. data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
  120. data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
  121. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
  122. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  123. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  124. data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
  125. data/vendor/kreuzberg/src/extractors/text.rs +265 -265
  126. data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
  127. data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
  128. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  129. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  130. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  131. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  132. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  133. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  134. data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
  135. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  136. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  137. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  138. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
  139. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
  140. data/vendor/kreuzberg/src/lib.rs +113 -113
  141. data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
  142. data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
  143. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  144. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  145. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  146. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  147. data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
  148. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  149. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  150. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
  151. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  152. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  153. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  154. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  155. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
  156. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
  157. data/vendor/kreuzberg/src/pdf/error.rs +130 -130
  158. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  159. data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
  160. data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
  161. data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
  162. data/vendor/kreuzberg/src/pdf/table.rs +420 -420
  163. data/vendor/kreuzberg/src/pdf/text.rs +240 -240
  164. data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
  165. data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
  166. data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
  167. data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
  168. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
  169. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  170. data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
  171. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  172. data/vendor/kreuzberg/src/text/mod.rs +25 -25
  173. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  174. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
  175. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  176. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  177. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  178. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  179. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  180. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  181. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  182. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  183. data/vendor/kreuzberg/src/types.rs +1055 -1055
  184. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  185. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  186. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  187. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  188. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  189. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  190. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  191. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  192. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  193. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  194. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  195. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  196. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  198. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  199. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  200. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  201. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  202. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  203. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  204. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  205. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  206. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  207. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  208. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  209. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  210. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  211. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  212. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  213. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  214. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  215. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  216. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  217. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  218. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  219. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  220. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  221. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  222. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  223. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  224. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  225. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  226. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  227. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  228. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  229. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  230. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  231. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  232. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  233. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  234. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  235. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  236. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  237. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  238. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  239. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  240. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  241. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  242. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  243. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  244. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  245. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  246. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  247. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  248. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  249. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  250. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  251. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  252. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  253. data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
  254. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
  255. data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
  256. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  257. data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
  258. data/vendor/kreuzberg/tests/config_features.rs +612 -612
  259. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
  260. data/vendor/kreuzberg/tests/core_integration.rs +510 -510
  261. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  262. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
  263. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  264. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  265. data/vendor/kreuzberg/tests/email_integration.rs +327 -327
  266. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  267. data/vendor/kreuzberg/tests/error_handling.rs +402 -402
  268. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  269. data/vendor/kreuzberg/tests/format_integration.rs +164 -164
  270. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  271. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  272. data/vendor/kreuzberg/tests/image_integration.rs +255 -255
  273. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  274. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  275. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  276. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  277. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  278. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  279. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  280. data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
  281. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
  282. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
  283. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
  284. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  285. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
  286. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  287. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  288. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
  289. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
  290. data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
  291. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
  292. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
  293. data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
  294. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  295. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
  296. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
  297. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
  298. data/vendor/kreuzberg/tests/security_validation.rs +416 -416
  299. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  300. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
  301. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
  302. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
  303. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  304. data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
  305. data/vendor/kreuzberg-ffi/README.md +851 -851
  306. data/vendor/kreuzberg-ffi/build.rs +176 -176
  307. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
  308. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
  309. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
  310. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
  311. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
  312. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
  313. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
  314. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
  315. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
  316. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
  317. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
  318. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
  319. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
  320. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
  321. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  322. data/vendor/kreuzberg-tesseract/LICENSE +22 -22
  323. data/vendor/kreuzberg-tesseract/README.md +399 -399
  324. data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
  325. data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
  326. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
  327. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
  328. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
  329. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
  330. data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
  331. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
  332. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
  333. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
  334. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
  335. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
  336. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
  337. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
  338. data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
  339. data/vendor/rb-sys/Cargo.lock +393 -393
  340. data/vendor/rb-sys/Cargo.toml +70 -70
  341. data/vendor/rb-sys/Cargo.toml.orig +57 -57
  342. data/vendor/rb-sys/LICENSE-APACHE +190 -190
  343. data/vendor/rb-sys/LICENSE-MIT +21 -21
  344. data/vendor/rb-sys/build/features.rs +111 -111
  345. data/vendor/rb-sys/build/main.rs +286 -286
  346. data/vendor/rb-sys/build/stable_api_config.rs +155 -155
  347. data/vendor/rb-sys/build/version.rs +50 -50
  348. data/vendor/rb-sys/readme.md +36 -36
  349. data/vendor/rb-sys/src/bindings.rs +21 -21
  350. data/vendor/rb-sys/src/hidden.rs +11 -11
  351. data/vendor/rb-sys/src/lib.rs +35 -35
  352. data/vendor/rb-sys/src/macros.rs +371 -371
  353. data/vendor/rb-sys/src/memory.rs +53 -53
  354. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
  355. data/vendor/rb-sys/src/special_consts.rs +31 -31
  356. data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
  357. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
  358. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
  359. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
  360. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
  361. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
  362. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
  363. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
  364. data/vendor/rb-sys/src/stable_api.rs +260 -260
  365. data/vendor/rb-sys/src/symbol.rs +31 -31
  366. data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
  367. data/vendor/rb-sys/src/utils.rs +89 -89
  368. data/vendor/rb-sys/src/value_type.rs +7 -7
  369. metadata +7 -80
@@ -1,1055 +1,1055 @@
1
- use serde::{Deserialize, Serialize};
2
- use std::collections::HashMap;
3
-
4
- #[cfg(feature = "pdf")]
5
- use crate::pdf::metadata::PdfMetadata;
6
-
7
- // ============================================================================
8
- // ============================================================================
9
-
10
- /// General extraction result used by the core extraction API.
11
- ///
12
- /// This is the main result type returned by all extraction functions.
13
- #[derive(Debug, Clone, Serialize, Deserialize)]
14
- pub struct ExtractionResult {
15
- pub content: String,
16
- pub mime_type: String,
17
- pub metadata: Metadata,
18
- pub tables: Vec<Table>,
19
- #[serde(skip_serializing_if = "Option::is_none")]
20
- pub detected_languages: Option<Vec<String>>,
21
-
22
- /// Text chunks when chunking is enabled.
23
- ///
24
- /// When chunking configuration is provided, the content is split into
25
- /// overlapping chunks for efficient processing. Each chunk contains the text,
26
- /// optional embeddings (if enabled), and metadata about its position.
27
- #[serde(skip_serializing_if = "Option::is_none")]
28
- pub chunks: Option<Vec<Chunk>>,
29
-
30
- /// Extracted images from the document.
31
- ///
32
- /// When image extraction is enabled via `ImageExtractionConfig`, this field
33
- /// contains all images found in the document with their raw data and metadata.
34
- /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
35
- #[serde(skip_serializing_if = "Option::is_none")]
36
- pub images: Option<Vec<ExtractedImage>>,
37
-
38
- /// Per-page content when page extraction is enabled.
39
- ///
40
- /// When page extraction is configured, the document is split into per-page content
41
- /// with tables and images mapped to their respective pages.
42
- #[serde(skip_serializing_if = "Option::is_none")]
43
- pub pages: Option<Vec<PageContent>>,
44
- }
45
-
46
- /// Format-specific metadata (discriminated union).
47
- ///
48
- /// Only one format type can exist per extraction result. This provides
49
- /// type-safe, clean metadata without nested optionals.
50
- #[derive(Debug, Clone, Serialize, Deserialize)]
51
- #[serde(tag = "format_type", rename_all = "snake_case")]
52
- pub enum FormatMetadata {
53
- #[cfg(feature = "pdf")]
54
- Pdf(PdfMetadata),
55
- Excel(ExcelMetadata),
56
- Email(EmailMetadata),
57
- Pptx(PptxMetadata),
58
- Archive(ArchiveMetadata),
59
- Image(ImageMetadata),
60
- Xml(XmlMetadata),
61
- Text(TextMetadata),
62
- Html(Box<HtmlMetadata>),
63
- Ocr(OcrMetadata),
64
- }
65
-
66
- /// Extraction result metadata.
67
- ///
68
- /// Contains common fields applicable to all formats, format-specific metadata
69
- /// via a discriminated union, and additional custom fields from postprocessors.
70
- #[derive(Debug, Clone, Serialize, Deserialize, Default)]
71
- pub struct Metadata {
72
- /// Document title
73
- #[serde(skip_serializing_if = "Option::is_none")]
74
- pub title: Option<String>,
75
-
76
- /// Document subject or description
77
- #[serde(skip_serializing_if = "Option::is_none")]
78
- pub subject: Option<String>,
79
-
80
- /// Primary author(s) - always Vec for consistency
81
- #[serde(skip_serializing_if = "Option::is_none")]
82
- pub authors: Option<Vec<String>>,
83
-
84
- /// Keywords/tags - always Vec for consistency
85
- #[serde(skip_serializing_if = "Option::is_none")]
86
- pub keywords: Option<Vec<String>>,
87
-
88
- /// Primary language (ISO 639 code)
89
- #[serde(skip_serializing_if = "Option::is_none")]
90
- pub language: Option<String>,
91
-
92
- /// Creation timestamp (ISO 8601 format)
93
- #[serde(skip_serializing_if = "Option::is_none")]
94
- pub created_at: Option<String>,
95
-
96
- /// Last modification timestamp (ISO 8601 format)
97
- #[serde(skip_serializing_if = "Option::is_none")]
98
- pub modified_at: Option<String>,
99
-
100
- /// User who created the document
101
- #[serde(skip_serializing_if = "Option::is_none")]
102
- pub created_by: Option<String>,
103
-
104
- /// User who last modified the document
105
- #[serde(skip_serializing_if = "Option::is_none")]
106
- pub modified_by: Option<String>,
107
-
108
- /// Page/slide/sheet structure with boundaries
109
- #[serde(skip_serializing_if = "Option::is_none")]
110
- pub pages: Option<PageStructure>,
111
-
112
- /// Document date (DEPRECATED - use created_at/modified_at instead)
113
- #[serde(skip_serializing_if = "Option::is_none")]
114
- pub date: Option<String>,
115
-
116
- /// Format-specific metadata (discriminated union)
117
- ///
118
- /// Contains detailed metadata specific to the document format.
119
- /// Serializes with a `format_type` discriminator field.
120
- #[serde(flatten, skip_serializing_if = "Option::is_none")]
121
- pub format: Option<FormatMetadata>,
122
-
123
- /// Image preprocessing metadata (when OCR preprocessing was applied)
124
- #[serde(skip_serializing_if = "Option::is_none")]
125
- pub image_preprocessing: Option<ImagePreprocessingMetadata>,
126
-
127
- /// JSON schema (for structured data extraction)
128
- #[serde(skip_serializing_if = "Option::is_none")]
129
- pub json_schema: Option<serde_json::Value>,
130
-
131
- /// Error metadata (for batch operations)
132
- #[serde(skip_serializing_if = "Option::is_none")]
133
- pub error: Option<ErrorMetadata>,
134
-
135
- /// Additional custom fields from postprocessors.
136
- ///
137
- /// This flattened HashMap allows Python/TypeScript postprocessors to add
138
- /// arbitrary fields (entity extraction, keyword extraction, etc.).
139
- /// Fields are merged at the root level during serialization.
140
- #[serde(flatten)]
141
- pub additional: HashMap<String, serde_json::Value>,
142
- }
143
-
144
- /// Unified page structure for documents.
145
- ///
146
- /// Supports different page types (PDF pages, PPTX slides, Excel sheets)
147
- /// with character offset boundaries for chunk-to-page mapping.
148
- #[derive(Debug, Clone, Serialize, Deserialize)]
149
- pub struct PageStructure {
150
- /// Total number of pages/slides/sheets
151
- pub total_count: usize,
152
-
153
- /// Type of paginated unit
154
- pub unit_type: PageUnitType,
155
-
156
- /// Character offset boundaries for each page
157
- ///
158
- /// Maps character ranges in the extracted content to page numbers.
159
- /// Used for chunk page range calculation.
160
- #[serde(skip_serializing_if = "Option::is_none")]
161
- pub boundaries: Option<Vec<PageBoundary>>,
162
-
163
- /// Detailed per-page metadata (optional, only when needed)
164
- #[serde(skip_serializing_if = "Option::is_none")]
165
- pub pages: Option<Vec<PageInfo>>,
166
- }
167
-
168
- /// Type of paginated unit in a document.
169
- ///
170
- /// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
171
- #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
172
- #[serde(rename_all = "snake_case")]
173
- pub enum PageUnitType {
174
- /// Standard document pages (PDF, DOCX, images)
175
- Page,
176
- /// Presentation slides (PPTX, ODP)
177
- Slide,
178
- /// Spreadsheet sheets (XLSX, ODS)
179
- Sheet,
180
- }
181
-
182
- /// Byte offset boundary for a page.
183
- ///
184
- /// Tracks where a specific page's content starts and ends in the main content string,
185
- /// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
186
- /// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
187
- #[derive(Debug, Clone, Serialize, Deserialize)]
188
- pub struct PageBoundary {
189
- /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
190
- pub byte_start: usize,
191
- /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
192
- pub byte_end: usize,
193
- /// Page number (1-indexed)
194
- pub page_number: usize,
195
- }
196
-
197
- /// Metadata for individual page/slide/sheet.
198
- ///
199
- /// Captures per-page information including dimensions, content counts,
200
- /// and visibility state (for presentations).
201
- #[derive(Debug, Clone, Serialize, Deserialize)]
202
- pub struct PageInfo {
203
- /// Page number (1-indexed)
204
- pub number: usize,
205
-
206
- /// Page title (usually for presentations)
207
- #[serde(skip_serializing_if = "Option::is_none")]
208
- pub title: Option<String>,
209
-
210
- /// Dimensions in points (PDF) or pixels (images): (width, height)
211
- #[serde(skip_serializing_if = "Option::is_none")]
212
- pub dimensions: Option<(f64, f64)>,
213
-
214
- /// Number of images on this page
215
- #[serde(skip_serializing_if = "Option::is_none")]
216
- pub image_count: Option<usize>,
217
-
218
- /// Number of tables on this page
219
- #[serde(skip_serializing_if = "Option::is_none")]
220
- pub table_count: Option<usize>,
221
-
222
- /// Whether this page is hidden (e.g., in presentations)
223
- #[serde(skip_serializing_if = "Option::is_none")]
224
- pub hidden: Option<bool>,
225
- }
226
-
227
- /// Content for a single page/slide.
228
- ///
229
- /// When page extraction is enabled, documents are split into per-page content
230
- /// with associated tables and images mapped to each page.
231
- #[derive(Debug, Clone, Serialize, Deserialize)]
232
- pub struct PageContent {
233
- /// Page number (1-indexed)
234
- pub page_number: usize,
235
-
236
- /// Text content for this page
237
- pub content: String,
238
-
239
- /// Tables found on this page
240
- #[serde(skip_serializing_if = "Vec::is_empty", default)]
241
- pub tables: Vec<Table>,
242
-
243
- /// Images found on this page
244
- #[serde(skip_serializing_if = "Vec::is_empty", default)]
245
- pub images: Vec<ExtractedImage>,
246
- }
247
-
248
- /// Excel/spreadsheet metadata.
249
- ///
250
- /// Contains information about sheets in Excel, LibreOffice Calc, and other
251
- /// spreadsheet formats (.xlsx, .xls, .ods, etc.).
252
- #[derive(Debug, Clone, Serialize, Deserialize)]
253
- pub struct ExcelMetadata {
254
- /// Total number of sheets in the workbook
255
- pub sheet_count: usize,
256
- /// Names of all sheets in order
257
- pub sheet_names: Vec<String>,
258
- }
259
-
260
- /// Email metadata extracted from .eml and .msg files.
261
- ///
262
- /// Includes sender/recipient information, message ID, and attachment list.
263
- #[derive(Debug, Clone, Serialize, Deserialize)]
264
- pub struct EmailMetadata {
265
- /// Sender's email address
266
- #[serde(skip_serializing_if = "Option::is_none")]
267
- pub from_email: Option<String>,
268
-
269
- /// Sender's display name
270
- #[serde(skip_serializing_if = "Option::is_none")]
271
- pub from_name: Option<String>,
272
-
273
- /// Primary recipients
274
- pub to_emails: Vec<String>,
275
- /// CC recipients
276
- pub cc_emails: Vec<String>,
277
- /// BCC recipients
278
- pub bcc_emails: Vec<String>,
279
-
280
- /// Message-ID header value
281
- #[serde(skip_serializing_if = "Option::is_none")]
282
- pub message_id: Option<String>,
283
-
284
- /// List of attachment filenames
285
- pub attachments: Vec<String>,
286
- }
287
-
288
- /// Archive (ZIP/TAR/7Z) metadata.
289
- ///
290
- /// Extracted from compressed archive files containing file lists and size information.
291
- #[derive(Debug, Clone, Serialize, Deserialize)]
292
- pub struct ArchiveMetadata {
293
- /// Archive format ("ZIP", "TAR", "7Z", etc.)
294
- pub format: String,
295
- /// Total number of files in the archive
296
- pub file_count: usize,
297
- /// List of file paths within the archive
298
- pub file_list: Vec<String>,
299
- /// Total uncompressed size in bytes
300
- pub total_size: usize,
301
-
302
- /// Compressed size in bytes (if available)
303
- #[serde(skip_serializing_if = "Option::is_none")]
304
- pub compressed_size: Option<usize>,
305
- }
306
-
307
- /// Image metadata extracted from image files.
308
- ///
309
- /// Includes dimensions, format, and EXIF data.
310
- #[derive(Debug, Clone, Serialize, Deserialize)]
311
- pub struct ImageMetadata {
312
- /// Image width in pixels
313
- pub width: u32,
314
- /// Image height in pixels
315
- pub height: u32,
316
- /// Image format (e.g., "PNG", "JPEG", "TIFF")
317
- pub format: String,
318
- /// EXIF metadata tags
319
- pub exif: HashMap<String, String>,
320
- }
321
-
322
- /// XML metadata extracted during XML parsing.
323
- ///
324
- /// Provides statistics about XML document structure.
325
- #[derive(Debug, Clone, Serialize, Deserialize)]
326
- pub struct XmlMetadata {
327
- /// Total number of XML elements processed
328
- pub element_count: usize,
329
- /// List of unique element tag names (sorted)
330
- pub unique_elements: Vec<String>,
331
- }
332
-
333
- /// Text/Markdown metadata.
334
- ///
335
- /// Extracted from plain text and Markdown files. Includes word counts and,
336
- /// for Markdown, structural elements like headers and links.
337
- #[derive(Debug, Clone, Serialize, Deserialize)]
338
- pub struct TextMetadata {
339
- /// Number of lines in the document
340
- pub line_count: usize,
341
- /// Number of words
342
- pub word_count: usize,
343
- /// Number of characters
344
- pub character_count: usize,
345
-
346
- /// Markdown headers (headings text only, for Markdown files)
347
- #[serde(skip_serializing_if = "Option::is_none")]
348
- pub headers: Option<Vec<String>>,
349
-
350
- /// Markdown links as (text, url) tuples (for Markdown files)
351
- #[serde(skip_serializing_if = "Option::is_none")]
352
- pub links: Option<Vec<(String, String)>>,
353
-
354
- /// Code blocks as (language, code) tuples (for Markdown files)
355
- #[serde(skip_serializing_if = "Option::is_none")]
356
- pub code_blocks: Option<Vec<(String, String)>>,
357
- }
358
-
359
- /// HTML metadata extracted from HTML documents.
360
- ///
361
- /// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations.
362
- #[derive(Debug, Clone, Serialize, Deserialize, Default)]
363
- pub struct HtmlMetadata {
364
- #[serde(skip_serializing_if = "Option::is_none")]
365
- pub title: Option<String>,
366
-
367
- #[serde(skip_serializing_if = "Option::is_none")]
368
- pub description: Option<String>,
369
-
370
- #[serde(skip_serializing_if = "Option::is_none")]
371
- pub keywords: Option<String>,
372
-
373
- #[serde(skip_serializing_if = "Option::is_none")]
374
- pub author: Option<String>,
375
-
376
- #[serde(skip_serializing_if = "Option::is_none")]
377
- pub canonical: Option<String>,
378
-
379
- #[serde(skip_serializing_if = "Option::is_none")]
380
- pub base_href: Option<String>,
381
-
382
- #[serde(skip_serializing_if = "Option::is_none")]
383
- pub og_title: Option<String>,
384
-
385
- #[serde(skip_serializing_if = "Option::is_none")]
386
- pub og_description: Option<String>,
387
-
388
- #[serde(skip_serializing_if = "Option::is_none")]
389
- pub og_image: Option<String>,
390
-
391
- #[serde(skip_serializing_if = "Option::is_none")]
392
- pub og_url: Option<String>,
393
-
394
- #[serde(skip_serializing_if = "Option::is_none")]
395
- pub og_type: Option<String>,
396
-
397
- #[serde(skip_serializing_if = "Option::is_none")]
398
- pub og_site_name: Option<String>,
399
-
400
- #[serde(skip_serializing_if = "Option::is_none")]
401
- pub twitter_card: Option<String>,
402
-
403
- #[serde(skip_serializing_if = "Option::is_none")]
404
- pub twitter_title: Option<String>,
405
-
406
- #[serde(skip_serializing_if = "Option::is_none")]
407
- pub twitter_description: Option<String>,
408
-
409
- #[serde(skip_serializing_if = "Option::is_none")]
410
- pub twitter_image: Option<String>,
411
-
412
- #[serde(skip_serializing_if = "Option::is_none")]
413
- pub twitter_site: Option<String>,
414
-
415
- #[serde(skip_serializing_if = "Option::is_none")]
416
- pub twitter_creator: Option<String>,
417
-
418
- #[serde(skip_serializing_if = "Option::is_none")]
419
- pub link_author: Option<String>,
420
-
421
- #[serde(skip_serializing_if = "Option::is_none")]
422
- pub link_license: Option<String>,
423
-
424
- #[serde(skip_serializing_if = "Option::is_none")]
425
- pub link_alternate: Option<String>,
426
- }
427
-
428
- /// OCR processing metadata.
429
- ///
430
- /// Captures information about OCR processing configuration and results.
431
- #[derive(Debug, Clone, Serialize, Deserialize)]
432
- pub struct OcrMetadata {
433
- /// OCR language code(s) used
434
- pub language: String,
435
- /// Tesseract Page Segmentation Mode (PSM)
436
- pub psm: i32,
437
- /// Output format (e.g., "text", "hocr")
438
- pub output_format: String,
439
- /// Number of tables detected
440
- pub table_count: usize,
441
-
442
- #[serde(skip_serializing_if = "Option::is_none")]
443
- pub table_rows: Option<usize>,
444
-
445
- #[serde(skip_serializing_if = "Option::is_none")]
446
- pub table_cols: Option<usize>,
447
- }
448
-
449
- /// Error metadata (for batch operations).
450
- #[derive(Debug, Clone, Serialize, Deserialize)]
451
- pub struct ErrorMetadata {
452
- pub error_type: String,
453
- pub message: String,
454
- }
455
-
456
- /// Extracted table structure.
457
- ///
458
- /// Represents a table detected and extracted from a document (PDF, image, etc.).
459
- /// Tables are converted to both structured cell data and Markdown format.
460
- #[derive(Debug, Clone, Serialize, Deserialize)]
461
- pub struct Table {
462
- /// Table cells as a 2D vector (rows × columns)
463
- pub cells: Vec<Vec<String>>,
464
- /// Markdown representation of the table
465
- pub markdown: String,
466
- /// Page number where the table was found (1-indexed)
467
- pub page_number: usize,
468
- }
469
-
470
- /// A text chunk with optional embedding and metadata.
471
- ///
472
- /// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
473
- /// contains the text content, optional embedding vector (if embedding generation
474
- /// is configured), and metadata about its position in the document.
475
- #[derive(Debug, Clone, Serialize, Deserialize)]
476
- pub struct Chunk {
477
- /// The text content of this chunk.
478
- pub content: String,
479
-
480
- /// Optional embedding vector for this chunk.
481
- ///
482
- /// Only populated when `EmbeddingConfig` is provided in chunking configuration.
483
- /// The dimensionality depends on the chosen embedding model.
484
- #[serde(skip_serializing_if = "Option::is_none")]
485
- pub embedding: Option<Vec<f32>>,
486
-
487
- /// Metadata about this chunk's position and properties.
488
- pub metadata: ChunkMetadata,
489
- }
490
-
491
- /// Metadata about a chunk's position in the original document.
492
- #[derive(Debug, Clone, Serialize, Deserialize)]
493
- pub struct ChunkMetadata {
494
- /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
495
- pub byte_start: usize,
496
-
497
- /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
498
- pub byte_end: usize,
499
-
500
- /// Number of tokens in this chunk (if available).
501
- ///
502
- /// This is calculated by the embedding model's tokenizer if embeddings are enabled.
503
- #[serde(skip_serializing_if = "Option::is_none")]
504
- pub token_count: Option<usize>,
505
-
506
- /// Zero-based index of this chunk in the document.
507
- pub chunk_index: usize,
508
-
509
- /// Total number of chunks in the document.
510
- pub total_chunks: usize,
511
-
512
- /// First page number this chunk spans (1-indexed).
513
- ///
514
- /// Only populated when page tracking is enabled in extraction configuration.
515
- #[serde(skip_serializing_if = "Option::is_none")]
516
- pub first_page: Option<usize>,
517
-
518
- /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
519
- ///
520
- /// Only populated when page tracking is enabled in extraction configuration.
521
- #[serde(skip_serializing_if = "Option::is_none")]
522
- pub last_page: Option<usize>,
523
- }
524
-
525
- /// Extracted image from a document.
526
- ///
527
- /// Contains raw image data, metadata, and optional nested OCR results.
528
- /// Raw bytes allow cross-language compatibility - users can convert to
529
- /// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
530
- #[derive(Debug, Clone, Serialize, Deserialize)]
531
- pub struct ExtractedImage {
532
- /// Raw image data (PNG, JPEG, WebP, etc. bytes)
533
- pub data: Vec<u8>,
534
-
535
- /// Image format (e.g., "jpeg", "png", "webp")
536
- pub format: String,
537
-
538
- /// Zero-indexed position of this image in the document/page
539
- pub image_index: usize,
540
-
541
- /// Page/slide number where image was found (1-indexed)
542
- #[serde(skip_serializing_if = "Option::is_none")]
543
- pub page_number: Option<usize>,
544
-
545
- /// Image width in pixels
546
- #[serde(skip_serializing_if = "Option::is_none")]
547
- pub width: Option<u32>,
548
-
549
- /// Image height in pixels
550
- #[serde(skip_serializing_if = "Option::is_none")]
551
- pub height: Option<u32>,
552
-
553
- /// Colorspace information (e.g., "RGB", "CMYK", "Gray")
554
- #[serde(skip_serializing_if = "Option::is_none")]
555
- pub colorspace: Option<String>,
556
-
557
- /// Bits per color component (e.g., 8, 16)
558
- #[serde(skip_serializing_if = "Option::is_none")]
559
- pub bits_per_component: Option<u32>,
560
-
561
- /// Whether this image is a mask image
562
- #[serde(default)]
563
- pub is_mask: bool,
564
-
565
- /// Optional description of the image
566
- #[serde(skip_serializing_if = "Option::is_none")]
567
- pub description: Option<String>,
568
-
569
- /// Nested OCR extraction result (if image was OCRed)
570
- ///
571
- /// When OCR is performed on this image, the result is embedded here
572
- /// rather than in a separate collection, making the relationship explicit.
573
- #[serde(skip_serializing_if = "Option::is_none")]
574
- pub ocr_result: Option<Box<ExtractionResult>>,
575
- }
576
-
577
- /// Excel workbook representation.
578
- ///
579
- /// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
580
- /// extracted content and metadata.
581
- #[derive(Debug, Clone, Serialize, Deserialize)]
582
- pub struct ExcelWorkbook {
583
- /// All sheets in the workbook
584
- pub sheets: Vec<ExcelSheet>,
585
- /// Workbook-level metadata (author, creation date, etc.)
586
- pub metadata: HashMap<String, String>,
587
- }
588
-
589
- /// Single Excel worksheet.
590
- ///
591
- /// Represents one sheet from an Excel workbook with its content
592
- /// converted to Markdown format and dimensional statistics.
593
- #[derive(Debug, Clone, Serialize, Deserialize)]
594
- pub struct ExcelSheet {
595
- /// Sheet name as it appears in Excel
596
- pub name: String,
597
- /// Sheet content converted to Markdown tables
598
- pub markdown: String,
599
- /// Number of rows
600
- pub row_count: usize,
601
- /// Number of columns
602
- pub col_count: usize,
603
- /// Total number of non-empty cells
604
- pub cell_count: usize,
605
- }
606
-
607
- /// XML extraction result.
608
- ///
609
- /// Contains extracted text content from XML files along with
610
- /// structural statistics about the XML document.
611
- #[derive(Debug, Clone, Serialize, Deserialize)]
612
- pub struct XmlExtractionResult {
613
- /// Extracted text content (XML structure filtered out)
614
- pub content: String,
615
- /// Total number of XML elements processed
616
- pub element_count: usize,
617
- /// List of unique element names found (sorted)
618
- pub unique_elements: Vec<String>,
619
- }
620
-
621
- /// Plain text and Markdown extraction result.
622
- ///
623
- /// Contains the extracted text along with statistics and,
624
- /// for Markdown files, structural elements like headers and links.
625
- #[derive(Debug, Clone, Serialize, Deserialize)]
626
- pub struct TextExtractionResult {
627
- /// Extracted text content
628
- pub content: String,
629
- /// Number of lines
630
- pub line_count: usize,
631
- /// Number of words
632
- pub word_count: usize,
633
- /// Number of characters
634
- pub character_count: usize,
635
- /// Markdown headers (text only, Markdown files only)
636
- #[serde(skip_serializing_if = "Option::is_none")]
637
- pub headers: Option<Vec<String>>,
638
- /// Markdown links as (text, URL) tuples (Markdown files only)
639
- #[serde(skip_serializing_if = "Option::is_none")]
640
- pub links: Option<Vec<(String, String)>>,
641
- /// Code blocks as (language, code) tuples (Markdown files only)
642
- #[serde(skip_serializing_if = "Option::is_none")]
643
- pub code_blocks: Option<Vec<(String, String)>>,
644
- }
645
-
646
- /// PowerPoint (PPTX) extraction result.
647
- ///
648
- /// Contains extracted slide content, metadata, and embedded images/tables.
649
- #[derive(Debug, Clone, Serialize, Deserialize)]
650
- pub struct PptxExtractionResult {
651
- /// Extracted text content from all slides
652
- pub content: String,
653
- /// Presentation metadata
654
- pub metadata: PptxMetadata,
655
- /// Total number of slides
656
- pub slide_count: usize,
657
- /// Total number of embedded images
658
- pub image_count: usize,
659
- /// Total number of tables
660
- pub table_count: usize,
661
- /// Extracted images from the presentation
662
- pub images: Vec<ExtractedImage>,
663
- /// Slide structure with boundaries (when page tracking is enabled)
664
- #[serde(skip_serializing_if = "Option::is_none")]
665
- pub page_structure: Option<PageStructure>,
666
- /// Per-slide content (when page tracking is enabled)
667
- #[serde(skip_serializing_if = "Option::is_none")]
668
- pub page_contents: Option<Vec<PageContent>>,
669
- }
670
-
671
- /// PowerPoint presentation metadata.
672
- ///
673
- /// Contains PPTX-specific metadata. Common fields like title, author, and description
674
- /// are now in the base `Metadata` struct.
675
- #[derive(Debug, Clone, Serialize, Deserialize)]
676
- pub struct PptxMetadata {
677
- /// List of fonts used in the presentation
678
- #[serde(skip_serializing_if = "Vec::is_empty", default)]
679
- pub fonts: Vec<String>,
680
- }
681
-
682
- /// Email extraction result.
683
- ///
684
- /// Complete representation of an extracted email message (.eml or .msg)
685
- /// including headers, body content, and attachments.
686
- #[derive(Debug, Clone, Serialize, Deserialize)]
687
- pub struct EmailExtractionResult {
688
- /// Email subject line
689
- pub subject: Option<String>,
690
- /// Sender email address
691
- pub from_email: Option<String>,
692
- /// Primary recipient email addresses
693
- pub to_emails: Vec<String>,
694
- /// CC recipient email addresses
695
- pub cc_emails: Vec<String>,
696
- /// BCC recipient email addresses
697
- pub bcc_emails: Vec<String>,
698
- /// Email date/timestamp
699
- pub date: Option<String>,
700
- /// Message-ID header value
701
- pub message_id: Option<String>,
702
- /// Plain text version of the email body
703
- pub plain_text: Option<String>,
704
- /// HTML version of the email body
705
- pub html_content: Option<String>,
706
- /// Cleaned/processed text content
707
- pub cleaned_text: String,
708
- /// List of email attachments
709
- pub attachments: Vec<EmailAttachment>,
710
- /// Additional email headers and metadata
711
- pub metadata: HashMap<String, String>,
712
- }
713
-
714
- /// Email attachment representation.
715
- ///
716
- /// Contains metadata and optionally the content of an email attachment.
717
- #[derive(Debug, Clone, Serialize, Deserialize)]
718
- pub struct EmailAttachment {
719
- /// Attachment name (from Content-Disposition header)
720
- pub name: Option<String>,
721
- /// Filename of the attachment
722
- pub filename: Option<String>,
723
- /// MIME type of the attachment
724
- pub mime_type: Option<String>,
725
- /// Size in bytes
726
- pub size: Option<usize>,
727
- /// Whether this attachment is an image
728
- pub is_image: bool,
729
- /// Attachment data (if extracted)
730
- pub data: Option<Vec<u8>>,
731
- }
732
-
733
- /// OCR extraction result.
734
- ///
735
- /// Result of performing OCR on an image or scanned document,
736
- /// including recognized text and detected tables.
737
- #[derive(Debug, Clone, Serialize, Deserialize)]
738
- pub struct OcrExtractionResult {
739
- /// Recognized text content
740
- pub content: String,
741
- /// Original MIME type of the processed image
742
- pub mime_type: String,
743
- /// OCR processing metadata (confidence scores, language, etc.)
744
- pub metadata: HashMap<String, serde_json::Value>,
745
- /// Tables detected and extracted via OCR
746
- pub tables: Vec<OcrTable>,
747
- }
748
-
749
- /// Table detected via OCR.
750
- ///
751
- /// Represents a table structure recognized during OCR processing.
752
- #[derive(Debug, Clone, Serialize, Deserialize)]
753
- pub struct OcrTable {
754
- /// Table cells as a 2D vector (rows × columns)
755
- pub cells: Vec<Vec<String>>,
756
- /// Markdown representation of the table
757
- pub markdown: String,
758
- /// Page number where the table was found (1-indexed)
759
- pub page_number: usize,
760
- }
761
-
762
- /// Image preprocessing configuration for OCR.
763
- ///
764
- /// These settings control how images are preprocessed before OCR to improve
765
- /// text recognition quality. Different preprocessing strategies work better
766
- /// for different document types.
767
- #[derive(Debug, Clone, Serialize, Deserialize)]
768
- #[serde(default)]
769
- pub struct ImagePreprocessingConfig {
770
- /// Target DPI for the image (300 is standard, 600 for small text).
771
- pub target_dpi: i32,
772
-
773
- /// Auto-detect and correct image rotation.
774
- pub auto_rotate: bool,
775
-
776
- /// Correct skew (tilted images).
777
- pub deskew: bool,
778
-
779
- /// Remove noise from the image.
780
- pub denoise: bool,
781
-
782
- /// Enhance contrast for better text visibility.
783
- pub contrast_enhance: bool,
784
-
785
- /// Binarization method: "otsu", "sauvola", "adaptive".
786
- pub binarization_method: String,
787
-
788
- /// Invert colors (white text on black → black on white).
789
- pub invert_colors: bool,
790
- }
791
-
792
- impl Default for ImagePreprocessingConfig {
793
- fn default() -> Self {
794
- Self {
795
- target_dpi: 300,
796
- auto_rotate: true,
797
- deskew: true,
798
- denoise: false,
799
- contrast_enhance: false,
800
- binarization_method: "otsu".to_string(),
801
- invert_colors: false,
802
- }
803
- }
804
- }
805
-
806
- /// Tesseract OCR configuration.
807
- ///
808
- /// Provides fine-grained control over Tesseract OCR engine parameters.
809
- /// Most users can use the defaults, but these settings allow optimization
810
- /// for specific document types (invoices, handwriting, etc.).
811
- #[derive(Debug, Clone, Serialize, Deserialize)]
812
- #[serde(default)]
813
- pub struct TesseractConfig {
814
- /// Language code (e.g., "eng", "deu", "fra")
815
- pub language: String,
816
-
817
- /// Page Segmentation Mode (0-13).
818
- ///
819
- /// Common values:
820
- /// - 3: Fully automatic page segmentation (default)
821
- /// - 6: Assume a single uniform block of text
822
- /// - 11: Sparse text with no particular order
823
- pub psm: i32,
824
-
825
- /// Output format ("text" or "markdown")
826
- pub output_format: String,
827
-
828
- /// OCR Engine Mode (0-3).
829
- ///
830
- /// - 0: Legacy engine only
831
- /// - 1: Neural nets (LSTM) only (usually best)
832
- /// - 2: Legacy + LSTM
833
- /// - 3: Default (based on what's available)
834
- pub oem: i32,
835
-
836
- /// Minimum confidence threshold (0.0-100.0).
837
- ///
838
- /// Words with confidence below this threshold may be rejected or flagged.
839
- pub min_confidence: f64,
840
-
841
- /// Image preprocessing configuration.
842
- ///
843
- /// Controls how images are preprocessed before OCR. Can significantly
844
- /// improve quality for scanned documents or low-quality images.
845
- #[serde(skip_serializing_if = "Option::is_none")]
846
- pub preprocessing: Option<ImagePreprocessingConfig>,
847
-
848
- /// Enable automatic table detection and reconstruction
849
- pub enable_table_detection: bool,
850
-
851
- /// Minimum confidence threshold for table detection (0.0-1.0)
852
- pub table_min_confidence: f64,
853
-
854
- /// Column threshold for table detection (pixels)
855
- pub table_column_threshold: i32,
856
-
857
- /// Row threshold ratio for table detection (0.0-1.0)
858
- pub table_row_threshold_ratio: f64,
859
-
860
- /// Enable OCR result caching
861
- pub use_cache: bool,
862
-
863
- /// Use pre-adapted templates for character classification
864
- pub classify_use_pre_adapted_templates: bool,
865
-
866
- /// Enable N-gram language model
867
- pub language_model_ngram_on: bool,
868
-
869
- /// Don't reject good words during block-level processing
870
- pub tessedit_dont_blkrej_good_wds: bool,
871
-
872
- /// Don't reject good words during row-level processing
873
- pub tessedit_dont_rowrej_good_wds: bool,
874
-
875
- /// Enable dictionary correction
876
- pub tessedit_enable_dict_correction: bool,
877
-
878
- /// Whitelist of allowed characters (empty = all allowed)
879
- pub tessedit_char_whitelist: String,
880
-
881
- /// Blacklist of forbidden characters (empty = none forbidden)
882
- pub tessedit_char_blacklist: String,
883
-
884
- /// Use primary language params model
885
- pub tessedit_use_primary_params_model: bool,
886
-
887
- /// Variable-width space detection
888
- pub textord_space_size_is_variable: bool,
889
-
890
- /// Use adaptive thresholding method
891
- pub thresholding_method: bool,
892
- }
893
-
894
- impl Default for TesseractConfig {
895
- fn default() -> Self {
896
- Self {
897
- language: "eng".to_string(),
898
- psm: 3,
899
- output_format: "markdown".to_string(),
900
- oem: 3,
901
- min_confidence: 0.0,
902
- preprocessing: None,
903
- enable_table_detection: true,
904
- table_min_confidence: 0.0,
905
- table_column_threshold: 50,
906
- table_row_threshold_ratio: 0.5,
907
- use_cache: true,
908
- classify_use_pre_adapted_templates: true,
909
- language_model_ngram_on: false,
910
- tessedit_dont_blkrej_good_wds: true,
911
- tessedit_dont_rowrej_good_wds: true,
912
- tessedit_enable_dict_correction: true,
913
- tessedit_char_whitelist: String::new(),
914
- tessedit_char_blacklist: String::new(),
915
- tessedit_use_primary_params_model: true,
916
- textord_space_size_is_variable: true,
917
- thresholding_method: false,
918
- }
919
- }
920
- }
921
-
922
- /// Image preprocessing metadata.
923
- ///
924
- /// Tracks the transformations applied to an image during OCR preprocessing,
925
- /// including DPI normalization, resizing, and resampling.
926
- #[derive(Debug, Clone, Serialize, Deserialize)]
927
- pub struct ImagePreprocessingMetadata {
928
- /// Original image dimensions (width, height) in pixels
929
- pub original_dimensions: (usize, usize),
930
- /// Original image DPI (horizontal, vertical)
931
- pub original_dpi: (f64, f64),
932
- /// Target DPI from configuration
933
- pub target_dpi: i32,
934
- /// Scaling factor applied to the image
935
- pub scale_factor: f64,
936
- /// Whether DPI was auto-adjusted based on content
937
- pub auto_adjusted: bool,
938
- /// Final DPI after processing
939
- pub final_dpi: i32,
940
- /// New dimensions after resizing (if resized)
941
- pub new_dimensions: Option<(usize, usize)>,
942
- /// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
943
- pub resample_method: String,
944
- /// Whether dimensions were clamped to max_image_dimension
945
- pub dimension_clamped: bool,
946
- /// Calculated optimal DPI (if auto_adjust_dpi enabled)
947
- pub calculated_dpi: Option<i32>,
948
- /// Whether resize was skipped (dimensions already optimal)
949
- pub skipped_resize: bool,
950
- /// Error message if resize failed
951
- pub resize_error: Option<String>,
952
- }
953
-
954
- /// Image extraction configuration (internal use).
955
- ///
956
- /// **Note:** This is an internal type used for image preprocessing.
957
- /// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
958
- #[derive(Debug, Clone, Serialize, Deserialize)]
959
- pub struct ExtractionConfig {
960
- /// Target DPI for image normalization
961
- pub target_dpi: i32,
962
- /// Maximum image dimension (width or height)
963
- pub max_image_dimension: i32,
964
- /// Whether to auto-adjust DPI based on content
965
- pub auto_adjust_dpi: bool,
966
- /// Minimum DPI threshold
967
- pub min_dpi: i32,
968
- /// Maximum DPI threshold
969
- pub max_dpi: i32,
970
- }
971
-
972
- impl Default for ExtractionConfig {
973
- fn default() -> Self {
974
- Self {
975
- target_dpi: 300,
976
- max_image_dimension: 4096,
977
- auto_adjust_dpi: true,
978
- min_dpi: 72,
979
- max_dpi: 600,
980
- }
981
- }
982
- }
983
-
984
- /// Cache statistics.
985
- ///
986
- /// Provides information about the extraction result cache,
987
- /// including size, file count, and age distribution.
988
- #[derive(Debug, Clone, Serialize, Deserialize)]
989
- pub struct CacheStats {
990
- /// Total number of cached files
991
- pub total_files: usize,
992
- /// Total cache size in megabytes
993
- pub total_size_mb: f64,
994
- /// Available disk space in megabytes
995
- pub available_space_mb: f64,
996
- /// Age of the oldest cached file in days
997
- pub oldest_file_age_days: f64,
998
- /// Age of the newest cached file in days
999
- pub newest_file_age_days: f64,
1000
- }
1001
-
1002
- /// LibreOffice conversion result.
1003
- ///
1004
- /// Result of converting a legacy office document (e.g., .doc, .ppt)
1005
- /// to a modern format using LibreOffice.
1006
- #[derive(Debug, Clone, Serialize, Deserialize)]
1007
- pub struct LibreOfficeConversionResult {
1008
- /// Converted file bytes
1009
- pub converted_bytes: Vec<u8>,
1010
- /// Original format identifier
1011
- pub original_format: String,
1012
- /// Target format identifier
1013
- pub target_format: String,
1014
- /// Target MIME type after conversion
1015
- pub target_mime: String,
1016
- }
1017
-
1018
- #[cfg(test)]
1019
- mod tests {
1020
- use super::*;
1021
-
1022
- #[test]
1023
- fn test_metadata_serialization_with_format() {
1024
- let mut metadata = Metadata {
1025
- format: Some(FormatMetadata::Text(TextMetadata {
1026
- line_count: 1,
1027
- word_count: 2,
1028
- character_count: 13,
1029
- headers: None,
1030
- links: None,
1031
- code_blocks: None,
1032
- })),
1033
- ..Default::default()
1034
- };
1035
-
1036
- metadata
1037
- .additional
1038
- .insert("quality_score".to_string(), serde_json::json!(1.0));
1039
-
1040
- let json = serde_json::to_value(&metadata).unwrap();
1041
- println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
1042
-
1043
- assert!(
1044
- json.get("format_type").is_some(),
1045
- "format_type should be present in serialized JSON"
1046
- );
1047
- assert_eq!(json.get("format_type").unwrap(), "text");
1048
-
1049
- assert_eq!(json.get("line_count").unwrap(), 1);
1050
- assert_eq!(json.get("word_count").unwrap(), 2);
1051
- assert_eq!(json.get("character_count").unwrap(), 13);
1052
-
1053
- assert_eq!(json.get("quality_score").unwrap(), 1.0);
1054
- }
1055
- }
1
+ use serde::{Deserialize, Serialize};
2
+ use std::collections::HashMap;
3
+
4
+ #[cfg(feature = "pdf")]
5
+ use crate::pdf::metadata::PdfMetadata;
6
+
7
+ // ============================================================================
8
+ // ============================================================================
9
+
10
+ /// General extraction result used by the core extraction API.
11
+ ///
12
+ /// This is the main result type returned by all extraction functions.
13
+ #[derive(Debug, Clone, Serialize, Deserialize)]
14
+ pub struct ExtractionResult {
15
+ pub content: String,
16
+ pub mime_type: String,
17
+ pub metadata: Metadata,
18
+ pub tables: Vec<Table>,
19
+ #[serde(skip_serializing_if = "Option::is_none")]
20
+ pub detected_languages: Option<Vec<String>>,
21
+
22
+ /// Text chunks when chunking is enabled.
23
+ ///
24
+ /// When chunking configuration is provided, the content is split into
25
+ /// overlapping chunks for efficient processing. Each chunk contains the text,
26
+ /// optional embeddings (if enabled), and metadata about its position.
27
+ #[serde(skip_serializing_if = "Option::is_none")]
28
+ pub chunks: Option<Vec<Chunk>>,
29
+
30
+ /// Extracted images from the document.
31
+ ///
32
+ /// When image extraction is enabled via `ImageExtractionConfig`, this field
33
+ /// contains all images found in the document with their raw data and metadata.
34
+ /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
35
+ #[serde(skip_serializing_if = "Option::is_none")]
36
+ pub images: Option<Vec<ExtractedImage>>,
37
+
38
+ /// Per-page content when page extraction is enabled.
39
+ ///
40
+ /// When page extraction is configured, the document is split into per-page content
41
+ /// with tables and images mapped to their respective pages.
42
+ #[serde(skip_serializing_if = "Option::is_none")]
43
+ pub pages: Option<Vec<PageContent>>,
44
+ }
45
+
46
+ /// Format-specific metadata (discriminated union).
47
+ ///
48
+ /// Only one format type can exist per extraction result. This provides
49
+ /// type-safe, clean metadata without nested optionals. Serialized as an internally tagged enum: the snake_case variant name is stored in a `format_type` field. The `Pdf` variant exists only when the `pdf` feature is enabled.
50
+ #[derive(Debug, Clone, Serialize, Deserialize)]
51
+ #[serde(tag = "format_type", rename_all = "snake_case")]
52
+ pub enum FormatMetadata {
53
+ #[cfg(feature = "pdf")]
54
+ Pdf(PdfMetadata),
55
+ Excel(ExcelMetadata),
56
+ Email(EmailMetadata),
57
+ Pptx(PptxMetadata),
58
+ Archive(ArchiveMetadata),
59
+ Image(ImageMetadata),
60
+ Xml(XmlMetadata),
61
+ Text(TextMetadata),
62
+ Html(Box<HtmlMetadata>),
63
+ Ocr(OcrMetadata),
64
+ }
65
+
66
+ /// Extraction result metadata.
67
+ ///
68
+ /// Contains common fields applicable to all formats, format-specific metadata
69
+ /// via a discriminated union, and additional custom fields from postprocessors. Every `Option` field is omitted from serialized output when `None`.
70
+ #[derive(Debug, Clone, Serialize, Deserialize, Default)]
71
+ pub struct Metadata {
72
+ /// Document title
73
+ #[serde(skip_serializing_if = "Option::is_none")]
74
+ pub title: Option<String>,
75
+
76
+ /// Document subject or description
77
+ #[serde(skip_serializing_if = "Option::is_none")]
78
+ pub subject: Option<String>,
79
+
80
+ /// Primary author(s) - always Vec for consistency
81
+ #[serde(skip_serializing_if = "Option::is_none")]
82
+ pub authors: Option<Vec<String>>,
83
+
84
+ /// Keywords/tags - always Vec for consistency
85
+ #[serde(skip_serializing_if = "Option::is_none")]
86
+ pub keywords: Option<Vec<String>>,
87
+
88
+ /// Primary language (ISO 639 code)
89
+ #[serde(skip_serializing_if = "Option::is_none")]
90
+ pub language: Option<String>,
91
+
92
+ /// Creation timestamp (ISO 8601 format)
93
+ #[serde(skip_serializing_if = "Option::is_none")]
94
+ pub created_at: Option<String>,
95
+
96
+ /// Last modification timestamp (ISO 8601 format)
97
+ #[serde(skip_serializing_if = "Option::is_none")]
98
+ pub modified_at: Option<String>,
99
+
100
+ /// User who created the document
101
+ #[serde(skip_serializing_if = "Option::is_none")]
102
+ pub created_by: Option<String>,
103
+
104
+ /// User who last modified the document
105
+ #[serde(skip_serializing_if = "Option::is_none")]
106
+ pub modified_by: Option<String>,
107
+
108
+ /// Page/slide/sheet structure with boundaries
109
+ #[serde(skip_serializing_if = "Option::is_none")]
110
+ pub pages: Option<PageStructure>,
111
+
112
+ /// Document date (DEPRECATED - use created_at/modified_at instead)
113
+ #[serde(skip_serializing_if = "Option::is_none")]
114
+ pub date: Option<String>,
115
+
116
+ /// Format-specific metadata (discriminated union)
117
+ ///
118
+ /// Contains detailed metadata specific to the document format.
119
+ /// Serializes with a `format_type` discriminator field; because the field is flattened, its keys appear alongside this struct's own keys rather than under a nested `format` key.
120
+ #[serde(flatten, skip_serializing_if = "Option::is_none")]
121
+ pub format: Option<FormatMetadata>,
122
+
123
+ /// Image preprocessing metadata (when OCR preprocessing was applied)
124
+ #[serde(skip_serializing_if = "Option::is_none")]
125
+ pub image_preprocessing: Option<ImagePreprocessingMetadata>,
126
+
127
+ /// JSON schema (for structured data extraction)
128
+ #[serde(skip_serializing_if = "Option::is_none")]
129
+ pub json_schema: Option<serde_json::Value>,
130
+
131
+ /// Error metadata (for batch operations)
132
+ #[serde(skip_serializing_if = "Option::is_none")]
133
+ pub error: Option<ErrorMetadata>,
134
+
135
+ /// Additional custom fields from postprocessors.
136
+ ///
137
+ /// This flattened HashMap allows Python/TypeScript postprocessors to add
138
+ /// arbitrary fields (entity extraction, keyword extraction, etc.).
139
+ /// Fields are merged at the root level during serialization.
140
+ #[serde(flatten)]
141
+ pub additional: HashMap<String, serde_json::Value>,
142
+ }
143
+
144
+ /// Unified page structure for documents.
145
+ ///
146
+ /// Supports different page types (PDF pages, PPTX slides, Excel sheets)
147
+ /// with byte offset boundaries for chunk-to-page mapping.
148
+ #[derive(Debug, Clone, Serialize, Deserialize)]
149
+ pub struct PageStructure {
150
+ /// Total number of pages/slides/sheets
151
+ pub total_count: usize,
152
+
153
+ /// Type of paginated unit
154
+ pub unit_type: PageUnitType,
155
+
156
+ /// Byte offset boundaries for each page
157
+ ///
158
+ /// Maps byte ranges in the extracted content to page numbers.
159
+ /// Used for chunk page range calculation.
160
+ #[serde(skip_serializing_if = "Option::is_none")]
161
+ pub boundaries: Option<Vec<PageBoundary>>,
162
+
163
+ /// Detailed per-page metadata (optional, only when needed)
164
+ #[serde(skip_serializing_if = "Option::is_none")]
165
+ pub pages: Option<Vec<PageInfo>>,
166
+ }
167
+
168
+ /// Type of paginated unit in a document.
169
+ ///
170
+ /// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets). Serialized in snake_case (`page`, `slide`, `sheet`).
171
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
172
+ #[serde(rename_all = "snake_case")]
173
+ pub enum PageUnitType {
174
+ /// Standard document pages (PDF, DOCX, images)
175
+ Page,
176
+ /// Presentation slides (PPTX, ODP)
177
+ Slide,
178
+ /// Spreadsheet sheets (XLSX, ODS)
179
+ Sheet,
180
+ }
181
+
182
+ /// Byte offset boundary for a page.
183
+ ///
184
+ /// Tracks where a specific page's content starts and ends in the main content string,
185
+ /// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
186
+ /// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.). Together `byte_start` and `byte_end` describe the half-open range `[byte_start, byte_end)`.
187
+ #[derive(Debug, Clone, Serialize, Deserialize)]
188
+ pub struct PageBoundary {
189
+ /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
190
+ pub byte_start: usize,
191
+ /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
192
+ pub byte_end: usize,
193
+ /// Page number (1-indexed)
194
+ pub page_number: usize,
195
+ }
196
+
197
+ /// Metadata for individual page/slide/sheet.
198
+ ///
199
+ /// Captures per-page information including dimensions, content counts,
200
+ /// and visibility state (for presentations). Only `number` is mandatory; every other field is optional and omitted from serialized output when `None`.
201
+ #[derive(Debug, Clone, Serialize, Deserialize)]
202
+ pub struct PageInfo {
203
+ /// Page number (1-indexed)
204
+ pub number: usize,
205
+
206
+ /// Page title (usually for presentations)
207
+ #[serde(skip_serializing_if = "Option::is_none")]
208
+ pub title: Option<String>,
209
+
210
+ /// Dimensions in points (PDF) or pixels (images): (width, height)
211
+ #[serde(skip_serializing_if = "Option::is_none")]
212
+ pub dimensions: Option<(f64, f64)>,
213
+
214
+ /// Number of images on this page
215
+ #[serde(skip_serializing_if = "Option::is_none")]
216
+ pub image_count: Option<usize>,
217
+
218
+ /// Number of tables on this page
219
+ #[serde(skip_serializing_if = "Option::is_none")]
220
+ pub table_count: Option<usize>,
221
+
222
+ /// Whether this page is hidden (e.g., in presentations)
223
+ #[serde(skip_serializing_if = "Option::is_none")]
224
+ pub hidden: Option<bool>,
225
+ }
226
+
227
+ /// Content for a single page/slide.
228
+ ///
229
+ /// When page extraction is enabled, documents are split into per-page content
230
+ /// with associated tables and images mapped to each page.
231
+ #[derive(Debug, Clone, Serialize, Deserialize)]
232
+ pub struct PageContent {
233
+ /// Page number (1-indexed)
234
+ pub page_number: usize,
235
+
236
+ /// Text content for this page
237
+ pub content: String,
238
+
239
+ /// Tables found on this page (omitted from serialized output when empty; defaults to an empty list when missing on input)
240
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
241
+ pub tables: Vec<Table>,
242
+
243
+ /// Images found on this page (omitted from serialized output when empty; defaults to an empty list when missing on input)
244
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
245
+ pub images: Vec<ExtractedImage>,
246
+ }
247
+
248
+ /// Excel/spreadsheet metadata.
249
+ ///
250
+ /// Contains information about sheets in Excel, LibreOffice Calc, and other
251
+ /// spreadsheet formats (.xlsx, .xls, .ods, etc.).
252
+ #[derive(Debug, Clone, Serialize, Deserialize)]
253
+ pub struct ExcelMetadata {
254
+ /// Total number of sheets in the workbook
255
+ pub sheet_count: usize,
256
+ /// Names of all sheets in order (presumably `sheet_names.len() == sheet_count` — TODO confirm against the extractor)
257
+ pub sheet_names: Vec<String>,
258
+ }
259
+
260
+ /// Email metadata extracted from .eml and .msg files.
261
+ ///
262
+ /// Includes sender/recipient information, message ID, and attachment list. The recipient and attachment lists are always serialized, even when empty; the optional sender and message-ID fields are omitted when `None`.
263
+ #[derive(Debug, Clone, Serialize, Deserialize)]
264
+ pub struct EmailMetadata {
265
+ /// Sender's email address
266
+ #[serde(skip_serializing_if = "Option::is_none")]
267
+ pub from_email: Option<String>,
268
+
269
+ /// Sender's display name
270
+ #[serde(skip_serializing_if = "Option::is_none")]
271
+ pub from_name: Option<String>,
272
+
273
+ /// Primary recipients
274
+ pub to_emails: Vec<String>,
275
+ /// CC recipients
276
+ pub cc_emails: Vec<String>,
277
+ /// BCC recipients
278
+ pub bcc_emails: Vec<String>,
279
+
280
+ /// Message-ID header value
281
+ #[serde(skip_serializing_if = "Option::is_none")]
282
+ pub message_id: Option<String>,
283
+
284
+ /// List of attachment filenames
285
+ pub attachments: Vec<String>,
286
+ }
287
+
288
+ /// Archive (ZIP/TAR/7Z) metadata.
289
+ ///
290
+ /// Extracted from compressed archive files containing file lists and size information.
291
+ #[derive(Debug, Clone, Serialize, Deserialize)]
292
+ pub struct ArchiveMetadata {
293
+ /// Archive format ("ZIP", "TAR", "7Z", etc.)
294
+ pub format: String,
295
+ /// Total number of files in the archive
296
+ pub file_count: usize,
297
+ /// List of file paths within the archive
298
+ pub file_list: Vec<String>,
299
+ /// Total uncompressed size in bytes (NOTE(review): `usize` is only 32 bits wide on 32-bit targets, so sizes of 4 GiB or more cannot be represented there — confirm this is acceptable)
300
+ pub total_size: usize,
301
+
302
+ /// Compressed size in bytes (if available)
303
+ #[serde(skip_serializing_if = "Option::is_none")]
304
+ pub compressed_size: Option<usize>,
305
+ }
306
+
307
+ /// Image metadata extracted from image files.
308
+ ///
309
+ /// Includes dimensions, format, and EXIF data.
310
+ #[derive(Debug, Clone, Serialize, Deserialize)]
311
+ pub struct ImageMetadata {
312
+ /// Image width in pixels
313
+ pub width: u32,
314
+ /// Image height in pixels
315
+ pub height: u32,
316
+ /// Image format (e.g., "PNG", "JPEG", "TIFF")
317
+ pub format: String,
318
+ /// EXIF metadata tags as string key/value pairs (presumably tag name → tag value — TODO confirm)
319
+ pub exif: HashMap<String, String>,
320
+ }
321
+
322
+ /// XML metadata extracted during XML parsing.
323
+ ///
324
+ /// Provides statistics about XML document structure. Both fields have the same names and types as the statistics fields of `XmlExtractionResult`.
325
+ #[derive(Debug, Clone, Serialize, Deserialize)]
326
+ pub struct XmlMetadata {
327
+ /// Total number of XML elements processed
328
+ pub element_count: usize,
329
+ /// List of unique element tag names (sorted)
330
+ pub unique_elements: Vec<String>,
331
+ }
332
+
333
+ /// Text/Markdown metadata.
334
+ ///
335
+ /// Extracted from plain text and Markdown files. Includes word counts and,
336
+ /// for Markdown, structural elements like headers and links.
337
+ #[derive(Debug, Clone, Serialize, Deserialize)]
338
+ pub struct TextMetadata {
339
+ /// Number of lines in the document
340
+ pub line_count: usize,
341
+ /// Number of words
342
+ pub word_count: usize,
343
+ /// Number of characters (NOTE(review): not clear from this definition whether this counts Unicode scalar values or bytes — confirm against the extractor)
344
+ pub character_count: usize,
345
+
346
+ /// Markdown headers (headings text only, for Markdown files)
347
+ #[serde(skip_serializing_if = "Option::is_none")]
348
+ pub headers: Option<Vec<String>>,
349
+
350
+ /// Markdown links as (text, url) tuples (for Markdown files)
351
+ #[serde(skip_serializing_if = "Option::is_none")]
352
+ pub links: Option<Vec<(String, String)>>,
353
+
354
+ /// Code blocks as (language, code) tuples (for Markdown files)
355
+ #[serde(skip_serializing_if = "Option::is_none")]
356
+ pub code_blocks: Option<Vec<(String, String)>>,
357
+ }
358
+
359
+ /// HTML metadata extracted from HTML documents.
360
+ ///
361
+ /// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations. Fields are grouped by prefix: unprefixed fields (`title`, `description`, `keywords`, `author`, `canonical`, `base_href`), Open Graph (`og_*`), Twitter Card (`twitter_*`) and link relations (`link_*`). Every field is optional and omitted from serialized output when `None`. Note that `keywords` is a single `String` here, whereas `Metadata::keywords` is a `Vec<String>`.
362
+ #[derive(Debug, Clone, Serialize, Deserialize, Default)]
363
+ pub struct HtmlMetadata {
364
+ #[serde(skip_serializing_if = "Option::is_none")]
365
+ pub title: Option<String>,
366
+
367
+ #[serde(skip_serializing_if = "Option::is_none")]
368
+ pub description: Option<String>,
369
+
370
+ #[serde(skip_serializing_if = "Option::is_none")]
371
+ pub keywords: Option<String>,
372
+
373
+ #[serde(skip_serializing_if = "Option::is_none")]
374
+ pub author: Option<String>,
375
+
376
+ #[serde(skip_serializing_if = "Option::is_none")]
377
+ pub canonical: Option<String>,
378
+
379
+ #[serde(skip_serializing_if = "Option::is_none")]
380
+ pub base_href: Option<String>,
381
+
382
+ #[serde(skip_serializing_if = "Option::is_none")]
383
+ pub og_title: Option<String>,
384
+
385
+ #[serde(skip_serializing_if = "Option::is_none")]
386
+ pub og_description: Option<String>,
387
+
388
+ #[serde(skip_serializing_if = "Option::is_none")]
389
+ pub og_image: Option<String>,
390
+
391
+ #[serde(skip_serializing_if = "Option::is_none")]
392
+ pub og_url: Option<String>,
393
+
394
+ #[serde(skip_serializing_if = "Option::is_none")]
395
+ pub og_type: Option<String>,
396
+
397
+ #[serde(skip_serializing_if = "Option::is_none")]
398
+ pub og_site_name: Option<String>,
399
+
400
+ #[serde(skip_serializing_if = "Option::is_none")]
401
+ pub twitter_card: Option<String>,
402
+
403
+ #[serde(skip_serializing_if = "Option::is_none")]
404
+ pub twitter_title: Option<String>,
405
+
406
+ #[serde(skip_serializing_if = "Option::is_none")]
407
+ pub twitter_description: Option<String>,
408
+
409
+ #[serde(skip_serializing_if = "Option::is_none")]
410
+ pub twitter_image: Option<String>,
411
+
412
+ #[serde(skip_serializing_if = "Option::is_none")]
413
+ pub twitter_site: Option<String>,
414
+
415
+ #[serde(skip_serializing_if = "Option::is_none")]
416
+ pub twitter_creator: Option<String>,
417
+
418
+ #[serde(skip_serializing_if = "Option::is_none")]
419
+ pub link_author: Option<String>,
420
+
421
+ #[serde(skip_serializing_if = "Option::is_none")]
422
+ pub link_license: Option<String>,
423
+
424
+ #[serde(skip_serializing_if = "Option::is_none")]
425
+ pub link_alternate: Option<String>,
426
+ }
427
+
428
+ /// OCR processing metadata.
429
+ ///
430
+ /// Captures information about OCR processing configuration and results. `table_rows` and `table_cols` are optional and omitted from serialized output when `None` (presumably the dimensions of a detected table — TODO confirm).
431
+ #[derive(Debug, Clone, Serialize, Deserialize)]
432
+ pub struct OcrMetadata {
433
+ /// OCR language code(s) used
434
+ pub language: String,
435
+ /// Tesseract Page Segmentation Mode (PSM)
436
+ pub psm: i32,
437
+ /// Output format (e.g., "text", "hocr")
438
+ pub output_format: String,
439
+ /// Number of tables detected
440
+ pub table_count: usize,
441
+
442
+ #[serde(skip_serializing_if = "Option::is_none")]
443
+ pub table_rows: Option<usize>,
444
+
445
+ #[serde(skip_serializing_if = "Option::is_none")]
446
+ pub table_cols: Option<usize>,
447
+ }
448
+
449
+ /// Error metadata (for batch operations): an `error_type` label and a `message`, both stored as plain strings.
450
+ #[derive(Debug, Clone, Serialize, Deserialize)]
451
+ pub struct ErrorMetadata {
452
+ pub error_type: String,
453
+ pub message: String,
454
+ }
455
+
456
+ /// Extracted table structure.
457
+ ///
458
+ /// Represents a table detected and extracted from a document (PDF, image, etc.).
459
+ /// Tables are converted to both structured cell data and Markdown format.
460
+ #[derive(Debug, Clone, Serialize, Deserialize)]
461
+ pub struct Table {
462
+ /// Table cells as a 2D vector (rows × columns): the outer `Vec` holds rows, each inner `Vec` holds that row's cell strings
463
+ pub cells: Vec<Vec<String>>,
464
+ /// Markdown representation of the table
465
+ pub markdown: String,
466
+ /// Page number where the table was found (1-indexed)
467
+ pub page_number: usize,
468
+ }
469
+
470
+ /// A text chunk with optional embedding and metadata.
471
+ ///
472
+ /// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
473
+ /// contains the text content, optional embedding vector (if embedding generation
474
+ /// is configured), and metadata about its position in the document.
475
+ #[derive(Debug, Clone, Serialize, Deserialize)]
476
+ pub struct Chunk {
477
+ /// The text content of this chunk.
478
+ pub content: String,
479
+
480
+ /// Optional embedding vector for this chunk.
481
+ ///
482
+ /// Only populated when `EmbeddingConfig` is provided in chunking configuration.
483
+ /// The dimensionality depends on the chosen embedding model. Omitted from serialized output when `None`.
484
+ #[serde(skip_serializing_if = "Option::is_none")]
485
+ pub embedding: Option<Vec<f32>>,
486
+
487
+ /// Metadata about this chunk's position and properties.
488
+ pub metadata: ChunkMetadata,
489
+ }
490
+
491
+ /// Metadata about a chunk's position in the original document.
492
+ #[derive(Debug, Clone, Serialize, Deserialize)]
493
+ pub struct ChunkMetadata {
494
+ /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
495
+ pub byte_start: usize,
496
+
497
+ /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary). NOTE(review): inclusive vs. exclusive is not stated here; `PageBoundary::byte_end` is documented as exclusive — confirm this field follows the same convention.
498
+ pub byte_end: usize,
499
+
500
+ /// Number of tokens in this chunk (if available).
501
+ ///
502
+ /// This is calculated by the embedding model's tokenizer if embeddings are enabled.
503
+ #[serde(skip_serializing_if = "Option::is_none")]
504
+ pub token_count: Option<usize>,
505
+
506
+ /// Zero-based index of this chunk in the document.
507
+ pub chunk_index: usize,
508
+
509
+ /// Total number of chunks in the document.
510
+ pub total_chunks: usize,
511
+
512
+ /// First page number this chunk spans (1-indexed).
513
+ ///
514
+ /// Only populated when page tracking is enabled in extraction configuration.
515
+ #[serde(skip_serializing_if = "Option::is_none")]
516
+ pub first_page: Option<usize>,
517
+
518
+ /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
519
+ ///
520
+ /// Only populated when page tracking is enabled in extraction configuration.
521
+ #[serde(skip_serializing_if = "Option::is_none")]
522
+ pub last_page: Option<usize>,
523
+ }
524
+
525
+ /// Extracted image from a document.
526
+ ///
527
+ /// Contains raw image data, metadata, and optional nested OCR results.
528
+ /// Raw bytes allow cross-language compatibility - users can convert to
529
+ /// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
530
+ #[derive(Debug, Clone, Serialize, Deserialize)]
531
+ pub struct ExtractedImage {
532
+ /// Raw image data (PNG, JPEG, WebP, etc. bytes)
533
+ pub data: Vec<u8>,
534
+
535
+ /// Image format (e.g., "jpeg", "png", "webp")
536
+ pub format: String,
537
+
538
+ /// Zero-indexed position of this image in the document/page
539
+ pub image_index: usize,
540
+
541
+ /// Page/slide number where image was found (1-indexed)
542
+ #[serde(skip_serializing_if = "Option::is_none")]
543
+ pub page_number: Option<usize>,
544
+
545
+ /// Image width in pixels
546
+ #[serde(skip_serializing_if = "Option::is_none")]
547
+ pub width: Option<u32>,
548
+
549
+ /// Image height in pixels
550
+ #[serde(skip_serializing_if = "Option::is_none")]
551
+ pub height: Option<u32>,
552
+
553
+ /// Colorspace information (e.g., "RGB", "CMYK", "Gray")
554
+ #[serde(skip_serializing_if = "Option::is_none")]
555
+ pub colorspace: Option<String>,
556
+
557
+ /// Bits per color component (e.g., 8, 16)
558
+ #[serde(skip_serializing_if = "Option::is_none")]
559
+ pub bits_per_component: Option<u32>,
560
+
561
+ /// Whether this image is a mask image (defaults to `false` when the field is missing on deserialization; always written on serialization)
562
+ #[serde(default)]
563
+ pub is_mask: bool,
564
+
565
+ /// Optional description of the image
566
+ #[serde(skip_serializing_if = "Option::is_none")]
567
+ pub description: Option<String>,
568
+
569
+ /// Nested OCR extraction result (if image was OCRed)
570
+ ///
571
+ /// When OCR is performed on this image, the result is embedded here
572
+ /// rather than in a separate collection, making the relationship explicit.
573
+ #[serde(skip_serializing_if = "Option::is_none")]
574
+ pub ocr_result: Option<Box<ExtractionResult>>,
575
+ }
576
+
577
+ /// Excel workbook representation.
578
+ ///
579
+ /// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
580
+ /// extracted content and metadata.
581
+ #[derive(Debug, Clone, Serialize, Deserialize)]
582
+ pub struct ExcelWorkbook {
583
+ /// All sheets in the workbook
584
+ pub sheets: Vec<ExcelSheet>,
585
+ /// Workbook-level metadata (author, creation date, etc.) as string key/value pairs
586
+ pub metadata: HashMap<String, String>,
587
+ }
588
+
589
+ /// Single Excel worksheet.
590
+ ///
591
+ /// Represents one sheet from an Excel workbook with its content
592
+ /// converted to Markdown format and dimensional statistics. Only the Markdown rendering and the counts are stored; raw cell values are not kept in this struct.
593
+ #[derive(Debug, Clone, Serialize, Deserialize)]
594
+ pub struct ExcelSheet {
595
+ /// Sheet name as it appears in Excel
596
+ pub name: String,
597
+ /// Sheet content converted to Markdown tables
598
+ pub markdown: String,
599
+ /// Number of rows
600
+ pub row_count: usize,
601
+ /// Number of columns
602
+ pub col_count: usize,
603
+ /// Total number of non-empty cells
604
+ pub cell_count: usize,
605
+ }
606
+
607
+ /// XML extraction result.
608
+ ///
609
+ /// Contains extracted text content from XML files along with
610
+ /// structural statistics about the XML document. `element_count` and `unique_elements` have the same names and types as the fields of `XmlMetadata`.
611
+ #[derive(Debug, Clone, Serialize, Deserialize)]
612
+ pub struct XmlExtractionResult {
613
+ /// Extracted text content (XML structure filtered out)
614
+ pub content: String,
615
+ /// Total number of XML elements processed
616
+ pub element_count: usize,
617
+ /// List of unique element names found (sorted)
618
+ pub unique_elements: Vec<String>,
619
+ }
620
+
621
+ /// Plain text and Markdown extraction result.
622
+ ///
623
+ /// Contains the extracted text along with statistics and,
624
+ /// for Markdown files, structural elements like headers and links. Apart from `content`, the fields have the same names and types as those of `TextMetadata`; the optional ones are omitted from serialized output when `None`.
625
+ #[derive(Debug, Clone, Serialize, Deserialize)]
626
+ pub struct TextExtractionResult {
627
+ /// Extracted text content
628
+ pub content: String,
629
+ /// Number of lines
630
+ pub line_count: usize,
631
+ /// Number of words
632
+ pub word_count: usize,
633
+ /// Number of characters
634
+ pub character_count: usize,
635
+ /// Markdown headers (text only, Markdown files only)
636
+ #[serde(skip_serializing_if = "Option::is_none")]
637
+ pub headers: Option<Vec<String>>,
638
+ /// Markdown links as (text, URL) tuples (Markdown files only)
639
+ #[serde(skip_serializing_if = "Option::is_none")]
640
+ pub links: Option<Vec<(String, String)>>,
641
+ /// Code blocks as (language, code) tuples (Markdown files only)
642
+ #[serde(skip_serializing_if = "Option::is_none")]
643
+ pub code_blocks: Option<Vec<(String, String)>>,
644
+ }
645
+
646
+ /// PowerPoint (PPTX) extraction result.
647
+ ///
648
+ /// Contains extracted slide content, metadata, embedded images, and slide/image/table counts. There is no top-level table list: only `table_count` is stored here, and tables themselves are reachable only through `page_contents`.
649
+ #[derive(Debug, Clone, Serialize, Deserialize)]
650
+ pub struct PptxExtractionResult {
651
+ /// Extracted text content from all slides
652
+ pub content: String,
653
+ /// Presentation metadata
654
+ pub metadata: PptxMetadata,
655
+ /// Total number of slides
656
+ pub slide_count: usize,
657
+ /// Total number of embedded images
658
+ pub image_count: usize,
659
+ /// Total number of tables
660
+ pub table_count: usize,
661
+ /// Extracted images from the presentation
662
+ pub images: Vec<ExtractedImage>,
663
+ /// Slide structure with boundaries (when page tracking is enabled)
664
+ #[serde(skip_serializing_if = "Option::is_none")]
665
+ pub page_structure: Option<PageStructure>,
666
+ /// Per-slide content (when page tracking is enabled)
667
+ #[serde(skip_serializing_if = "Option::is_none")]
668
+ pub page_contents: Option<Vec<PageContent>>,
669
+ }
670
+
671
+ /// PowerPoint presentation metadata.
672
+ ///
673
+ /// Contains PPTX-specific metadata. Common fields like `title`, `authors`, and `subject` (description)
674
+ /// are now in the base `Metadata` struct.
675
+ #[derive(Debug, Clone, Serialize, Deserialize)]
676
+ pub struct PptxMetadata {
677
+ /// List of fonts used in the presentation (omitted from serialized output when empty; defaults to an empty list when missing on input)
678
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
679
+ pub fonts: Vec<String>,
680
+ }
681
+
682
+ /// Email extraction result.
683
+ ///
684
+ /// Complete representation of an extracted email message (.eml or .msg)
685
+ /// including headers, body content, and attachments. No field carries `skip_serializing_if`, so `None` values are written out explicitly rather than omitted.
686
+ #[derive(Debug, Clone, Serialize, Deserialize)]
687
+ pub struct EmailExtractionResult {
688
+ /// Email subject line
689
+ pub subject: Option<String>,
690
+ /// Sender email address
691
+ pub from_email: Option<String>,
692
+ /// Primary recipient email addresses
693
+ pub to_emails: Vec<String>,
694
+ /// CC recipient email addresses
695
+ pub cc_emails: Vec<String>,
696
+ /// BCC recipient email addresses
697
+ pub bcc_emails: Vec<String>,
698
+ /// Email date/timestamp
699
+ pub date: Option<String>,
700
+ /// Message-ID header value
701
+ pub message_id: Option<String>,
702
+ /// Plain text version of the email body
703
+ pub plain_text: Option<String>,
704
+ /// HTML version of the email body
705
+ pub html_content: Option<String>,
706
+ /// Cleaned/processed text content
707
+ pub cleaned_text: String,
708
+ /// List of email attachments
709
+ pub attachments: Vec<EmailAttachment>,
710
+ /// Additional email headers and metadata
711
+ pub metadata: HashMap<String, String>,
712
+ }
713
+
714
+ /// Email attachment representation.
715
+ ///
716
+ /// Contains metadata and optionally the content of an email attachment.
717
+ #[derive(Debug, Clone, Serialize, Deserialize)]
718
+ pub struct EmailAttachment {
719
+ /// Attachment name (from Content-Disposition header)
720
+ pub name: Option<String>,
721
+ /// Filename of the attachment
722
+ pub filename: Option<String>,
723
+ /// MIME type of the attachment
724
+ pub mime_type: Option<String>,
725
+ /// Size in bytes
726
+ pub size: Option<usize>,
727
+ /// Whether this attachment is an image
728
+ pub is_image: bool,
729
+ /// Raw attachment bytes (if extracted)
730
+ pub data: Option<Vec<u8>>,
731
+ }
732
+
733
+ /// OCR extraction result.
734
+ ///
735
+ /// Result of performing OCR on an image or scanned document,
736
+ /// including recognized text and detected tables.
737
+ #[derive(Debug, Clone, Serialize, Deserialize)]
738
+ pub struct OcrExtractionResult {
739
+ /// Recognized text content
740
+ pub content: String,
741
+ /// Original MIME type of the processed image
742
+ pub mime_type: String,
743
+ /// OCR processing metadata (confidence scores, language, etc.) as free-form JSON values keyed by string
744
+ pub metadata: HashMap<String, serde_json::Value>,
745
+ /// Tables detected and extracted via OCR
746
+ pub tables: Vec<OcrTable>,
747
+ }
748
+
749
+ /// Table detected via OCR.
750
+ ///
751
+ /// Represents a table structure recognized during OCR processing. Its fields have the same names and types as those of `Table`.
752
+ #[derive(Debug, Clone, Serialize, Deserialize)]
753
+ pub struct OcrTable {
754
+ /// Table cells as a 2D vector (rows × columns)
755
+ pub cells: Vec<Vec<String>>,
756
+ /// Markdown representation of the table
757
+ pub markdown: String,
758
+ /// Page number where the table was found (1-indexed)
759
+ pub page_number: usize,
760
+ }
761
+
762
+ /// Image preprocessing configuration for OCR.
763
+ ///
764
+ /// These settings control how images are preprocessed before OCR to improve
765
+ /// text recognition quality. Different preprocessing strategies work better
766
+ /// for different document types. Because of the container-level `#[serde(default)]`, any field missing from the input takes its value from this type's `Default` implementation.
767
+ #[derive(Debug, Clone, Serialize, Deserialize)]
768
+ #[serde(default)]
769
+ pub struct ImagePreprocessingConfig {
770
+ /// Target DPI for the image (300 is standard, 600 for small text).
771
+ pub target_dpi: i32,
772
+
773
+ /// Auto-detect and correct image rotation.
774
+ pub auto_rotate: bool,
775
+
776
+ /// Correct skew (tilted images).
777
+ pub deskew: bool,
778
+
779
+ /// Remove noise from the image.
780
+ pub denoise: bool,
781
+
782
+ /// Enhance contrast for better text visibility.
783
+ pub contrast_enhance: bool,
784
+
785
+ /// Binarization method: "otsu", "sauvola", "adaptive".
786
+ pub binarization_method: String,
787
+
788
+ /// Invert colors (white text on black → black on white).
789
+ pub invert_colors: bool,
790
+ }
791
+
792
+ impl Default for ImagePreprocessingConfig { // Defaults: 300 DPI, auto-rotate and deskew enabled, denoise/contrast-enhance/invert disabled, "otsu" binarization.
793
+ fn default() -> Self {
794
+ Self {
795
+ target_dpi: 300,
796
+ auto_rotate: true,
797
+ deskew: true,
798
+ denoise: false,
799
+ contrast_enhance: false,
800
+ binarization_method: "otsu".to_string(),
801
+ invert_colors: false,
802
+ }
803
+ }
804
+ }
805
+
806
+ /// Tesseract OCR configuration.
807
+ ///
808
+ /// Provides fine-grained control over Tesseract OCR engine parameters.
809
+ /// Most users can use the defaults, but these settings allow optimization
810
+ /// for specific document types (invoices, handwriting, etc.). Because of the container-level `#[serde(default)]`, any field missing from the input takes its value from this type's `Default` implementation.
811
+ #[derive(Debug, Clone, Serialize, Deserialize)]
812
+ #[serde(default)]
813
+ pub struct TesseractConfig {
814
+ /// Language code (e.g., "eng", "deu", "fra")
815
+ pub language: String,
816
+
817
+ /// Page Segmentation Mode (0-13).
818
+ ///
819
+ /// Common values:
820
+ /// - 3: Fully automatic page segmentation (default)
821
+ /// - 6: Assume a single uniform block of text
822
+ /// - 11: Sparse text with no particular order
823
+ pub psm: i32,
824
+
825
+ /// Output format ("text" or "markdown")
826
+ pub output_format: String,
827
+
828
+ /// OCR Engine Mode (0-3).
829
+ ///
830
+ /// - 0: Legacy engine only
831
+ /// - 1: Neural nets (LSTM) only (usually best)
832
+ /// - 2: Legacy + LSTM
833
+ /// - 3: Default (based on what's available)
834
+ pub oem: i32,
835
+
836
+ /// Minimum confidence threshold (0.0-100.0).
837
+ ///
838
+ /// Words with confidence below this threshold may be rejected or flagged.
839
+ pub min_confidence: f64,
840
+
841
+ /// Image preprocessing configuration.
842
+ ///
843
+ /// Controls how images are preprocessed before OCR. Can significantly
844
+ /// improve quality for scanned documents or low-quality images. Omitted from serialized output when `None`.
845
+ #[serde(skip_serializing_if = "Option::is_none")]
846
+ pub preprocessing: Option<ImagePreprocessingConfig>,
847
+
848
+ /// Enable automatic table detection and reconstruction
849
+ pub enable_table_detection: bool,
850
+
851
+ /// Minimum confidence threshold for table detection (0.0-1.0)
852
+ pub table_min_confidence: f64,
853
+
854
+ /// Column threshold for table detection (pixels)
855
+ pub table_column_threshold: i32,
856
+
857
+ /// Row threshold ratio for table detection (0.0-1.0)
858
+ pub table_row_threshold_ratio: f64,
859
+
860
+ /// Enable OCR result caching
861
+ pub use_cache: bool,
862
+
863
+ /// Use pre-adapted templates for character classification
864
+ pub classify_use_pre_adapted_templates: bool,
865
+
866
+ /// Enable N-gram language model
867
+ pub language_model_ngram_on: bool,
868
+
869
+ /// Don't reject good words during block-level processing
870
+ pub tessedit_dont_blkrej_good_wds: bool,
871
+
872
+ /// Don't reject good words during row-level processing
873
+ pub tessedit_dont_rowrej_good_wds: bool,
874
+
875
+ /// Enable dictionary correction
876
+ pub tessedit_enable_dict_correction: bool,
877
+
878
+ /// Whitelist of allowed characters (empty = all allowed)
879
+ pub tessedit_char_whitelist: String,
880
+
881
+ /// Blacklist of forbidden characters (empty = none forbidden)
882
+ pub tessedit_char_blacklist: String,
883
+
884
+ /// Use primary language params model
885
+ pub tessedit_use_primary_params_model: bool,
886
+
887
+ /// Variable-width space detection
888
+ pub textord_space_size_is_variable: bool,
889
+
890
+ /// Use adaptive thresholding method
891
+ pub thresholding_method: bool,
892
+ }
893
+
894
+ impl Default for TesseractConfig { // Defaults: English, PSM 3 and OEM 3 (the values the field docs describe as the automatic/default modes), "markdown" output, no preprocessing, table detection and caching enabled.
895
+ fn default() -> Self {
896
+ Self {
897
+ language: "eng".to_string(),
898
+ psm: 3,
899
+ output_format: "markdown".to_string(),
900
+ oem: 3,
901
+ min_confidence: 0.0,
902
+ preprocessing: None,
903
+ enable_table_detection: true,
904
+ table_min_confidence: 0.0,
905
+ table_column_threshold: 50,
906
+ table_row_threshold_ratio: 0.5,
907
+ use_cache: true,
908
+ classify_use_pre_adapted_templates: true,
909
+ language_model_ngram_on: false,
910
+ tessedit_dont_blkrej_good_wds: true,
911
+ tessedit_dont_rowrej_good_wds: true,
912
+ tessedit_enable_dict_correction: true,
913
+ tessedit_char_whitelist: String::new(),
914
+ tessedit_char_blacklist: String::new(),
915
+ tessedit_use_primary_params_model: true,
916
+ textord_space_size_is_variable: true,
917
+ thresholding_method: false,
918
+ }
919
+ }
920
+ }
921
+
922
+ /// Image preprocessing metadata.
923
+ ///
924
+ /// Tracks the transformations applied to an image during OCR preprocessing,
925
+ /// including DPI normalization, resizing, and resampling. No field carries `skip_serializing_if`, so the optional fields (`new_dimensions`, `calculated_dpi`, `resize_error`) are written out explicitly even when `None`.
926
+ #[derive(Debug, Clone, Serialize, Deserialize)]
927
+ pub struct ImagePreprocessingMetadata {
928
+ /// Original image dimensions (width, height) in pixels
929
+ pub original_dimensions: (usize, usize),
930
+ /// Original image DPI (horizontal, vertical)
931
+ pub original_dpi: (f64, f64),
932
+ /// Target DPI from configuration
933
+ pub target_dpi: i32,
934
+ /// Scaling factor applied to the image
935
+ pub scale_factor: f64,
936
+ /// Whether DPI was auto-adjusted based on content
937
+ pub auto_adjusted: bool,
938
+ /// Final DPI after processing
939
+ pub final_dpi: i32,
940
+ /// New dimensions after resizing (if resized)
941
+ pub new_dimensions: Option<(usize, usize)>,
942
+ /// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
943
+ pub resample_method: String,
944
+ /// Whether dimensions were clamped to max_image_dimension
945
+ pub dimension_clamped: bool,
946
+ /// Calculated optimal DPI (if auto_adjust_dpi enabled)
947
+ pub calculated_dpi: Option<i32>,
948
+ /// Whether resize was skipped (dimensions already optimal)
949
+ pub skipped_resize: bool,
950
+ /// Error message if resize failed
951
+ pub resize_error: Option<String>,
952
+ }
953
+
954
+ /// Image extraction configuration (internal use).
955
+ ///
956
+ /// **Note:** This is an internal type used for image preprocessing.
957
+ /// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
958
+ #[derive(Debug, Clone, Serialize, Deserialize)]
959
+ pub struct ExtractionConfig {
960
+ /// Target DPI for image normalization
961
+ pub target_dpi: i32,
962
+ /// Maximum image dimension (width or height)
963
+ pub max_image_dimension: i32,
964
+ /// Whether to auto-adjust DPI based on content
965
+ pub auto_adjust_dpi: bool,
966
+ /// Minimum DPI threshold
967
+ pub min_dpi: i32,
968
+ /// Maximum DPI threshold
969
+ pub max_dpi: i32,
970
+ }
971
+
972
+ impl Default for ExtractionConfig {
973
+ fn default() -> Self {
974
+ Self {
975
+ target_dpi: 300,
976
+ max_image_dimension: 4096,
977
+ auto_adjust_dpi: true,
978
+ min_dpi: 72,
979
+ max_dpi: 600,
980
+ }
981
+ }
982
+ }
983
+
984
+ /// Cache statistics.
985
+ ///
986
+ /// Provides information about the extraction result cache,
987
+ /// including size, file count, and age distribution.
988
+ #[derive(Debug, Clone, Serialize, Deserialize)]
989
+ pub struct CacheStats {
990
+ /// Total number of cached files
991
+ pub total_files: usize,
992
+ /// Total cache size in megabytes
993
+ pub total_size_mb: f64,
994
+ /// Available disk space in megabytes
995
+ pub available_space_mb: f64,
996
+ /// Age of the oldest cached file in days
997
+ pub oldest_file_age_days: f64,
998
+ /// Age of the newest cached file in days
999
+ pub newest_file_age_days: f64,
1000
+ }
1001
+
1002
+ /// LibreOffice conversion result.
1003
+ ///
1004
+ /// Result of converting a legacy office document (e.g., .doc, .ppt)
1005
+ /// to a modern format using LibreOffice.
1006
+ #[derive(Debug, Clone, Serialize, Deserialize)]
1007
+ pub struct LibreOfficeConversionResult {
1008
+ /// Converted file bytes
1009
+ pub converted_bytes: Vec<u8>,
1010
+ /// Original format identifier
1011
+ pub original_format: String,
1012
+ /// Target format identifier
1013
+ pub target_format: String,
1014
+ /// Target MIME type after conversion
1015
+ pub target_mime: String,
1016
+ }
1017
+
1018
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializing a `Metadata` value that carries text format metadata must
    /// expose `format_type` and the text counters as top-level JSON keys,
    /// alongside entries stored in `additional`.
    #[test]
    fn test_metadata_serialization_with_format() {
        let text_metadata = TextMetadata {
            line_count: 1,
            word_count: 2,
            character_count: 13,
            headers: None,
            links: None,
            code_blocks: None,
        };
        let mut metadata = Metadata {
            format: Some(FormatMetadata::Text(text_metadata)),
            ..Default::default()
        };
        metadata
            .additional
            .insert(String::from("quality_score"), serde_json::json!(1.0));

        let json = serde_json::to_value(&metadata).unwrap();
        println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());

        // The format discriminator is present and identifies the text variant.
        assert!(
            json.get("format_type").is_some(),
            "format_type should be present in serialized JSON"
        );
        assert_eq!(json.get("format_type").unwrap(), "text");

        // Text counters are readable directly from the top-level object.
        for (key, expected) in [("line_count", 1), ("word_count", 2), ("character_count", 13)] {
            assert_eq!(json.get(key).unwrap(), expected);
        }

        // Entries from `additional` sit at the top level as well.
        assert_eq!(json.get("quality_score").unwrap(), 1.0);
    }
}