kreuzberg 4.0.0.pre.rc.11 → 4.0.0.pre.rc.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +538 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +2 -105
  8. data/README.md +454 -454
  9. data/Rakefile +25 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -341
  12. data/ext/kreuzberg_rb/extconf.rb +45 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
  14. data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6941
  15. data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
  23. data/extconf.rb +28 -28
  24. data/kreuzberg.gemspec +214 -214
  25. data/lib/kreuzberg/api_proxy.rb +142 -142
  26. data/lib/kreuzberg/cache_api.rb +81 -81
  27. data/lib/kreuzberg/cli.rb +55 -55
  28. data/lib/kreuzberg/cli_proxy.rb +127 -127
  29. data/lib/kreuzberg/config.rb +724 -724
  30. data/lib/kreuzberg/error_context.rb +80 -80
  31. data/lib/kreuzberg/errors.rb +118 -118
  32. data/lib/kreuzberg/extraction_api.rb +340 -340
  33. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  36. data/lib/kreuzberg/result.rb +279 -279
  37. data/lib/kreuzberg/setup_lib_path.rb +80 -80
  38. data/lib/kreuzberg/validator_protocol.rb +89 -89
  39. data/lib/kreuzberg/version.rb +5 -5
  40. data/lib/kreuzberg.rb +109 -109
  41. data/lib/{libpdfium.dylib → pdfium.dll} +0 -0
  42. data/sig/kreuzberg/internal.rbs +184 -184
  43. data/sig/kreuzberg.rbs +546 -546
  44. data/spec/binding/cache_spec.rb +227 -227
  45. data/spec/binding/cli_proxy_spec.rb +85 -85
  46. data/spec/binding/cli_spec.rb +55 -55
  47. data/spec/binding/config_spec.rb +345 -345
  48. data/spec/binding/config_validation_spec.rb +283 -283
  49. data/spec/binding/error_handling_spec.rb +213 -213
  50. data/spec/binding/errors_spec.rb +66 -66
  51. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  52. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  53. data/spec/binding/plugins/validator_spec.rb +274 -274
  54. data/spec/fixtures/config.toml +39 -39
  55. data/spec/fixtures/config.yaml +41 -41
  56. data/spec/fixtures/invalid_config.toml +4 -4
  57. data/spec/smoke/package_spec.rb +178 -178
  58. data/spec/spec_helper.rb +42 -42
  59. data/vendor/Cargo.toml +2 -1
  60. data/vendor/kreuzberg/Cargo.toml +2 -2
  61. data/vendor/kreuzberg/README.md +230 -230
  62. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
  63. data/vendor/kreuzberg/build.rs +843 -843
  64. data/vendor/kreuzberg/src/api/error.rs +81 -81
  65. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  66. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  67. data/vendor/kreuzberg/src/api/server.rs +353 -353
  68. data/vendor/kreuzberg/src/api/types.rs +170 -170
  69. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  70. data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
  71. data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
  72. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  73. data/vendor/kreuzberg/src/core/config.rs +1080 -1080
  74. data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
  75. data/vendor/kreuzberg/src/core/io.rs +329 -329
  76. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  77. data/vendor/kreuzberg/src/core/mod.rs +47 -47
  78. data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
  79. data/vendor/kreuzberg/src/embeddings.rs +500 -500
  80. data/vendor/kreuzberg/src/error.rs +431 -431
  81. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  82. data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
  83. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  84. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  85. data/vendor/kreuzberg/src/extraction/html.rs +601 -601
  86. data/vendor/kreuzberg/src/extraction/image.rs +491 -491
  87. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
  88. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
  89. data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
  90. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  91. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  92. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  93. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  94. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
  95. data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
  96. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  97. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  98. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  99. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  100. data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
  101. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
  102. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
  103. data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
  104. data/vendor/kreuzberg/src/extractors/email.rs +157 -157
  105. data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
  106. data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
  107. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
  108. data/vendor/kreuzberg/src/extractors/html.rs +407 -407
  109. data/vendor/kreuzberg/src/extractors/image.rs +219 -219
  110. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
  112. data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
  114. data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
  115. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  116. data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
  117. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
  118. data/vendor/kreuzberg/src/extractors/pdf.rs +749 -722
  119. data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
  120. data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
  121. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
  122. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  123. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  124. data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
  125. data/vendor/kreuzberg/src/extractors/text.rs +265 -265
  126. data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
  127. data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
  128. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  129. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  130. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  131. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  132. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  133. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  134. data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
  135. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  136. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  137. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  138. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
  139. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
  140. data/vendor/kreuzberg/src/lib.rs +113 -113
  141. data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
  142. data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
  143. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  144. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  145. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  146. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  147. data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
  148. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  149. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  150. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
  151. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  152. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  153. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  154. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  155. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
  156. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
  157. data/vendor/kreuzberg/src/pdf/error.rs +130 -130
  158. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  159. data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
  160. data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
  161. data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
  162. data/vendor/kreuzberg/src/pdf/table.rs +420 -420
  163. data/vendor/kreuzberg/src/pdf/text.rs +240 -240
  164. data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
  165. data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
  166. data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
  167. data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
  168. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
  169. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  170. data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
  171. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  172. data/vendor/kreuzberg/src/text/mod.rs +25 -25
  173. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  174. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
  175. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  176. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  177. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  178. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  179. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  180. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  181. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  182. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  183. data/vendor/kreuzberg/src/types.rs +1055 -1055
  184. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  185. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  186. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  187. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  188. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  189. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  190. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  191. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  192. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  193. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  194. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  195. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  196. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  198. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  199. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  200. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  201. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  202. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  203. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  204. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  205. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  206. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  207. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  208. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  209. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  210. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  211. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  212. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  213. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  214. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  215. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  216. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  217. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  218. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  219. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  220. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  221. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  222. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  223. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  224. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  225. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  226. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  227. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  228. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  229. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  230. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  231. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  232. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  233. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  234. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  235. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  236. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  237. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  238. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  239. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  240. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  241. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  242. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  243. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  244. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  245. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  246. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  247. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  248. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  249. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  250. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  251. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  252. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  253. data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
  254. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
  255. data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
  256. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  257. data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
  258. data/vendor/kreuzberg/tests/config_features.rs +612 -612
  259. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
  260. data/vendor/kreuzberg/tests/core_integration.rs +510 -510
  261. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  262. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
  263. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  264. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  265. data/vendor/kreuzberg/tests/email_integration.rs +327 -327
  266. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  267. data/vendor/kreuzberg/tests/error_handling.rs +402 -402
  268. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  269. data/vendor/kreuzberg/tests/format_integration.rs +164 -164
  270. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  271. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  272. data/vendor/kreuzberg/tests/image_integration.rs +255 -255
  273. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  274. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  275. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  276. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  277. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  278. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  279. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  280. data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
  281. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
  282. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
  283. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
  284. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  285. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
  286. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  287. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  288. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
  289. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
  290. data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
  291. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
  292. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
  293. data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
  294. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  295. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
  296. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
  297. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
  298. data/vendor/kreuzberg/tests/security_validation.rs +416 -416
  299. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  300. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
  301. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
  302. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
  303. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  304. data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
  305. data/vendor/kreuzberg-ffi/README.md +851 -851
  306. data/vendor/kreuzberg-ffi/build.rs +176 -176
  307. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
  308. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -12
  309. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
  310. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
  311. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
  312. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
  313. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
  314. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
  315. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
  316. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
  317. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
  318. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
  319. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
  320. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
  321. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  322. data/vendor/kreuzberg-tesseract/LICENSE +22 -22
  323. data/vendor/kreuzberg-tesseract/README.md +399 -399
  324. data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
  325. data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
  326. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
  327. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
  328. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
  329. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
  330. data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
  331. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
  332. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
  333. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
  334. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
  335. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
  336. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
  337. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
  338. data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
  339. data/vendor/rb-sys/Cargo.lock +393 -393
  340. data/vendor/rb-sys/Cargo.toml +70 -70
  341. data/vendor/rb-sys/Cargo.toml.orig +57 -57
  342. data/vendor/rb-sys/LICENSE-APACHE +190 -190
  343. data/vendor/rb-sys/LICENSE-MIT +21 -21
  344. data/vendor/rb-sys/build/features.rs +111 -111
  345. data/vendor/rb-sys/build/main.rs +286 -286
  346. data/vendor/rb-sys/build/stable_api_config.rs +155 -155
  347. data/vendor/rb-sys/build/version.rs +50 -50
  348. data/vendor/rb-sys/readme.md +36 -36
  349. data/vendor/rb-sys/src/bindings.rs +21 -21
  350. data/vendor/rb-sys/src/hidden.rs +11 -11
  351. data/vendor/rb-sys/src/lib.rs +35 -35
  352. data/vendor/rb-sys/src/macros.rs +371 -371
  353. data/vendor/rb-sys/src/memory.rs +53 -53
  354. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
  355. data/vendor/rb-sys/src/special_consts.rs +31 -31
  356. data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
  357. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
  358. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
  359. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
  360. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
  361. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
  362. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
  363. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
  364. data/vendor/rb-sys/src/stable_api.rs +260 -260
  365. data/vendor/rb-sys/src/symbol.rs +31 -31
  366. data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
  367. data/vendor/rb-sys/src/utils.rs +89 -89
  368. data/vendor/rb-sys/src/value_type.rs +7 -7
  369. metadata +7 -80
@@ -1,674 +1,674 @@
1
- //! Comprehensive TDD test suite for ODT (OpenDocument Text) extraction
2
- //!
3
- //! This test suite validates ODT extraction capabilities using Pandoc's output as the baseline.
4
- //! It covers:
5
- //! - Metadata extraction (title, creator, date, keywords from meta.xml)
6
- //! - Content extraction (text, formatting, structure)
7
- //! - Table extraction with captions
8
- //! - Formatting preservation (bold, italic, strikeout)
9
- //! - Image handling with captions
10
- //! - Math formula extraction
11
- //! - Note handling (footnotes, endnotes)
12
- //! - Citation/reference extraction
13
- //! - Unicode and special character handling
14
- //!
15
- //! Note: These tests require the `office` feature to be enabled and Pandoc to be installed.
16
-
17
- #![cfg(feature = "office")]
18
-
19
- use kreuzberg::core::config::ExtractionConfig;
20
- use kreuzberg::core::extractor::extract_file;
21
- use std::path::{Path, PathBuf};
22
-
23
- mod helpers;
24
-
25
- /// Helper function to get the workspace root and construct test file paths
26
- fn get_test_file_path(filename: &str) -> PathBuf {
27
- let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
28
- .parent()
29
- .unwrap()
30
- .parent()
31
- .unwrap();
32
- workspace_root.join(format!("test_documents/odt/{}", filename))
33
- }
34
-
35
- /// Helper to verify a test file exists before running test
36
- fn ensure_test_file_exists(path: &Path) -> bool {
37
- if !path.exists() {
38
- println!("Skipping test: Test file not found at {:?}", path);
39
- false
40
- } else {
41
- true
42
- }
43
- }
44
-
45
- /// Tests extraction of document metadata from ODT meta.xml
46
- /// Validates: title, subject, creator, dates, generator
47
- #[tokio::test]
48
- async fn test_odt_metadata_extraction() {
49
- let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
50
- .parent()
51
- .unwrap()
52
- .parent()
53
- .unwrap();
54
- let test_file = workspace_root.join("test_documents/metadata_test.odt");
55
-
56
- if !ensure_test_file_exists(&test_file) {
57
- println!("Skipping metadata test: metadata_test.odt not found");
58
- return;
59
- }
60
-
61
- let config = ExtractionConfig::default();
62
- let result = extract_file(&test_file, None, &config)
63
- .await
64
- .expect("Should extract ODT metadata successfully");
65
-
66
- assert!(!result.content.is_empty(), "Content should not be empty");
67
- assert!(
68
- result.content.contains("Test Document"),
69
- "Should contain document title in content"
70
- );
71
-
72
- let metadata = &result.metadata.additional;
73
- println!("Extracted metadata: {:?}", metadata);
74
-
75
- if let Some(title) = metadata.get("title") {
76
- assert_eq!(title.as_str(), Some("Test Metadata Document"), "Title should match");
77
- }
78
-
79
- if let Some(subject) = metadata.get("subject") {
80
- assert_eq!(
81
- subject.as_str(),
82
- Some("Testing ODT Metadata Extraction"),
83
- "Subject should match"
84
- );
85
- }
86
-
87
- if let Some(created_by) = metadata.get("created_by") {
88
- assert_eq!(created_by.as_str(), Some("John Doe"), "Creator should match");
89
- }
90
-
91
- if let Some(authors) = metadata.get("authors") {
92
- let authors_array = authors.as_array().expect("Authors should be an array");
93
- assert_eq!(authors_array.len(), 1, "Should have one author");
94
- assert_eq!(authors_array[0].as_str(), Some("John Doe"), "Author name should match");
95
- }
96
-
97
- assert!(metadata.get("created_at").is_some(), "Creation date should be present");
98
-
99
- assert!(
100
- metadata.get("modified_at").is_some(),
101
- "Modification date should be present"
102
- );
103
-
104
- if let Some(generator) = metadata.get("generator") {
105
- let gen_str = generator.as_str().expect("Generator should be a string");
106
- assert!(gen_str.contains("Pandoc"), "Generator should be Pandoc");
107
- }
108
-
109
- println!("✅ ODT metadata extraction test passed!");
110
- println!(" Metadata fields extracted: {}", metadata.len());
111
- }
112
-
113
- /// Tests extraction of tables with captions from ODT
114
- /// Baseline from Pandoc: simpleTableWithCaption.odt
115
- /// Expected Pandoc output:
116
- /// ```
117
- /// --------- --------------
118
- /// Content More content
119
- /// --------- --------------
120
- /// : Table 1: Some caption for a table
121
- /// ```
122
- #[tokio::test]
123
- async fn test_odt_table_with_caption_extraction() {
124
- let test_file = get_test_file_path("simpleTableWithCaption.odt");
125
- if !ensure_test_file_exists(&test_file) {
126
- return;
127
- }
128
-
129
- let config = ExtractionConfig::default();
130
- let result = extract_file(&test_file, None, &config).await;
131
-
132
- if let Ok(result) = result {
133
- if !result.content.is_empty() {
134
- let content_lower = result.content.to_lowercase();
135
- assert!(
136
- content_lower.contains("content") || content_lower.contains("table") || !result.tables.is_empty(),
137
- "Should either extract table content or structured tables"
138
- );
139
- }
140
- println!("✅ ODT table with caption extraction test passed!");
141
- println!(" Extracted {} tables", result.tables.len());
142
- } else {
143
- println!("⚠️ ODT table extraction not fully supported yet (Pandoc integration needed)");
144
- }
145
- }
146
-
147
- /// Tests extraction of basic tables without captions
148
- /// Baseline from Pandoc: simpleTable.odt
149
- /// Expected: Table with "Content" and "More content" cells
150
- #[tokio::test]
151
- async fn test_odt_simple_table_extraction() {
152
- let test_file = get_test_file_path("simpleTable.odt");
153
- if !ensure_test_file_exists(&test_file) {
154
- return;
155
- }
156
-
157
- let config = ExtractionConfig::default();
158
- let result = extract_file(&test_file, None, &config).await;
159
-
160
- if let Ok(result) = result {
161
- if !result.content.is_empty() {
162
- let content_lower = result.content.to_lowercase();
163
- assert!(
164
- content_lower.contains("content") || !result.tables.is_empty(),
165
- "Table should either contain 'content' text or be in structured tables"
166
- );
167
- }
168
- println!("✅ ODT simple table extraction test passed!");
169
- } else {
170
- println!("⚠️ ODT table extraction not fully supported yet");
171
- }
172
- }
173
-
174
- /// Tests extraction of document heading hierarchy
175
- /// Baseline from Pandoc: headers.odt
176
- /// Expected:
177
- /// - H1: "A header (Lv 1)"
178
- /// - H2: "Another header (Lv 2)"
179
- /// - H1: "Back to Level 1"
180
- #[tokio::test]
181
- async fn test_odt_heading_structure_extraction() {
182
- let test_file = get_test_file_path("headers.odt");
183
- if !ensure_test_file_exists(&test_file) {
184
- return;
185
- }
186
-
187
- let config = ExtractionConfig::default();
188
- let result = extract_file(&test_file, None, &config)
189
- .await
190
- .expect("Should extract heading structure successfully");
191
-
192
- assert!(!result.content.is_empty(), "Content should not be empty");
193
-
194
- assert!(
195
- result.content.contains("header") || result.content.contains("Header"),
196
- "Should contain heading text"
197
- );
198
-
199
- assert!(
200
- result.content.contains("#") || result.content.contains("header"),
201
- "Should indicate heading structure"
202
- );
203
-
204
- println!("✅ ODT heading structure extraction test passed!");
205
- }
206
-
207
- /// Tests extraction of bold text formatting
208
- /// Baseline from Pandoc: bold.odt
209
- /// Expected Pandoc output: "Here comes **bold** text"
210
- #[tokio::test]
211
- async fn test_odt_bold_formatting_extraction() {
212
- let test_file = get_test_file_path("bold.odt");
213
- if !ensure_test_file_exists(&test_file) {
214
- return;
215
- }
216
-
217
- let config = ExtractionConfig::default();
218
- let result = extract_file(&test_file, None, &config)
219
- .await
220
- .expect("Should extract bold formatting successfully");
221
-
222
- assert!(!result.content.is_empty(), "Content should not be empty");
223
-
224
- let content = result.content.to_lowercase();
225
- assert!(content.contains("bold"), "Should contain 'bold' text");
226
-
227
- assert!(
228
- result.content.contains("**bold**") || result.content.contains("bold"),
229
- "Should preserve bold text"
230
- );
231
-
232
- println!("✅ ODT bold formatting extraction test passed!");
233
- }
234
-
235
- /// Tests extraction of italic text formatting
236
- /// Baseline from Pandoc: italic.odt
237
- /// Expected Pandoc output: "Here comes *italic* text"
238
- #[tokio::test]
239
- async fn test_odt_italic_formatting_extraction() {
240
- let test_file = get_test_file_path("italic.odt");
241
- if !ensure_test_file_exists(&test_file) {
242
- return;
243
- }
244
-
245
- let config = ExtractionConfig::default();
246
- let result = extract_file(&test_file, None, &config)
247
- .await
248
- .expect("Should extract italic formatting successfully");
249
-
250
- assert!(!result.content.is_empty(), "Content should not be empty");
251
-
252
- let content = result.content.to_lowercase();
253
- assert!(content.contains("italic"), "Should contain 'italic' text");
254
-
255
- assert!(
256
- result.content.contains("*italic*") || result.content.contains("italic"),
257
- "Should preserve italic text"
258
- );
259
-
260
- println!("✅ ODT italic formatting extraction test passed!");
261
- }
262
-
263
- /// Tests extraction of strikeout/strikethrough text formatting
264
- /// Baseline from Pandoc: strikeout.odt
265
- /// Expected Pandoc output: "Here comes text that was ~~striken out~~."
266
- #[tokio::test]
267
- async fn test_odt_strikeout_formatting_extraction() {
268
- let test_file = get_test_file_path("strikeout.odt");
269
- if !ensure_test_file_exists(&test_file) {
270
- return;
271
- }
272
-
273
- let config = ExtractionConfig::default();
274
- let result = extract_file(&test_file, None, &config)
275
- .await
276
- .expect("Should extract strikeout formatting successfully");
277
-
278
- assert!(!result.content.is_empty(), "Content should not be empty");
279
-
280
- let content = result.content.to_lowercase();
281
- assert!(
282
- content.contains("strike") || content.contains("striken"),
283
- "Should contain strikeout text"
284
- );
285
-
286
- println!("✅ ODT strikeout formatting extraction test passed!");
287
- }
288
-
289
- /// Tests extraction of images with captions
290
- /// Baseline from Pandoc: imageWithCaption.odt
291
- /// Expected: Image reference with caption
292
- /// Expected Pandoc output:
293
- /// ```
294
- /// ![Image caption](Pictures/10000000000000FA000000FAD6A15225.jpg)
295
- /// {alt="Abbildung 1: Image caption" width="5.292cm" height="5.292cm"}
296
- /// ```
297
- #[tokio::test]
298
- async fn test_odt_image_with_caption_extraction() {
299
- let test_file = get_test_file_path("imageWithCaption.odt");
300
- if !ensure_test_file_exists(&test_file) {
301
- return;
302
- }
303
-
304
- let config = ExtractionConfig::default();
305
- let result = extract_file(&test_file, None, &config).await;
306
-
307
- if let Ok(result) = result {
308
- if !result.content.is_empty() {
309
- let content_lower = result.content.to_lowercase();
310
- assert!(
311
- content_lower.contains("image")
312
- || content_lower.contains("caption")
313
- || content_lower.contains("!")
314
- || result.images.is_some(),
315
- "Should reference image or caption or have extracted images"
316
- );
317
- }
318
- println!("✅ ODT image with caption extraction test passed!");
319
- } else {
320
- println!("⚠️ ODT image extraction not fully supported yet");
321
- }
322
- }
323
-
324
- /// Tests extraction of mathematical formulas
325
- /// Baseline from Pandoc: formula.odt
326
- /// Expected Pandoc output: "$$E = {m \\cdot c^{2}}$$"
327
- #[tokio::test]
328
- async fn test_odt_formula_extraction() {
329
- let test_file = get_test_file_path("formula.odt");
330
- if !ensure_test_file_exists(&test_file) {
331
- return;
332
- }
333
-
334
- let config = ExtractionConfig::default();
335
- let result = extract_file(&test_file, None, &config)
336
- .await
337
- .expect("Should extract formula successfully");
338
-
339
- assert!(!result.content.is_empty(), "Content should not be empty");
340
-
341
- let content = &result.content;
342
- assert!(
343
- content.contains("E") && (content.contains("m") || content.contains("$")),
344
- "Should extract formula content"
345
- );
346
-
347
- println!("✅ ODT formula extraction test passed!");
348
- }
349
-
350
- /// Tests extraction of footnotes
351
- /// Baseline from Pandoc: footnote.odt
352
- /// Expected Pandoc output:
353
- /// ```
354
- /// Some text[^1] with a footnote.
355
- ///
356
- /// [^1]: Footnote text
357
- /// ```
358
- #[tokio::test]
359
- async fn test_odt_footnote_extraction() {
360
- let test_file = get_test_file_path("footnote.odt");
361
- if !ensure_test_file_exists(&test_file) {
362
- return;
363
- }
364
-
365
- let config = ExtractionConfig::default();
366
- let result = extract_file(&test_file, None, &config)
367
- .await
368
- .expect("Should extract footnote successfully");
369
-
370
- assert!(!result.content.is_empty(), "Content should not be empty");
371
-
372
- let content_lower = result.content.to_lowercase();
373
- assert!(
374
- content_lower.contains("footnote") || content_lower.contains("[^"),
375
- "Should extract footnote"
376
- );
377
-
378
- println!("✅ ODT footnote extraction test passed!");
379
- }
380
-
381
- /// Tests extraction of endnotes
382
- /// Baseline from Pandoc: endnote.odt
383
- /// Expected: Endnote content with reference (similar to footnotes)
384
- #[tokio::test]
385
- async fn test_odt_endnote_extraction() {
386
- let test_file = get_test_file_path("endnote.odt");
387
- if !ensure_test_file_exists(&test_file) {
388
- return;
389
- }
390
-
391
- let config = ExtractionConfig::default();
392
- let result = extract_file(&test_file, None, &config)
393
- .await
394
- .expect("Should extract endnote successfully");
395
-
396
- assert!(!result.content.is_empty(), "Content should not be empty");
397
-
398
- let content_lower = result.content.to_lowercase();
399
- assert!(
400
- content_lower.contains("endnote") || content_lower.contains("[^"),
401
- "Should extract endnote"
402
- );
403
-
404
- println!("✅ ODT endnote extraction test passed!");
405
- }
406
-
407
- /// Tests extraction of citations and references
408
- /// Baseline from Pandoc: citation.odt
409
- /// Expected Pandoc output: "Some text[@Ex] with a citation."
410
- #[tokio::test]
411
- async fn test_odt_citation_extraction() {
412
- let test_file = get_test_file_path("citation.odt");
413
- if !ensure_test_file_exists(&test_file) {
414
- return;
415
- }
416
-
417
- let config = ExtractionConfig::default();
418
- let result = extract_file(&test_file, None, &config)
419
- .await
420
- .expect("Should extract citation successfully");
421
-
422
- assert!(!result.content.is_empty(), "Content should not be empty");
423
-
424
- let content_lower = result.content.to_lowercase();
425
- assert!(
426
- content_lower.contains("citation") || content_lower.contains("text") || content_lower.contains("@"),
427
- "Should extract citation"
428
- );
429
-
430
- println!("✅ ODT citation extraction test passed!");
431
- }
432
-
433
- /// Tests extraction of unicode characters and special symbols
434
- /// Baseline from Pandoc: unicode.odt
435
- /// Expected: Proper preservation of unicode characters
436
- /// Expected Pandoc output: ""'çӨ©¼вбФШöɵ"
437
- #[tokio::test]
438
- async fn test_odt_unicode_extraction() {
439
- let test_file = get_test_file_path("unicode.odt");
440
- if !ensure_test_file_exists(&test_file) {
441
- return;
442
- }
443
-
444
- let config = ExtractionConfig::default();
445
- let result = extract_file(&test_file, None, &config)
446
- .await
447
- .expect("Should extract unicode successfully");
448
-
449
- assert!(!result.content.is_empty(), "Content should not be empty");
450
-
451
- assert!(!result.content.is_empty(), "Should extract unicode content (not empty)");
452
-
453
- println!("✅ ODT unicode extraction test passed!");
454
- println!(" Extracted unicode content: {:?}", result.content);
455
- }
456
-
457
- /// Tests extraction of inline code formatting
458
- /// Baseline from Pandoc: inlinedCode.odt
459
- /// Expected Pandoc output: "Here comes `inlined code` text and `an another` one."
460
- #[tokio::test]
461
- async fn test_odt_inlined_code_extraction() {
462
- let test_file = get_test_file_path("inlinedCode.odt");
463
- if !ensure_test_file_exists(&test_file) {
464
- return;
465
- }
466
-
467
- let config = ExtractionConfig::default();
468
- let result = extract_file(&test_file, None, &config)
469
- .await
470
- .expect("Should extract inline code successfully");
471
-
472
- assert!(!result.content.is_empty(), "Content should not be empty");
473
-
474
- let content_lower = result.content.to_lowercase();
475
- assert!(
476
- content_lower.contains("code") || content_lower.contains("`"),
477
- "Should extract inline code"
478
- );
479
-
480
- println!("✅ ODT inline code extraction test passed!");
481
- }
482
-
483
- /// Tests extraction of paragraph structure and content
484
- /// Baseline from Pandoc: paragraph.odt
485
- /// Expected: Multiple paragraphs separated by blank lines
486
- #[tokio::test]
487
- async fn test_odt_paragraph_structure_extraction() {
488
- let test_file = get_test_file_path("paragraph.odt");
489
- if !ensure_test_file_exists(&test_file) {
490
- return;
491
- }
492
-
493
- let config = ExtractionConfig::default();
494
- let result = extract_file(&test_file, None, &config)
495
- .await
496
- .expect("Should extract paragraph structure successfully");
497
-
498
- assert!(!result.content.is_empty(), "Content should not be empty");
499
-
500
- let content_lower = result.content.to_lowercase();
501
- assert!(content_lower.contains("paragraph"), "Should contain paragraph text");
502
-
503
- let paragraph_count = result.content.split('\n').filter(|l| !l.is_empty()).count();
504
- assert!(paragraph_count >= 2, "Should extract multiple paragraphs");
505
-
506
- println!("✅ ODT paragraph structure extraction test passed!");
507
- println!(" Extracted {} paragraph segments", paragraph_count);
508
- }
509
-
510
- /// Integration test: Verify ODT extraction works with standard API
511
- #[tokio::test]
512
- async fn test_odt_extraction_api_integration() {
513
- let test_file = get_test_file_path("bold.odt");
514
- if !ensure_test_file_exists(&test_file) {
515
- return;
516
- }
517
-
518
- let config = ExtractionConfig::default();
519
- let result = extract_file(&test_file, None, &config)
520
- .await
521
- .expect("Should extract via standard API");
522
-
523
- assert!(!result.content.is_empty(), "Should have content");
524
- assert_eq!(result.mime_type, "application/vnd.oasis.opendocument.text");
525
-
526
- println!("✅ ODT extraction API integration test passed!");
527
- }
528
-
529
- /// Test error handling for non-existent files
530
- #[tokio::test]
531
- async fn test_odt_extraction_missing_file_handling() {
532
- let test_file = get_test_file_path("nonexistent.odt");
533
- let config = ExtractionConfig::default();
534
-
535
- let result = extract_file(&test_file, None, &config).await;
536
-
537
- assert!(result.is_err(), "Should return error for non-existent file");
538
-
539
- println!("✅ ODT extraction error handling test passed!");
540
- }
541
-
542
- /// Test extraction from multiple representative files
543
- #[tokio::test]
544
- async fn test_odt_extraction_variety() {
545
- let test_files = vec![
546
- "bold.odt",
547
- "italic.odt",
548
- "headers.odt",
549
- "simpleTable.odt",
550
- "footnote.odt",
551
- ];
552
-
553
- let config = ExtractionConfig::default();
554
- let mut successful_extractions = 0;
555
-
556
- for filename in &test_files {
557
- let test_file = get_test_file_path(filename);
558
- if !test_file.exists() {
559
- continue;
560
- }
561
-
562
- if let Ok(result) = extract_file(&test_file, None, &config).await
563
- && !result.content.is_empty()
564
- {
565
- successful_extractions += 1;
566
- }
567
- }
568
-
569
- assert!(
570
- successful_extractions >= 3,
571
- "Should successfully extract from at least 3 test files"
572
- );
573
-
574
- println!("✅ ODT extraction variety test passed!");
575
- println!(
576
- " Successfully extracted {} out of {} files",
577
- successful_extractions,
578
- test_files.len()
579
- );
580
- }
581
-
582
- /// Test that ODT table extraction doesn't include duplicate cell content
583
- /// This is a regression test for the bug where table cells were extracted twice:
584
- /// once as markdown tables and once as raw cell text
585
- #[tokio::test]
586
- async fn test_odt_table_no_duplicate_content() {
587
- let test_file = get_test_file_path("simpleTable.odt");
588
- if !ensure_test_file_exists(&test_file) {
589
- return;
590
- }
591
-
592
- let config = ExtractionConfig::default();
593
- let result = extract_file(&test_file, None, &config)
594
- .await
595
- .expect("Should extract table successfully");
596
-
597
- assert!(!result.content.is_empty(), "Content should not be empty");
598
-
599
- let content_count = result.content.matches("Content").count();
600
-
601
- println!(" 'Content' appears {} times in output", content_count);
602
- println!(" Content preview:\n{}", result.content);
603
-
604
- assert!(
605
- content_count <= 3,
606
- "Content should not appear excessively, indicating no duplicate table cell extraction"
607
- );
608
-
609
- println!("✅ ODT table no duplicate content test passed!");
610
- }
611
-
612
- /// Test comprehensive table extraction with headers, multiple rows, and tables
613
- /// Uses the extraction_test document created with pandoc to ensure complete content
614
- #[tokio::test]
615
- async fn test_odt_comprehensive_table_extraction() {
616
- let test_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
617
- .parent()
618
- .unwrap()
619
- .parent()
620
- .unwrap()
621
- .join("test_documents/extraction_test.odt");
622
-
623
- if !test_file.exists() {
624
- println!("⚠️ Test document not found at {:?}, skipping", test_file);
625
- return;
626
- }
627
-
628
- let config = ExtractionConfig::default();
629
- let result = extract_file(&test_file, None, &config)
630
- .await
631
- .expect("Should extract comprehensive table document successfully");
632
-
633
- assert!(!result.content.is_empty(), "Content should not be empty");
634
-
635
- assert!(result.content.contains("Comprehensive"), "Should contain heading");
636
- assert!(
637
- result.content.contains("First Section") || result.content.contains("First"),
638
- "Should contain first section"
639
- );
640
- assert!(
641
- result.content.contains("Second Section") || result.content.contains("Second"),
642
- "Should contain second section"
643
- );
644
- assert!(
645
- result.content.contains("Third Section") || result.content.contains("Third"),
646
- "Should contain third section"
647
- );
648
-
649
- assert!(
650
- result.content.contains("|"),
651
- "Should contain pipe characters for markdown tables"
652
- );
653
- assert!(result.content.contains("---"), "Should contain table separator");
654
-
655
- assert!(
656
- result.content.contains("Header 1") || result.content.contains("Cell 1A"),
657
- "Should contain table data"
658
- );
659
- assert!(
660
- result.content.contains("Product") || result.content.contains("Apple"),
661
- "Should contain second table data"
662
- );
663
-
664
- let cell_count = result.content.matches("Cell 1A").count();
665
- assert!(
666
- cell_count <= 2,
667
- "Cell content should not be heavily duplicated (found {} instances)",
668
- cell_count
669
- );
670
-
671
- println!("✅ ODT comprehensive table extraction test passed!");
672
- println!(" Extracted content length: {} chars", result.content.len());
673
- println!(" Tables found in output: {}", result.tables.len());
674
- }
1
+ //! Comprehensive TDD test suite for ODT (OpenDocument Text) extraction
2
+ //!
3
+ //! This test suite validates ODT extraction capabilities using Pandoc's output as the baseline.
4
+ //! It covers:
5
+ //! - Metadata extraction (title, creator, date, keywords from meta.xml)
6
+ //! - Content extraction (text, formatting, structure)
7
+ //! - Table extraction with captions
8
+ //! - Formatting preservation (bold, italic, strikeout)
9
+ //! - Image handling with captions
10
+ //! - Math formula extraction
11
+ //! - Note handling (footnotes, endnotes)
12
+ //! - Citation/reference extraction
13
+ //! - Unicode and special character handling
14
+ //!
15
+ //! Note: These tests require the `office` feature to be enabled and Pandoc to be installed.
16
+
17
+ #![cfg(feature = "office")]
18
+
19
+ use kreuzberg::core::config::ExtractionConfig;
20
+ use kreuzberg::core::extractor::extract_file;
21
+ use std::path::{Path, PathBuf};
22
+
23
+ mod helpers;
24
+
25
+ /// Helper function to get the workspace root and construct test file paths
26
+ fn get_test_file_path(filename: &str) -> PathBuf {
27
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
28
+ .parent()
29
+ .unwrap()
30
+ .parent()
31
+ .unwrap();
32
+ workspace_root.join(format!("test_documents/odt/{}", filename))
33
+ }
34
+
35
+ /// Helper to verify a test file exists before running test
36
+ fn ensure_test_file_exists(path: &Path) -> bool {
37
+ if !path.exists() {
38
+ println!("Skipping test: Test file not found at {:?}", path);
39
+ false
40
+ } else {
41
+ true
42
+ }
43
+ }
44
+
45
+ /// Tests extraction of document metadata from ODT meta.xml
46
+ /// Validates: title, subject, creator, dates, generator
47
+ #[tokio::test]
48
+ async fn test_odt_metadata_extraction() {
49
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
50
+ .parent()
51
+ .unwrap()
52
+ .parent()
53
+ .unwrap();
54
+ let test_file = workspace_root.join("test_documents/metadata_test.odt");
55
+
56
+ if !ensure_test_file_exists(&test_file) {
57
+ println!("Skipping metadata test: metadata_test.odt not found");
58
+ return;
59
+ }
60
+
61
+ let config = ExtractionConfig::default();
62
+ let result = extract_file(&test_file, None, &config)
63
+ .await
64
+ .expect("Should extract ODT metadata successfully");
65
+
66
+ assert!(!result.content.is_empty(), "Content should not be empty");
67
+ assert!(
68
+ result.content.contains("Test Document"),
69
+ "Should contain document title in content"
70
+ );
71
+
72
+ let metadata = &result.metadata.additional;
73
+ println!("Extracted metadata: {:?}", metadata);
74
+
75
+ if let Some(title) = metadata.get("title") {
76
+ assert_eq!(title.as_str(), Some("Test Metadata Document"), "Title should match");
77
+ }
78
+
79
+ if let Some(subject) = metadata.get("subject") {
80
+ assert_eq!(
81
+ subject.as_str(),
82
+ Some("Testing ODT Metadata Extraction"),
83
+ "Subject should match"
84
+ );
85
+ }
86
+
87
+ if let Some(created_by) = metadata.get("created_by") {
88
+ assert_eq!(created_by.as_str(), Some("John Doe"), "Creator should match");
89
+ }
90
+
91
+ if let Some(authors) = metadata.get("authors") {
92
+ let authors_array = authors.as_array().expect("Authors should be an array");
93
+ assert_eq!(authors_array.len(), 1, "Should have one author");
94
+ assert_eq!(authors_array[0].as_str(), Some("John Doe"), "Author name should match");
95
+ }
96
+
97
+ assert!(metadata.get("created_at").is_some(), "Creation date should be present");
98
+
99
+ assert!(
100
+ metadata.get("modified_at").is_some(),
101
+ "Modification date should be present"
102
+ );
103
+
104
+ if let Some(generator) = metadata.get("generator") {
105
+ let gen_str = generator.as_str().expect("Generator should be a string");
106
+ assert!(gen_str.contains("Pandoc"), "Generator should be Pandoc");
107
+ }
108
+
109
+ println!("✅ ODT metadata extraction test passed!");
110
+ println!(" Metadata fields extracted: {}", metadata.len());
111
+ }
112
+
113
+ /// Tests extraction of tables with captions from ODT
114
+ /// Baseline from Pandoc: simpleTableWithCaption.odt
115
+ /// Expected Pandoc output:
116
+ /// ```
117
+ /// --------- --------------
118
+ /// Content More content
119
+ /// --------- --------------
120
+ /// : Table 1: Some caption for a table
121
+ /// ```
122
+ #[tokio::test]
123
+ async fn test_odt_table_with_caption_extraction() {
124
+ let test_file = get_test_file_path("simpleTableWithCaption.odt");
125
+ if !ensure_test_file_exists(&test_file) {
126
+ return;
127
+ }
128
+
129
+ let config = ExtractionConfig::default();
130
+ let result = extract_file(&test_file, None, &config).await;
131
+
132
+ if let Ok(result) = result {
133
+ if !result.content.is_empty() {
134
+ let content_lower = result.content.to_lowercase();
135
+ assert!(
136
+ content_lower.contains("content") || content_lower.contains("table") || !result.tables.is_empty(),
137
+ "Should either extract table content or structured tables"
138
+ );
139
+ }
140
+ println!("✅ ODT table with caption extraction test passed!");
141
+ println!(" Extracted {} tables", result.tables.len());
142
+ } else {
143
+ println!("⚠️ ODT table extraction not fully supported yet (Pandoc integration needed)");
144
+ }
145
+ }
146
+
147
+ /// Tests extraction of basic tables without captions
148
+ /// Baseline from Pandoc: simpleTable.odt
149
+ /// Expected: Table with "Content" and "More content" cells
150
+ #[tokio::test]
151
+ async fn test_odt_simple_table_extraction() {
152
+ let test_file = get_test_file_path("simpleTable.odt");
153
+ if !ensure_test_file_exists(&test_file) {
154
+ return;
155
+ }
156
+
157
+ let config = ExtractionConfig::default();
158
+ let result = extract_file(&test_file, None, &config).await;
159
+
160
+ if let Ok(result) = result {
161
+ if !result.content.is_empty() {
162
+ let content_lower = result.content.to_lowercase();
163
+ assert!(
164
+ content_lower.contains("content") || !result.tables.is_empty(),
165
+ "Table should either contain 'content' text or be in structured tables"
166
+ );
167
+ }
168
+ println!("✅ ODT simple table extraction test passed!");
169
+ } else {
170
+ println!("⚠️ ODT table extraction not fully supported yet");
171
+ }
172
+ }
173
+
174
+ /// Tests extraction of document heading hierarchy
175
+ /// Baseline from Pandoc: headers.odt
176
+ /// Expected:
177
+ /// - H1: "A header (Lv 1)"
178
+ /// - H2: "Another header (Lv 2)"
179
+ /// - H1: "Back to Level 1"
180
+ #[tokio::test]
181
+ async fn test_odt_heading_structure_extraction() {
182
+ let test_file = get_test_file_path("headers.odt");
183
+ if !ensure_test_file_exists(&test_file) {
184
+ return;
185
+ }
186
+
187
+ let config = ExtractionConfig::default();
188
+ let result = extract_file(&test_file, None, &config)
189
+ .await
190
+ .expect("Should extract heading structure successfully");
191
+
192
+ assert!(!result.content.is_empty(), "Content should not be empty");
193
+
194
+ assert!(
195
+ result.content.contains("header") || result.content.contains("Header"),
196
+ "Should contain heading text"
197
+ );
198
+
199
+ assert!(
200
+ result.content.contains("#") || result.content.contains("header"),
201
+ "Should indicate heading structure"
202
+ );
203
+
204
+ println!("✅ ODT heading structure extraction test passed!");
205
+ }
206
+
207
+ /// Tests extraction of bold text formatting
208
+ /// Baseline from Pandoc: bold.odt
209
+ /// Expected Pandoc output: "Here comes **bold** text"
210
+ #[tokio::test]
211
+ async fn test_odt_bold_formatting_extraction() {
212
+ let test_file = get_test_file_path("bold.odt");
213
+ if !ensure_test_file_exists(&test_file) {
214
+ return;
215
+ }
216
+
217
+ let config = ExtractionConfig::default();
218
+ let result = extract_file(&test_file, None, &config)
219
+ .await
220
+ .expect("Should extract bold formatting successfully");
221
+
222
+ assert!(!result.content.is_empty(), "Content should not be empty");
223
+
224
+ let content = result.content.to_lowercase();
225
+ assert!(content.contains("bold"), "Should contain 'bold' text");
226
+
227
+ assert!(
228
+ result.content.contains("**bold**") || result.content.contains("bold"),
229
+ "Should preserve bold text"
230
+ );
231
+
232
+ println!("✅ ODT bold formatting extraction test passed!");
233
+ }
234
+
235
+ /// Tests extraction of italic text formatting
236
+ /// Baseline from Pandoc: italic.odt
237
+ /// Expected Pandoc output: "Here comes *italic* text"
238
+ #[tokio::test]
239
+ async fn test_odt_italic_formatting_extraction() {
240
+ let test_file = get_test_file_path("italic.odt");
241
+ if !ensure_test_file_exists(&test_file) {
242
+ return;
243
+ }
244
+
245
+ let config = ExtractionConfig::default();
246
+ let result = extract_file(&test_file, None, &config)
247
+ .await
248
+ .expect("Should extract italic formatting successfully");
249
+
250
+ assert!(!result.content.is_empty(), "Content should not be empty");
251
+
252
+ let content = result.content.to_lowercase();
253
+ assert!(content.contains("italic"), "Should contain 'italic' text");
254
+
255
+ assert!(
256
+ result.content.contains("*italic*") || result.content.contains("italic"),
257
+ "Should preserve italic text"
258
+ );
259
+
260
+ println!("✅ ODT italic formatting extraction test passed!");
261
+ }
262
+
263
+ /// Tests extraction of strikeout/strikethrough text formatting
264
+ /// Baseline from Pandoc: strikeout.odt
265
+ /// Expected Pandoc output: "Here comes text that was ~~striken out~~."
266
+ #[tokio::test]
267
+ async fn test_odt_strikeout_formatting_extraction() {
268
+ let test_file = get_test_file_path("strikeout.odt");
269
+ if !ensure_test_file_exists(&test_file) {
270
+ return;
271
+ }
272
+
273
+ let config = ExtractionConfig::default();
274
+ let result = extract_file(&test_file, None, &config)
275
+ .await
276
+ .expect("Should extract strikeout formatting successfully");
277
+
278
+ assert!(!result.content.is_empty(), "Content should not be empty");
279
+
280
+ let content = result.content.to_lowercase();
281
+ assert!(
282
+ content.contains("strike") || content.contains("striken"),
283
+ "Should contain strikeout text"
284
+ );
285
+
286
+ println!("✅ ODT strikeout formatting extraction test passed!");
287
+ }
288
+
289
+ /// Tests extraction of images with captions
290
+ /// Baseline from Pandoc: imageWithCaption.odt
291
+ /// Expected: Image reference with caption
292
+ /// Expected Pandoc output:
293
+ /// ```
294
+ /// ![Image caption](Pictures/10000000000000FA000000FAD6A15225.jpg)
295
+ /// {alt="Abbildung 1: Image caption" width="5.292cm" height="5.292cm"}
296
+ /// ```
297
+ #[tokio::test]
298
+ async fn test_odt_image_with_caption_extraction() {
299
+ let test_file = get_test_file_path("imageWithCaption.odt");
300
+ if !ensure_test_file_exists(&test_file) {
301
+ return;
302
+ }
303
+
304
+ let config = ExtractionConfig::default();
305
+ let result = extract_file(&test_file, None, &config).await;
306
+
307
+ if let Ok(result) = result {
308
+ if !result.content.is_empty() {
309
+ let content_lower = result.content.to_lowercase();
310
+ assert!(
311
+ content_lower.contains("image")
312
+ || content_lower.contains("caption")
313
+ || content_lower.contains("!")
314
+ || result.images.is_some(),
315
+ "Should reference image or caption or have extracted images"
316
+ );
317
+ }
318
+ println!("✅ ODT image with caption extraction test passed!");
319
+ } else {
320
+ println!("⚠️ ODT image extraction not fully supported yet");
321
+ }
322
+ }
323
+
324
+ /// Tests extraction of mathematical formulas
325
+ /// Baseline from Pandoc: formula.odt
326
+ /// Expected Pandoc output: "$$E = {m \\cdot c^{2}}$$"
327
+ #[tokio::test]
328
+ async fn test_odt_formula_extraction() {
329
+ let test_file = get_test_file_path("formula.odt");
330
+ if !ensure_test_file_exists(&test_file) {
331
+ return;
332
+ }
333
+
334
+ let config = ExtractionConfig::default();
335
+ let result = extract_file(&test_file, None, &config)
336
+ .await
337
+ .expect("Should extract formula successfully");
338
+
339
+ assert!(!result.content.is_empty(), "Content should not be empty");
340
+
341
+ let content = &result.content;
342
+ assert!(
343
+ content.contains("E") && (content.contains("m") || content.contains("$")),
344
+ "Should extract formula content"
345
+ );
346
+
347
+ println!("✅ ODT formula extraction test passed!");
348
+ }
349
+
350
+ /// Tests extraction of footnotes
351
+ /// Baseline from Pandoc: footnote.odt
352
+ /// Expected Pandoc output:
353
+ /// ```
354
+ /// Some text[^1] with a footnote.
355
+ ///
356
+ /// [^1]: Footnote text
357
+ /// ```
358
+ #[tokio::test]
359
+ async fn test_odt_footnote_extraction() {
360
+ let test_file = get_test_file_path("footnote.odt");
361
+ if !ensure_test_file_exists(&test_file) {
362
+ return;
363
+ }
364
+
365
+ let config = ExtractionConfig::default();
366
+ let result = extract_file(&test_file, None, &config)
367
+ .await
368
+ .expect("Should extract footnote successfully");
369
+
370
+ assert!(!result.content.is_empty(), "Content should not be empty");
371
+
372
+ let content_lower = result.content.to_lowercase();
373
+ assert!(
374
+ content_lower.contains("footnote") || content_lower.contains("[^"),
375
+ "Should extract footnote"
376
+ );
377
+
378
+ println!("✅ ODT footnote extraction test passed!");
379
+ }
380
+
381
+ /// Tests extraction of endnotes
382
+ /// Baseline from Pandoc: endnote.odt
383
+ /// Expected: Endnote content with reference (similar to footnotes)
384
+ #[tokio::test]
385
+ async fn test_odt_endnote_extraction() {
386
+ let test_file = get_test_file_path("endnote.odt");
387
+ if !ensure_test_file_exists(&test_file) {
388
+ return;
389
+ }
390
+
391
+ let config = ExtractionConfig::default();
392
+ let result = extract_file(&test_file, None, &config)
393
+ .await
394
+ .expect("Should extract endnote successfully");
395
+
396
+ assert!(!result.content.is_empty(), "Content should not be empty");
397
+
398
+ let content_lower = result.content.to_lowercase();
399
+ assert!(
400
+ content_lower.contains("endnote") || content_lower.contains("[^"),
401
+ "Should extract endnote"
402
+ );
403
+
404
+ println!("✅ ODT endnote extraction test passed!");
405
+ }
406
+
407
+ /// Tests extraction of citations and references
408
+ /// Baseline from Pandoc: citation.odt
409
+ /// Expected Pandoc output: "Some text[@Ex] with a citation."
410
+ #[tokio::test]
411
+ async fn test_odt_citation_extraction() {
412
+ let test_file = get_test_file_path("citation.odt");
413
+ if !ensure_test_file_exists(&test_file) {
414
+ return;
415
+ }
416
+
417
+ let config = ExtractionConfig::default();
418
+ let result = extract_file(&test_file, None, &config)
419
+ .await
420
+ .expect("Should extract citation successfully");
421
+
422
+ assert!(!result.content.is_empty(), "Content should not be empty");
423
+
424
+ let content_lower = result.content.to_lowercase();
425
+ assert!(
426
+ content_lower.contains("citation") || content_lower.contains("text") || content_lower.contains("@"),
427
+ "Should extract citation"
428
+ );
429
+
430
+ println!("✅ ODT citation extraction test passed!");
431
+ }
432
+
433
+ /// Tests extraction of unicode characters and special symbols
434
+ /// Baseline from Pandoc: unicode.odt
435
+ /// Expected: Proper preservation of unicode characters
436
+ /// Expected Pandoc output: ""'çӨ©¼вбФШöɵ"
437
+ #[tokio::test]
438
+ async fn test_odt_unicode_extraction() {
439
+ let test_file = get_test_file_path("unicode.odt");
440
+ if !ensure_test_file_exists(&test_file) {
441
+ return;
442
+ }
443
+
444
+ let config = ExtractionConfig::default();
445
+ let result = extract_file(&test_file, None, &config)
446
+ .await
447
+ .expect("Should extract unicode successfully");
448
+
449
+ assert!(!result.content.is_empty(), "Content should not be empty");
450
+
451
+ assert!(!result.content.is_empty(), "Should extract unicode content (not empty)");
452
+
453
+ println!("✅ ODT unicode extraction test passed!");
454
+ println!(" Extracted unicode content: {:?}", result.content);
455
+ }
456
+
457
+ /// Tests extraction of inline code formatting
458
+ /// Baseline from Pandoc: inlinedCode.odt
459
+ /// Expected Pandoc output: "Here comes `inlined code` text and `an another` one."
460
+ #[tokio::test]
461
+ async fn test_odt_inlined_code_extraction() {
462
+ let test_file = get_test_file_path("inlinedCode.odt");
463
+ if !ensure_test_file_exists(&test_file) {
464
+ return;
465
+ }
466
+
467
+ let config = ExtractionConfig::default();
468
+ let result = extract_file(&test_file, None, &config)
469
+ .await
470
+ .expect("Should extract inline code successfully");
471
+
472
+ assert!(!result.content.is_empty(), "Content should not be empty");
473
+
474
+ let content_lower = result.content.to_lowercase();
475
+ assert!(
476
+ content_lower.contains("code") || content_lower.contains("`"),
477
+ "Should extract inline code"
478
+ );
479
+
480
+ println!("✅ ODT inline code extraction test passed!");
481
+ }
482
+
483
+ /// Tests extraction of paragraph structure and content
484
+ /// Baseline from Pandoc: paragraph.odt
485
+ /// Expected: Multiple paragraphs separated by blank lines
486
+ #[tokio::test]
487
+ async fn test_odt_paragraph_structure_extraction() {
488
+ let test_file = get_test_file_path("paragraph.odt");
489
+ if !ensure_test_file_exists(&test_file) {
490
+ return;
491
+ }
492
+
493
+ let config = ExtractionConfig::default();
494
+ let result = extract_file(&test_file, None, &config)
495
+ .await
496
+ .expect("Should extract paragraph structure successfully");
497
+
498
+ assert!(!result.content.is_empty(), "Content should not be empty");
499
+
500
+ let content_lower = result.content.to_lowercase();
501
+ assert!(content_lower.contains("paragraph"), "Should contain paragraph text");
502
+
503
+ let paragraph_count = result.content.split('\n').filter(|l| !l.is_empty()).count();
504
+ assert!(paragraph_count >= 2, "Should extract multiple paragraphs");
505
+
506
+ println!("✅ ODT paragraph structure extraction test passed!");
507
+ println!(" Extracted {} paragraph segments", paragraph_count);
508
+ }
509
+
510
+ /// Integration test: Verify ODT extraction works with standard API
511
+ #[tokio::test]
512
+ async fn test_odt_extraction_api_integration() {
513
+ let test_file = get_test_file_path("bold.odt");
514
+ if !ensure_test_file_exists(&test_file) {
515
+ return;
516
+ }
517
+
518
+ let config = ExtractionConfig::default();
519
+ let result = extract_file(&test_file, None, &config)
520
+ .await
521
+ .expect("Should extract via standard API");
522
+
523
+ assert!(!result.content.is_empty(), "Should have content");
524
+ assert_eq!(result.mime_type, "application/vnd.oasis.opendocument.text");
525
+
526
+ println!("✅ ODT extraction API integration test passed!");
527
+ }
528
+
529
+ /// Test error handling for non-existent files
530
+ #[tokio::test]
531
+ async fn test_odt_extraction_missing_file_handling() {
532
+ let test_file = get_test_file_path("nonexistent.odt");
533
+ let config = ExtractionConfig::default();
534
+
535
+ let result = extract_file(&test_file, None, &config).await;
536
+
537
+ assert!(result.is_err(), "Should return error for non-existent file");
538
+
539
+ println!("✅ ODT extraction error handling test passed!");
540
+ }
541
+
542
+ /// Test extraction from multiple representative files
543
+ #[tokio::test]
544
+ async fn test_odt_extraction_variety() {
545
+ let test_files = vec![
546
+ "bold.odt",
547
+ "italic.odt",
548
+ "headers.odt",
549
+ "simpleTable.odt",
550
+ "footnote.odt",
551
+ ];
552
+
553
+ let config = ExtractionConfig::default();
554
+ let mut successful_extractions = 0;
555
+
556
+ for filename in &test_files {
557
+ let test_file = get_test_file_path(filename);
558
+ if !test_file.exists() {
559
+ continue;
560
+ }
561
+
562
+ if let Ok(result) = extract_file(&test_file, None, &config).await
563
+ && !result.content.is_empty()
564
+ {
565
+ successful_extractions += 1;
566
+ }
567
+ }
568
+
569
+ assert!(
570
+ successful_extractions >= 3,
571
+ "Should successfully extract from at least 3 test files"
572
+ );
573
+
574
+ println!("✅ ODT extraction variety test passed!");
575
+ println!(
576
+ " Successfully extracted {} out of {} files",
577
+ successful_extractions,
578
+ test_files.len()
579
+ );
580
+ }
581
+
582
+ /// Test that ODT table extraction doesn't include duplicate cell content
583
+ /// This is a regression test for the bug where table cells were extracted twice:
584
+ /// once as markdown tables and once as raw cell text
585
+ #[tokio::test]
586
+ async fn test_odt_table_no_duplicate_content() {
587
+ let test_file = get_test_file_path("simpleTable.odt");
588
+ if !ensure_test_file_exists(&test_file) {
589
+ return;
590
+ }
591
+
592
+ let config = ExtractionConfig::default();
593
+ let result = extract_file(&test_file, None, &config)
594
+ .await
595
+ .expect("Should extract table successfully");
596
+
597
+ assert!(!result.content.is_empty(), "Content should not be empty");
598
+
599
+ let content_count = result.content.matches("Content").count();
600
+
601
+ println!(" 'Content' appears {} times in output", content_count);
602
+ println!(" Content preview:\n{}", result.content);
603
+
604
+ assert!(
605
+ content_count <= 3,
606
+ "Content should not appear excessively, indicating no duplicate table cell extraction"
607
+ );
608
+
609
+ println!("✅ ODT table no duplicate content test passed!");
610
+ }
611
+
612
+ /// Test comprehensive table extraction with headers, multiple rows, and tables
613
+ /// Uses the extraction_test document created with pandoc to ensure complete content
614
+ #[tokio::test]
615
+ async fn test_odt_comprehensive_table_extraction() {
616
+ let test_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
617
+ .parent()
618
+ .unwrap()
619
+ .parent()
620
+ .unwrap()
621
+ .join("test_documents/extraction_test.odt");
622
+
623
+ if !test_file.exists() {
624
+ println!("⚠️ Test document not found at {:?}, skipping", test_file);
625
+ return;
626
+ }
627
+
628
+ let config = ExtractionConfig::default();
629
+ let result = extract_file(&test_file, None, &config)
630
+ .await
631
+ .expect("Should extract comprehensive table document successfully");
632
+
633
+ assert!(!result.content.is_empty(), "Content should not be empty");
634
+
635
+ assert!(result.content.contains("Comprehensive"), "Should contain heading");
636
+ assert!(
637
+ result.content.contains("First Section") || result.content.contains("First"),
638
+ "Should contain first section"
639
+ );
640
+ assert!(
641
+ result.content.contains("Second Section") || result.content.contains("Second"),
642
+ "Should contain second section"
643
+ );
644
+ assert!(
645
+ result.content.contains("Third Section") || result.content.contains("Third"),
646
+ "Should contain third section"
647
+ );
648
+
649
+ assert!(
650
+ result.content.contains("|"),
651
+ "Should contain pipe characters for markdown tables"
652
+ );
653
+ assert!(result.content.contains("---"), "Should contain table separator");
654
+
655
+ assert!(
656
+ result.content.contains("Header 1") || result.content.contains("Cell 1A"),
657
+ "Should contain table data"
658
+ );
659
+ assert!(
660
+ result.content.contains("Product") || result.content.contains("Apple"),
661
+ "Should contain second table data"
662
+ );
663
+
664
+ let cell_count = result.content.matches("Cell 1A").count();
665
+ assert!(
666
+ cell_count <= 2,
667
+ "Cell content should not be heavily duplicated (found {} instances)",
668
+ cell_count
669
+ );
670
+
671
+ println!("✅ ODT comprehensive table extraction test passed!");
672
+ println!(" Extracted content length: {} chars", result.content.len());
673
+ println!(" Tables found in output: {}", result.tables.len());
674
+ }