kreuzberg 4.0.0.rc2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +391 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,692 +1,694 @@
1
- //! Comprehensive TDD test suite for RST (reStructuredText) extraction
2
- //!
3
- //! Tests RST extraction using Pandoc as the baseline for quality validation.
4
- //! The test documents are derived from the Pandoc test suite and provide
5
- //! comprehensive coverage of RST-specific features including:
6
- //! - Metadata extraction from field lists (:Author:, :Date:, etc.)
7
- //! - Directive handling (.. code-block::, .. image::, .. math::, etc.)
8
- //! - Section structure and heading levels
9
- //! - Table extraction (simple and grid tables)
10
- //! - Reference links and images
11
- //! - Comments and special blocks
12
- //! - Content quality validation
13
-
14
- use kreuzberg::core::config::ExtractionConfig;
15
- use kreuzberg::core::extractor::extract_bytes;
16
-
17
- mod helpers;
18
-
19
- const RST_FIXTURE: &str = include_str!("../../../test_documents/rst/rst-reader.rst");
20
-
21
- fn rst_fixture_bytes() -> Vec<u8> {
22
- RST_FIXTURE.as_bytes().to_vec()
23
- }
24
-
25
- /// Test extraction of document title from RST file structure
26
- #[tokio::test]
27
- async fn test_rst_title_extraction() {
28
- let content = rst_fixture_bytes();
29
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
30
- .await
31
- .expect("Should extract RST successfully");
32
-
33
- assert!(
34
- result.content.to_lowercase().contains("pandoc test suite"),
35
- "Should contain document title 'Pandoc Test Suite'"
36
- );
37
-
38
- assert!(
39
- result.content.contains("Level one header") || result.content.contains("header"),
40
- "Should contain document headers"
41
- );
42
-
43
- println!("✅ RST title extraction test passed!");
44
- }
45
-
46
- /// Test field list metadata extraction (:Authors:, :Date:, :Revision:)
47
- #[tokio::test]
48
- async fn test_rst_field_list_metadata_extraction() {
49
- let content = rst_fixture_bytes();
50
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
51
- .await
52
- .expect("Should extract RST successfully");
53
-
54
- println!(
55
- "Content excerpt (first 500 chars): {}",
56
- &result.content[..std::cmp::min(500, result.content.len())]
57
- );
58
-
59
- assert!(
60
- result.content.contains("John MacFarlane")
61
- || result.content.contains("July 17")
62
- || result.content.contains("Pandoc Test Suite"),
63
- "Should contain metadata information or title"
64
- );
65
-
66
- println!("✅ RST field list metadata extraction test passed!");
67
- }
68
-
69
- /// Test extraction of multiple heading levels
70
- #[tokio::test]
71
- async fn test_rst_section_hierarchy() {
72
- let content = rst_fixture_bytes();
73
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
74
- .await
75
- .expect("Should extract RST successfully");
76
-
77
- let headings = vec![
78
- "Level one header",
79
- "Level two header",
80
- "Level three",
81
- "Paragraphs",
82
- "Block Quotes",
83
- "Code Blocks",
84
- "Lists",
85
- "Field Lists",
86
- "HTML Blocks",
87
- "LaTeX Block",
88
- "Images",
89
- "Tables",
90
- ];
91
-
92
- for heading in headings {
93
- assert!(
94
- result.content.contains(heading),
95
- "Should contain heading: '{}'",
96
- heading
97
- );
98
- }
99
-
100
- println!("✅ RST section hierarchy test passed!");
101
- }
102
-
103
- /// Test that emphasis in headings is preserved
104
- #[tokio::test]
105
- async fn test_rst_heading_with_inline_markup() {
106
- let content = rst_fixture_bytes();
107
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
108
- .await
109
- .expect("Should extract RST successfully");
110
-
111
- assert!(
112
- result.content.contains("emphasis") || result.content.contains("Level four"),
113
- "Should contain heading with emphasis"
114
- );
115
-
116
- println!("✅ RST heading with inline markup test passed!");
117
- }
118
-
119
- /// Test code block extraction with language specification
120
- #[tokio::test]
121
- async fn test_rst_code_block_extraction() {
122
- let content = rst_fixture_bytes();
123
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
124
- .await
125
- .expect("Should extract RST successfully");
126
-
127
- assert!(
128
- result.content.contains("def my_function") || result.content.contains("python"),
129
- "Should contain Python code block or language specification"
130
- );
131
-
132
- assert!(
133
- result.content.contains("return x + 1") || result.content.contains("my_function"),
134
- "Should contain Python function code"
135
- );
136
-
137
- println!("✅ RST code block extraction test passed!");
138
- }
139
-
140
- /// Test Haskell code blocks with highlight directive
141
- #[tokio::test]
142
- async fn test_rst_highlight_directive_code_blocks() {
143
- let content = rst_fixture_bytes();
144
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
145
- .await
146
- .expect("Should extract RST successfully");
147
-
148
- assert!(
149
- result.content.contains("haskell") || result.content.contains("Tree") || result.content.contains("data Tree"),
150
- "Should contain Haskell code blocks"
151
- );
152
-
153
- assert!(
154
- result.content.contains("Leaf") || result.content.contains("Node"),
155
- "Should contain Haskell data constructors"
156
- );
157
-
158
- println!("✅ RST highlight directive code blocks test passed!");
159
- }
160
-
161
- /// Test JavaScript code blocks
162
- #[tokio::test]
163
- async fn test_rst_javascript_code_blocks() {
164
- let content = rst_fixture_bytes();
165
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
166
- .await
167
- .expect("Should extract RST successfully");
168
-
169
- assert!(
170
- result.content.contains("javascript") || result.content.contains("=>") || result.content.contains("let f"),
171
- "Should contain JavaScript code"
172
- );
173
-
174
- println!("✅ RST JavaScript code blocks test passed!");
175
- }
176
-
177
- /// Test unordered list extraction
178
- #[tokio::test]
179
- async fn test_rst_unordered_lists() {
180
- let content = rst_fixture_bytes();
181
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
182
- .await
183
- .expect("Should extract RST successfully");
184
-
185
- let list_items = vec![
186
- "asterisk 1",
187
- "asterisk 2",
188
- "asterisk 3",
189
- "Plus 1",
190
- "Plus 2",
191
- "Plus 3",
192
- "Minus 1",
193
- "Minus 2",
194
- "Minus 3",
195
- ];
196
-
197
- for item in list_items {
198
- assert!(result.content.contains(item), "Should contain list item: '{}'", item);
199
- }
200
-
201
- println!("✅ RST unordered lists test passed!");
202
- }
203
-
204
- /// Test ordered list extraction
205
- #[tokio::test]
206
- async fn test_rst_ordered_lists() {
207
- let content = rst_fixture_bytes();
208
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
209
- .await
210
- .expect("Should extract RST successfully");
211
-
212
- let list_items = vec!["First", "Second", "Third"];
213
-
214
- for item in list_items {
215
- assert!(
216
- result.content.contains(item),
217
- "Should contain ordered list item: '{}'",
218
- item
219
- );
220
- }
221
-
222
- println!("✅ RST ordered lists test passed!");
223
- }
224
-
225
- /// Test nested lists extraction
226
- #[tokio::test]
227
- async fn test_rst_nested_lists() {
228
- let content = rst_fixture_bytes();
229
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
230
- .await
231
- .expect("Should extract RST successfully");
232
-
233
- assert!(
234
- result.content.contains("First")
235
- || result.content.contains("Second")
236
- || result.content.contains("Third")
237
- || result.content.contains("Definition"),
238
- "Should contain nested or definition list content"
239
- );
240
-
241
- println!("✅ RST nested lists test passed!");
242
- }
243
-
244
- /// Test simple table extraction
245
- #[tokio::test]
246
- async fn test_rst_simple_table_extraction() {
247
- let content = rst_fixture_bytes();
248
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
249
- .await
250
- .expect("Should extract RST successfully");
251
-
252
- assert!(
253
- result.content.contains("Simple Tables")
254
- || result.content.contains("col")
255
- || (result.content.contains("r1") && result.content.contains("r2")),
256
- "Should contain simple table content"
257
- );
258
-
259
- println!("✅ RST simple table extraction test passed!");
260
- }
261
-
262
- /// Test grid table extraction
263
- #[tokio::test]
264
- async fn test_rst_grid_table_extraction() {
265
- let content = rst_fixture_bytes();
266
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
267
- .await
268
- .expect("Should extract RST successfully");
269
-
270
- assert!(
271
- result.content.contains("Grid Tables")
272
- || result.content.contains("r1 a")
273
- || (result.content.contains("r1") && result.content.contains("r2")),
274
- "Should contain grid table content"
275
- );
276
-
277
- println!("✅ RST grid table extraction test passed!");
278
- }
279
-
280
- /// Test table with complex structure (multiple rows/columns spanning)
281
- #[tokio::test]
282
- async fn test_rst_complex_table_with_spanning() {
283
- let content = rst_fixture_bytes();
284
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
285
- .await
286
- .expect("Should extract RST successfully");
287
-
288
- assert!(
289
- result.content.contains("Table with cells")
290
- || result.content.contains("Property")
291
- || result.content.contains("min")
292
- || result.content.contains("°C"),
293
- "Should contain complex table content"
294
- );
295
-
296
- println!("✅ RST complex table with spanning test passed!");
297
- }
298
-
299
- /// Test emphasis and strong markup
300
- #[tokio::test]
301
- async fn test_rst_emphasis_and_strong() {
302
- let content = rst_fixture_bytes();
303
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
304
- .await
305
- .expect("Should extract RST successfully");
306
-
307
- assert!(
308
- result.content.contains("emphasized") || result.content.contains("strong"),
309
- "Should contain emphasis markers or converted text"
310
- );
311
-
312
- println!("✅ RST emphasis and strong test passed!");
313
- }
314
-
315
- /// Test inline code extraction
316
- #[tokio::test]
317
- async fn test_rst_inline_code() {
318
- let content = rst_fixture_bytes();
319
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
320
- .await
321
- .expect("Should extract RST successfully");
322
-
323
- assert!(
324
- result.content.contains(">") || result.content.contains("code"),
325
- "Should contain inline code or code markers"
326
- );
327
-
328
- println!("✅ RST inline code test passed!");
329
- }
330
-
331
- /// Test subscript and superscript
332
- #[tokio::test]
333
- async fn test_rst_subscript_superscript() {
334
- let content = rst_fixture_bytes();
335
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
336
- .await
337
- .expect("Should extract RST successfully");
338
-
339
- assert!(
340
- result.content.contains("subscript") || result.content.contains("superscript"),
341
- "Should contain subscript/superscript text"
342
- );
343
-
344
- println!("✅ RST subscript/superscript test passed!");
345
- }
346
-
347
- /// Test explicit links extraction
348
- #[tokio::test]
349
- async fn test_rst_explicit_links() {
350
- let content = rst_fixture_bytes();
351
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
352
- .await
353
- .expect("Should extract RST successfully");
354
-
355
- assert!(
356
- result.content.contains("/url") || result.content.contains("URL"),
357
- "Should contain link URLs"
358
- );
359
-
360
- assert!(
361
- result.content.contains("link"),
362
- "Should contain link references or text"
363
- );
364
-
365
- println!("✅ RST explicit links test passed!");
366
- }
367
-
368
- /// Test reference links
369
- #[tokio::test]
370
- async fn test_rst_reference_links() {
371
- let content = rst_fixture_bytes();
372
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
373
- .await
374
- .expect("Should extract RST successfully");
375
-
376
- assert!(
377
- result.content.contains("link1") || result.content.contains("link2") || result.content.contains("link"),
378
- "Should contain resolved reference links"
379
- );
380
-
381
- println!("✅ RST reference links test passed!");
382
- }
383
-
384
- /// Test autolinks (bare URLs and email addresses)
385
- #[tokio::test]
386
- async fn test_rst_autolinks() {
387
- let content = rst_fixture_bytes();
388
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
389
- .await
390
- .expect("Should extract RST successfully");
391
-
392
- assert!(
393
- result.content.contains("example.com") || result.content.contains("http"),
394
- "Should contain URLs from autolinks"
395
- );
396
-
397
- assert!(
398
- result.content.contains("nowhere") || result.content.contains("@"),
399
- "Should contain email references"
400
- );
401
-
402
- println!("✅ RST autolinks test passed!");
403
- }
404
-
405
- /// Test image directive extraction
406
- #[tokio::test]
407
- async fn test_rst_image_directive() {
408
- let content = rst_fixture_bytes();
409
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
410
- .await
411
- .expect("Should extract RST successfully");
412
-
413
- assert!(
414
- result.content.contains("image") || result.content.contains("lalune") || result.content.contains("movie"),
415
- "Should contain image directives or references"
416
- );
417
-
418
- assert!(
419
- result.content.contains("Voyage") || result.content.contains("Melies"),
420
- "Should contain image descriptions"
421
- );
422
-
423
- println!("✅ RST image directive test passed!");
424
- }
425
-
426
- /// Test raw HTML block extraction
427
- #[tokio::test]
428
- async fn test_rst_raw_html_blocks() {
429
- let content = rst_fixture_bytes();
430
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
431
- .await
432
- .expect("Should extract RST successfully");
433
-
434
- assert!(
435
- result.content.contains("div") || result.content.contains("foo"),
436
- "Should contain HTML block content"
437
- );
438
-
439
- println!("✅ RST raw HTML blocks test passed!");
440
- }
441
-
442
- /// Test LaTeX block extraction
443
- #[tokio::test]
444
- async fn test_rst_latex_blocks() {
445
- let content = rst_fixture_bytes();
446
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
447
- .await
448
- .expect("Should extract RST successfully");
449
-
450
- assert!(
451
- result.content.contains("LaTeX Block")
452
- || result.content.contains("begin{tabular}")
453
- || result.content.contains("Animal")
454
- || result.content.contains("Dog"),
455
- "Should contain LaTeX block or content"
456
- );
457
-
458
- println!("✅ RST LaTeX blocks test passed!");
459
- }
460
-
461
- /// Test math directive extraction
462
- #[tokio::test]
463
- async fn test_rst_math_directive() {
464
- let content = rst_fixture_bytes();
465
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
466
- .await
467
- .expect("Should extract RST successfully");
468
-
469
- assert!(
470
- result.content.contains("E=mc^2")
471
- || result.content.contains("E = mc")
472
- || result.content.contains("alpha")
473
- || result.content.contains("Math"),
474
- "Should contain math formulas"
475
- );
476
-
477
- println!("✅ RST math directive test passed!");
478
- }
479
-
480
- /// Test comment blocks are excluded from output
481
- #[tokio::test]
482
- async fn test_rst_comment_blocks_excluded() {
483
- let content = rst_fixture_bytes();
484
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
485
- .await
486
- .expect("Should extract RST successfully");
487
-
488
- assert!(
489
- !result.content.contains("should not appear"),
490
- "Comments should be excluded from output"
491
- );
492
-
493
- assert!(
494
- result.content.contains("First paragraph") || result.content.contains("paragraph"),
495
- "Non-comment content should be present"
496
- );
497
-
498
- println!("✅ RST comment blocks excluded test passed!");
499
- }
500
-
501
- /// Test line blocks extraction
502
- #[tokio::test]
503
- async fn test_rst_line_blocks() {
504
- let content = rst_fixture_bytes();
505
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
506
- .await
507
- .expect("Should extract RST successfully");
508
-
509
- assert!(
510
- result.content.contains("Line blocks")
511
- || result.content.contains("bee")
512
- || result.content.contains("entire bee"),
513
- "Should contain line block content or heading"
514
- );
515
-
516
- println!("✅ RST line blocks test passed!");
517
- }
518
-
519
- /// Test unicode character preservation
520
- #[tokio::test]
521
- async fn test_rst_unicode_characters() {
522
- let content = rst_fixture_bytes();
523
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
524
- .await
525
- .expect("Should extract RST successfully");
526
-
527
- assert!(
528
- result.content.contains("©")
529
- || result.content.contains("copyright")
530
- || result.content.contains("umlaut")
531
- || result.content.contains("unicode"),
532
- "Should contain unicode characters or references"
533
- );
534
-
535
- println!("✅ RST unicode characters test passed!");
536
- }
537
-
538
- /// Test escaped characters
539
- #[tokio::test]
540
- async fn test_rst_escaped_characters() {
541
- let content = rst_fixture_bytes();
542
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
543
- .await
544
- .expect("Should extract RST successfully");
545
-
546
- assert!(
547
- result.content.contains("Backslash")
548
- || result.content.contains("Backtick")
549
- || result.content.contains("Asterisk"),
550
- "Should contain escaped special character sections"
551
- );
552
-
553
- println!("✅ RST escaped characters test passed!");
554
- }
555
-
556
- // SECTION 12: FOOTNOTES AND REFERENCES
557
-
558
- /// Test footnote extraction
559
- #[tokio::test]
560
- async fn test_rst_footnotes() {
561
- let content = rst_fixture_bytes();
562
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
563
- .await
564
- .expect("Should extract RST successfully");
565
-
566
- assert!(
567
- result.content.contains("Note") || result.content.contains("continuation"),
568
- "Should contain footnote content"
569
- );
570
-
571
- println!("✅ RST footnotes test passed!");
572
- }
573
-
574
- /// Test block quote extraction
575
- #[tokio::test]
576
- async fn test_rst_block_quotes() {
577
- let content = rst_fixture_bytes();
578
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
579
- .await
580
- .expect("Should extract RST successfully");
581
-
582
- assert!(
583
- result.content.contains("block quote") || result.content.contains("pretty short"),
584
- "Should contain block quote content"
585
- );
586
-
587
- println!("✅ RST block quotes test passed!");
588
- }
589
-
590
- /// Test overall content extraction volume
591
- #[tokio::test]
592
- async fn test_rst_content_extraction_volume() {
593
- let content = rst_fixture_bytes();
594
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
595
- .await
596
- .expect("Should extract RST successfully");
597
-
598
- let content_length = result.content.len();
599
- println!("Extracted content length: {} bytes", content_length);
600
-
601
- assert!(
602
- content_length > 1000,
603
- "Extracted content should be substantial (> 1000 bytes), got {} bytes",
604
- content_length
605
- );
606
-
607
- assert_eq!(result.mime_type, "text/x-rst", "MIME type should be preserved");
608
-
609
- println!(" RST content extraction volume test passed!");
610
- println!(" Extracted {} bytes from RST file", content_length);
611
- }
612
-
613
- /// Test extracted content contains all major sections
614
- #[tokio::test]
615
- async fn test_rst_all_major_sections_present() {
616
- let content = rst_fixture_bytes();
617
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
618
- .await
619
- .expect("Should extract RST successfully");
620
-
621
- let major_sections = vec![
622
- "Paragraphs",
623
- "Block Quotes",
624
- "Code Blocks",
625
- "Lists",
626
- "Field Lists",
627
- "HTML Blocks",
628
- "LaTeX Block",
629
- "Inline Markup",
630
- "Special Characters",
631
- "Links",
632
- "Images",
633
- "Comments",
634
- "Tables",
635
- "Math",
636
- ];
637
-
638
- let content_lower = result.content.to_lowercase();
639
- let mut found_count = 0;
640
-
641
- for section in major_sections {
642
- if content_lower.contains(&section.to_lowercase()) {
643
- found_count += 1;
644
- println!("✓ Found section: {}", section);
645
- } else {
646
- println!(" Missing section: {}", section);
647
- }
648
- }
649
-
650
- assert!(
651
- found_count >= 10,
652
- "Should find at least 10 major sections, found {}",
653
- found_count
654
- );
655
-
656
- println!("✅ RST all major sections present test passed!");
657
- println!(" Found {}/14 major sections", found_count);
658
- }
659
-
660
- /// Test MIME type detection
661
- #[tokio::test]
662
- async fn test_rst_mime_type_detection() {
663
- let content = rst_fixture_bytes();
664
-
665
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
666
- .await
667
- .expect("Should extract with text/x-rst MIME type");
668
-
669
- assert_eq!(result.mime_type, "text/x-rst");
670
-
671
- println!("✅ RST MIME type detection test passed!");
672
- }
673
-
674
- /// Test that no extraction errors occur on valid RST file
675
- #[tokio::test]
676
- async fn test_rst_extraction_no_errors() {
677
- let content = rst_fixture_bytes();
678
-
679
- let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()).await;
680
-
681
- assert!(
682
- result.is_ok(),
683
- "RST extraction should succeed without errors: {:?}",
684
- result.err()
685
- );
686
-
687
- let extraction = result.unwrap();
688
-
689
- assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
690
-
691
- println!(" RST extraction no errors test passed!");
692
- }
1
+ //! Comprehensive TDD test suite for RST (reStructuredText) extraction
2
+ //!
3
+ //! Tests RST extraction using Pandoc as the baseline for quality validation.
4
+ //! The test documents are derived from the Pandoc test suite and provide
5
+ //! comprehensive coverage of RST-specific features including:
6
+ //! - Metadata extraction from field lists (:Author:, :Date:, etc.)
7
+ //! - Directive handling (.. code-block::, .. image::, .. math::, etc.)
8
+ //! - Section structure and heading levels
9
+ //! - Table extraction (simple and grid tables)
10
+ //! - Reference links and images
11
+
12
+ #![cfg(feature = "office")]
13
+ //! - Comments and special blocks
14
+ //! - Content quality validation
15
+
16
+ use kreuzberg::core::config::ExtractionConfig;
17
+ use kreuzberg::core::extractor::extract_bytes;
18
+
19
+ mod helpers;
20
+
21
+ const RST_FIXTURE: &str = include_str!("../../../test_documents/rst/rst-reader.rst");
22
+
23
+ fn rst_fixture_bytes() -> Vec<u8> {
24
+ RST_FIXTURE.as_bytes().to_vec()
25
+ }
26
+
27
+ /// Test extraction of document title from RST file structure
28
+ #[tokio::test]
29
+ async fn test_rst_title_extraction() {
30
+ let content = rst_fixture_bytes();
31
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
32
+ .await
33
+ .expect("Should extract RST successfully");
34
+
35
+ assert!(
36
+ result.content.to_lowercase().contains("pandoc test suite"),
37
+ "Should contain document title 'Pandoc Test Suite'"
38
+ );
39
+
40
+ assert!(
41
+ result.content.contains("Level one header") || result.content.contains("header"),
42
+ "Should contain document headers"
43
+ );
44
+
45
+ println!("✅ RST title extraction test passed!");
46
+ }
47
+
48
+ /// Test field list metadata extraction (:Authors:, :Date:, :Revision:)
49
+ #[tokio::test]
50
+ async fn test_rst_field_list_metadata_extraction() {
51
+ let content = rst_fixture_bytes();
52
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
53
+ .await
54
+ .expect("Should extract RST successfully");
55
+
56
+ println!(
57
+ "Content excerpt (first 500 chars): {}",
58
+ &result.content[..std::cmp::min(500, result.content.len())]
59
+ );
60
+
61
+ assert!(
62
+ result.content.contains("John MacFarlane")
63
+ || result.content.contains("July 17")
64
+ || result.content.contains("Pandoc Test Suite"),
65
+ "Should contain metadata information or title"
66
+ );
67
+
68
+ println!("✅ RST field list metadata extraction test passed!");
69
+ }
70
+
71
+ /// Test extraction of multiple heading levels
72
+ #[tokio::test]
73
+ async fn test_rst_section_hierarchy() {
74
+ let content = rst_fixture_bytes();
75
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
76
+ .await
77
+ .expect("Should extract RST successfully");
78
+
79
+ let headings = vec![
80
+ "Level one header",
81
+ "Level two header",
82
+ "Level three",
83
+ "Paragraphs",
84
+ "Block Quotes",
85
+ "Code Blocks",
86
+ "Lists",
87
+ "Field Lists",
88
+ "HTML Blocks",
89
+ "LaTeX Block",
90
+ "Images",
91
+ "Tables",
92
+ ];
93
+
94
+ for heading in headings {
95
+ assert!(
96
+ result.content.contains(heading),
97
+ "Should contain heading: '{}'",
98
+ heading
99
+ );
100
+ }
101
+
102
+ println!("✅ RST section hierarchy test passed!");
103
+ }
104
+
105
+ /// Test that emphasis in headings is preserved
106
+ #[tokio::test]
107
+ async fn test_rst_heading_with_inline_markup() {
108
+ let content = rst_fixture_bytes();
109
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
110
+ .await
111
+ .expect("Should extract RST successfully");
112
+
113
+ assert!(
114
+ result.content.contains("emphasis") || result.content.contains("Level four"),
115
+ "Should contain heading with emphasis"
116
+ );
117
+
118
+ println!("✅ RST heading with inline markup test passed!");
119
+ }
120
+
121
+ /// Test code block extraction with language specification
122
+ #[tokio::test]
123
+ async fn test_rst_code_block_extraction() {
124
+ let content = rst_fixture_bytes();
125
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
126
+ .await
127
+ .expect("Should extract RST successfully");
128
+
129
+ assert!(
130
+ result.content.contains("def my_function") || result.content.contains("python"),
131
+ "Should contain Python code block or language specification"
132
+ );
133
+
134
+ assert!(
135
+ result.content.contains("return x + 1") || result.content.contains("my_function"),
136
+ "Should contain Python function code"
137
+ );
138
+
139
+ println!("✅ RST code block extraction test passed!");
140
+ }
141
+
142
+ /// Test Haskell code blocks with highlight directive
143
+ #[tokio::test]
144
+ async fn test_rst_highlight_directive_code_blocks() {
145
+ let content = rst_fixture_bytes();
146
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
147
+ .await
148
+ .expect("Should extract RST successfully");
149
+
150
+ assert!(
151
+ result.content.contains("haskell") || result.content.contains("Tree") || result.content.contains("data Tree"),
152
+ "Should contain Haskell code blocks"
153
+ );
154
+
155
+ assert!(
156
+ result.content.contains("Leaf") || result.content.contains("Node"),
157
+ "Should contain Haskell data constructors"
158
+ );
159
+
160
+ println!("✅ RST highlight directive code blocks test passed!");
161
+ }
162
+
163
+ /// Test JavaScript code blocks
164
+ #[tokio::test]
165
+ async fn test_rst_javascript_code_blocks() {
166
+ let content = rst_fixture_bytes();
167
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
168
+ .await
169
+ .expect("Should extract RST successfully");
170
+
171
+ assert!(
172
+ result.content.contains("javascript") || result.content.contains("=>") || result.content.contains("let f"),
173
+ "Should contain JavaScript code"
174
+ );
175
+
176
+ println!("✅ RST JavaScript code blocks test passed!");
177
+ }
178
+
179
+ /// Test unordered list extraction
180
+ #[tokio::test]
181
+ async fn test_rst_unordered_lists() {
182
+ let content = rst_fixture_bytes();
183
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
184
+ .await
185
+ .expect("Should extract RST successfully");
186
+
187
+ let list_items = vec![
188
+ "asterisk 1",
189
+ "asterisk 2",
190
+ "asterisk 3",
191
+ "Plus 1",
192
+ "Plus 2",
193
+ "Plus 3",
194
+ "Minus 1",
195
+ "Minus 2",
196
+ "Minus 3",
197
+ ];
198
+
199
+ for item in list_items {
200
+ assert!(result.content.contains(item), "Should contain list item: '{}'", item);
201
+ }
202
+
203
+ println!("✅ RST unordered lists test passed!");
204
+ }
205
+
206
+ /// Test ordered list extraction
207
+ #[tokio::test]
208
+ async fn test_rst_ordered_lists() {
209
+ let content = rst_fixture_bytes();
210
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
211
+ .await
212
+ .expect("Should extract RST successfully");
213
+
214
+ let list_items = vec!["First", "Second", "Third"];
215
+
216
+ for item in list_items {
217
+ assert!(
218
+ result.content.contains(item),
219
+ "Should contain ordered list item: '{}'",
220
+ item
221
+ );
222
+ }
223
+
224
+ println!("✅ RST ordered lists test passed!");
225
+ }
226
+
227
+ /// Test nested lists extraction
228
+ #[tokio::test]
229
+ async fn test_rst_nested_lists() {
230
+ let content = rst_fixture_bytes();
231
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
232
+ .await
233
+ .expect("Should extract RST successfully");
234
+
235
+ assert!(
236
+ result.content.contains("First")
237
+ || result.content.contains("Second")
238
+ || result.content.contains("Third")
239
+ || result.content.contains("Definition"),
240
+ "Should contain nested or definition list content"
241
+ );
242
+
243
+ println!("✅ RST nested lists test passed!");
244
+ }
245
+
246
+ /// Test simple table extraction
247
+ #[tokio::test]
248
+ async fn test_rst_simple_table_extraction() {
249
+ let content = rst_fixture_bytes();
250
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
251
+ .await
252
+ .expect("Should extract RST successfully");
253
+
254
+ assert!(
255
+ result.content.contains("Simple Tables")
256
+ || result.content.contains("col")
257
+ || (result.content.contains("r1") && result.content.contains("r2")),
258
+ "Should contain simple table content"
259
+ );
260
+
261
+ println!("✅ RST simple table extraction test passed!");
262
+ }
263
+
264
+ /// Test grid table extraction
265
+ #[tokio::test]
266
+ async fn test_rst_grid_table_extraction() {
267
+ let content = rst_fixture_bytes();
268
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
269
+ .await
270
+ .expect("Should extract RST successfully");
271
+
272
+ assert!(
273
+ result.content.contains("Grid Tables")
274
+ || result.content.contains("r1 a")
275
+ || (result.content.contains("r1") && result.content.contains("r2")),
276
+ "Should contain grid table content"
277
+ );
278
+
279
+ println!("✅ RST grid table extraction test passed!");
280
+ }
281
+
282
+ /// Test table with complex structure (multiple rows/columns spanning)
283
+ #[tokio::test]
284
+ async fn test_rst_complex_table_with_spanning() {
285
+ let content = rst_fixture_bytes();
286
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
287
+ .await
288
+ .expect("Should extract RST successfully");
289
+
290
+ assert!(
291
+ result.content.contains("Table with cells")
292
+ || result.content.contains("Property")
293
+ || result.content.contains("min")
294
+ || result.content.contains("°C"),
295
+ "Should contain complex table content"
296
+ );
297
+
298
+ println!("✅ RST complex table with spanning test passed!");
299
+ }
300
+
301
+ /// Test emphasis and strong markup
302
+ #[tokio::test]
303
+ async fn test_rst_emphasis_and_strong() {
304
+ let content = rst_fixture_bytes();
305
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
306
+ .await
307
+ .expect("Should extract RST successfully");
308
+
309
+ assert!(
310
+ result.content.contains("emphasized") || result.content.contains("strong"),
311
+ "Should contain emphasis markers or converted text"
312
+ );
313
+
314
+ println!("✅ RST emphasis and strong test passed!");
315
+ }
316
+
317
+ /// Test inline code extraction
318
+ #[tokio::test]
319
+ async fn test_rst_inline_code() {
320
+ let content = rst_fixture_bytes();
321
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
322
+ .await
323
+ .expect("Should extract RST successfully");
324
+
325
+ assert!(
326
+ result.content.contains(">") || result.content.contains("code"),
327
+ "Should contain inline code or code markers"
328
+ );
329
+
330
+ println!("✅ RST inline code test passed!");
331
+ }
332
+
333
+ /// Test subscript and superscript
334
+ #[tokio::test]
335
+ async fn test_rst_subscript_superscript() {
336
+ let content = rst_fixture_bytes();
337
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
338
+ .await
339
+ .expect("Should extract RST successfully");
340
+
341
+ assert!(
342
+ result.content.contains("subscript") || result.content.contains("superscript"),
343
+ "Should contain subscript/superscript text"
344
+ );
345
+
346
+ println!("✅ RST subscript/superscript test passed!");
347
+ }
348
+
349
+ /// Test explicit links extraction
350
+ #[tokio::test]
351
+ async fn test_rst_explicit_links() {
352
+ let content = rst_fixture_bytes();
353
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
354
+ .await
355
+ .expect("Should extract RST successfully");
356
+
357
+ assert!(
358
+ result.content.contains("/url") || result.content.contains("URL"),
359
+ "Should contain link URLs"
360
+ );
361
+
362
+ assert!(
363
+ result.content.contains("link"),
364
+ "Should contain link references or text"
365
+ );
366
+
367
+ println!("✅ RST explicit links test passed!");
368
+ }
369
+
370
+ /// Test reference links
371
+ #[tokio::test]
372
+ async fn test_rst_reference_links() {
373
+ let content = rst_fixture_bytes();
374
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
375
+ .await
376
+ .expect("Should extract RST successfully");
377
+
378
+ assert!(
379
+ result.content.contains("link1") || result.content.contains("link2") || result.content.contains("link"),
380
+ "Should contain resolved reference links"
381
+ );
382
+
383
+ println!("✅ RST reference links test passed!");
384
+ }
385
+
386
+ /// Test autolinks (bare URLs and email addresses)
387
+ #[tokio::test]
388
+ async fn test_rst_autolinks() {
389
+ let content = rst_fixture_bytes();
390
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
391
+ .await
392
+ .expect("Should extract RST successfully");
393
+
394
+ assert!(
395
+ result.content.contains("example.com") || result.content.contains("http"),
396
+ "Should contain URLs from autolinks"
397
+ );
398
+
399
+ assert!(
400
+ result.content.contains("nowhere") || result.content.contains("@"),
401
+ "Should contain email references"
402
+ );
403
+
404
+ println!("✅ RST autolinks test passed!");
405
+ }
406
+
407
+ /// Test image directive extraction
408
+ #[tokio::test]
409
+ async fn test_rst_image_directive() {
410
+ let content = rst_fixture_bytes();
411
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
412
+ .await
413
+ .expect("Should extract RST successfully");
414
+
415
+ assert!(
416
+ result.content.contains("image") || result.content.contains("lalune") || result.content.contains("movie"),
417
+ "Should contain image directives or references"
418
+ );
419
+
420
+ assert!(
421
+ result.content.contains("Voyage") || result.content.contains("Melies"),
422
+ "Should contain image descriptions"
423
+ );
424
+
425
+ println!("✅ RST image directive test passed!");
426
+ }
427
+
428
+ /// Test raw HTML block extraction
429
+ #[tokio::test]
430
+ async fn test_rst_raw_html_blocks() {
431
+ let content = rst_fixture_bytes();
432
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
433
+ .await
434
+ .expect("Should extract RST successfully");
435
+
436
+ assert!(
437
+ result.content.contains("div") || result.content.contains("foo"),
438
+ "Should contain HTML block content"
439
+ );
440
+
441
+ println!("✅ RST raw HTML blocks test passed!");
442
+ }
443
+
444
+ /// Test LaTeX block extraction
445
+ #[tokio::test]
446
+ async fn test_rst_latex_blocks() {
447
+ let content = rst_fixture_bytes();
448
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
449
+ .await
450
+ .expect("Should extract RST successfully");
451
+
452
+ assert!(
453
+ result.content.contains("LaTeX Block")
454
+ || result.content.contains("begin{tabular}")
455
+ || result.content.contains("Animal")
456
+ || result.content.contains("Dog"),
457
+ "Should contain LaTeX block or content"
458
+ );
459
+
460
+ println!("✅ RST LaTeX blocks test passed!");
461
+ }
462
+
463
+ /// Test math directive extraction
464
+ #[tokio::test]
465
+ async fn test_rst_math_directive() {
466
+ let content = rst_fixture_bytes();
467
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
468
+ .await
469
+ .expect("Should extract RST successfully");
470
+
471
+ assert!(
472
+ result.content.contains("E=mc^2")
473
+ || result.content.contains("E = mc")
474
+ || result.content.contains("alpha")
475
+ || result.content.contains("Math"),
476
+ "Should contain math formulas"
477
+ );
478
+
479
+ println!("✅ RST math directive test passed!");
480
+ }
481
+
482
+ /// Test comment blocks are excluded from output
483
+ #[tokio::test]
484
+ async fn test_rst_comment_blocks_excluded() {
485
+ let content = rst_fixture_bytes();
486
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
487
+ .await
488
+ .expect("Should extract RST successfully");
489
+
490
+ assert!(
491
+ !result.content.contains("should not appear"),
492
+ "Comments should be excluded from output"
493
+ );
494
+
495
+ assert!(
496
+ result.content.contains("First paragraph") || result.content.contains("paragraph"),
497
+ "Non-comment content should be present"
498
+ );
499
+
500
+ println!("✅ RST comment blocks excluded test passed!");
501
+ }
502
+
503
+ /// Test line blocks extraction
504
+ #[tokio::test]
505
+ async fn test_rst_line_blocks() {
506
+ let content = rst_fixture_bytes();
507
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
508
+ .await
509
+ .expect("Should extract RST successfully");
510
+
511
+ assert!(
512
+ result.content.contains("Line blocks")
513
+ || result.content.contains("bee")
514
+ || result.content.contains("entire bee"),
515
+ "Should contain line block content or heading"
516
+ );
517
+
518
+ println!("✅ RST line blocks test passed!");
519
+ }
520
+
521
+ /// Test unicode character preservation
522
+ #[tokio::test]
523
+ async fn test_rst_unicode_characters() {
524
+ let content = rst_fixture_bytes();
525
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
526
+ .await
527
+ .expect("Should extract RST successfully");
528
+
529
+ assert!(
530
+ result.content.contains("©")
531
+ || result.content.contains("copyright")
532
+ || result.content.contains("umlaut")
533
+ || result.content.contains("unicode"),
534
+ "Should contain unicode characters or references"
535
+ );
536
+
537
+ println!("✅ RST unicode characters test passed!");
538
+ }
539
+
540
+ /// Test escaped characters
541
+ #[tokio::test]
542
+ async fn test_rst_escaped_characters() {
543
+ let content = rst_fixture_bytes();
544
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
545
+ .await
546
+ .expect("Should extract RST successfully");
547
+
548
+ assert!(
549
+ result.content.contains("Backslash")
550
+ || result.content.contains("Backtick")
551
+ || result.content.contains("Asterisk"),
552
+ "Should contain escaped special character sections"
553
+ );
554
+
555
+ println!("✅ RST escaped characters test passed!");
556
+ }
557
+
558
+ // SECTION 12: FOOTNOTES AND REFERENCES
559
+
560
+ /// Test footnote extraction
561
+ #[tokio::test]
562
+ async fn test_rst_footnotes() {
563
+ let content = rst_fixture_bytes();
564
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
565
+ .await
566
+ .expect("Should extract RST successfully");
567
+
568
+ assert!(
569
+ result.content.contains("Note") || result.content.contains("continuation"),
570
+ "Should contain footnote content"
571
+ );
572
+
573
+ println!("✅ RST footnotes test passed!");
574
+ }
575
+
576
+ /// Test block quote extraction
577
+ #[tokio::test]
578
+ async fn test_rst_block_quotes() {
579
+ let content = rst_fixture_bytes();
580
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
581
+ .await
582
+ .expect("Should extract RST successfully");
583
+
584
+ assert!(
585
+ result.content.contains("block quote") || result.content.contains("pretty short"),
586
+ "Should contain block quote content"
587
+ );
588
+
589
+ println!("✅ RST block quotes test passed!");
590
+ }
591
+
592
+ /// Test overall content extraction volume
593
+ #[tokio::test]
594
+ async fn test_rst_content_extraction_volume() {
595
+ let content = rst_fixture_bytes();
596
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
597
+ .await
598
+ .expect("Should extract RST successfully");
599
+
600
+ let content_length = result.content.len();
601
+ println!("Extracted content length: {} bytes", content_length);
602
+
603
+ assert!(
604
+ content_length > 1000,
605
+ "Extracted content should be substantial (> 1000 bytes), got {} bytes",
606
+ content_length
607
+ );
608
+
609
+ assert_eq!(result.mime_type, "text/x-rst", "MIME type should be preserved");
610
+
611
+ println!("✅ RST content extraction volume test passed!");
612
+ println!(" Extracted {} bytes from RST file", content_length);
613
+ }
614
+
615
+ /// Test extracted content contains all major sections
616
+ #[tokio::test]
617
+ async fn test_rst_all_major_sections_present() {
618
+ let content = rst_fixture_bytes();
619
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
620
+ .await
621
+ .expect("Should extract RST successfully");
622
+
623
+ let major_sections = vec![
624
+ "Paragraphs",
625
+ "Block Quotes",
626
+ "Code Blocks",
627
+ "Lists",
628
+ "Field Lists",
629
+ "HTML Blocks",
630
+ "LaTeX Block",
631
+ "Inline Markup",
632
+ "Special Characters",
633
+ "Links",
634
+ "Images",
635
+ "Comments",
636
+ "Tables",
637
+ "Math",
638
+ ];
639
+
640
+ let content_lower = result.content.to_lowercase();
641
+ let mut found_count = 0;
642
+
643
+ for section in major_sections {
644
+ if content_lower.contains(&section.to_lowercase()) {
645
+ found_count += 1;
646
+ println!(" Found section: {}", section);
647
+ } else {
648
+ println!("✗ Missing section: {}", section);
649
+ }
650
+ }
651
+
652
+ assert!(
653
+ found_count >= 10,
654
+ "Should find at least 10 major sections, found {}",
655
+ found_count
656
+ );
657
+
658
+ println!("✅ RST all major sections present test passed!");
659
+ println!(" Found {}/14 major sections", found_count);
660
+ }
661
+
662
+ /// Test MIME type detection
663
+ #[tokio::test]
664
+ async fn test_rst_mime_type_detection() {
665
+ let content = rst_fixture_bytes();
666
+
667
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
668
+ .await
669
+ .expect("Should extract with text/x-rst MIME type");
670
+
671
+ assert_eq!(result.mime_type, "text/x-rst");
672
+
673
+ println!("✅ RST MIME type detection test passed!");
674
+ }
675
+
676
+ /// Test that no extraction errors occur on valid RST file
677
+ #[tokio::test]
678
+ async fn test_rst_extraction_no_errors() {
679
+ let content = rst_fixture_bytes();
680
+
681
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()).await;
682
+
683
+ assert!(
684
+ result.is_ok(),
685
+ "RST extraction should succeed without errors: {:?}",
686
+ result.err()
687
+ );
688
+
689
+ let extraction = result.unwrap();
690
+
691
+ assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
692
+
693
+ println!("✅ RST extraction no errors test passed!");
694
+ }