kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (369) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +538 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +104 -2
  8. data/README.md +454 -454
  9. data/Rakefile +33 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -341
  12. data/ext/kreuzberg_rb/extconf.rb +45 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
  14. data/ext/kreuzberg_rb/native/Cargo.lock +6750 -6941
  15. data/ext/kreuzberg_rb/native/Cargo.toml +53 -54
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +52 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
  23. data/extconf.rb +28 -28
  24. data/kreuzberg.gemspec +214 -214
  25. data/lib/kreuzberg/api_proxy.rb +142 -142
  26. data/lib/kreuzberg/cache_api.rb +81 -81
  27. data/lib/kreuzberg/cli.rb +55 -55
  28. data/lib/kreuzberg/cli_proxy.rb +127 -127
  29. data/lib/kreuzberg/config.rb +724 -724
  30. data/lib/kreuzberg/error_context.rb +80 -80
  31. data/lib/kreuzberg/errors.rb +118 -118
  32. data/lib/kreuzberg/extraction_api.rb +340 -340
  33. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  36. data/lib/kreuzberg/result.rb +279 -279
  37. data/lib/kreuzberg/setup_lib_path.rb +80 -80
  38. data/lib/kreuzberg/validator_protocol.rb +89 -89
  39. data/lib/kreuzberg/version.rb +5 -5
  40. data/lib/kreuzberg.rb +109 -109
  41. data/lib/{pdfium.dll → libpdfium.so} +0 -0
  42. data/sig/kreuzberg/internal.rbs +184 -184
  43. data/sig/kreuzberg.rbs +546 -546
  44. data/spec/binding/cache_spec.rb +227 -227
  45. data/spec/binding/cli_proxy_spec.rb +85 -85
  46. data/spec/binding/cli_spec.rb +55 -55
  47. data/spec/binding/config_spec.rb +345 -345
  48. data/spec/binding/config_validation_spec.rb +283 -283
  49. data/spec/binding/error_handling_spec.rb +213 -213
  50. data/spec/binding/errors_spec.rb +66 -66
  51. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  52. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  53. data/spec/binding/plugins/validator_spec.rb +274 -274
  54. data/spec/fixtures/config.toml +39 -39
  55. data/spec/fixtures/config.yaml +41 -41
  56. data/spec/fixtures/invalid_config.toml +4 -4
  57. data/spec/smoke/package_spec.rb +178 -178
  58. data/spec/spec_helper.rb +42 -42
  59. data/vendor/Cargo.toml +2 -2
  60. data/vendor/kreuzberg/Cargo.toml +5 -5
  61. data/vendor/kreuzberg/README.md +230 -230
  62. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
  63. data/vendor/kreuzberg/build.rs +887 -843
  64. data/vendor/kreuzberg/src/api/error.rs +81 -81
  65. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  66. data/vendor/kreuzberg/src/api/mod.rs +87 -79
  67. data/vendor/kreuzberg/src/api/server.rs +353 -353
  68. data/vendor/kreuzberg/src/api/types.rs +170 -170
  69. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  70. data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
  71. data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
  72. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  73. data/vendor/kreuzberg/src/core/config.rs +1080 -1080
  74. data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
  75. data/vendor/kreuzberg/src/core/io.rs +329 -329
  76. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  77. data/vendor/kreuzberg/src/core/mod.rs +47 -47
  78. data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
  79. data/vendor/kreuzberg/src/embeddings.rs +500 -500
  80. data/vendor/kreuzberg/src/error.rs +431 -431
  81. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  82. data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
  83. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  84. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  85. data/vendor/kreuzberg/src/extraction/html.rs +634 -601
  86. data/vendor/kreuzberg/src/extraction/image.rs +491 -491
  87. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
  88. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
  89. data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
  90. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  91. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  92. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  93. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  94. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
  95. data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
  96. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  97. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  98. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  99. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  100. data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
  101. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
  102. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
  103. data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
  104. data/vendor/kreuzberg/src/extractors/email.rs +157 -157
  105. data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
  106. data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
  107. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
  108. data/vendor/kreuzberg/src/extractors/html.rs +407 -407
  109. data/vendor/kreuzberg/src/extractors/image.rs +219 -219
  110. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
  112. data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
  114. data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
  115. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  116. data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
  117. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
  118. data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
  119. data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
  120. data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
  121. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
  122. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  123. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  124. data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
  125. data/vendor/kreuzberg/src/extractors/text.rs +265 -265
  126. data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
  127. data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
  128. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  129. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  130. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  131. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  132. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  133. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  134. data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
  135. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  136. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  137. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  138. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
  139. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
  140. data/vendor/kreuzberg/src/lib.rs +113 -113
  141. data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
  142. data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
  143. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  144. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  145. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  146. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  147. data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
  148. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  149. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  150. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
  151. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  152. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  153. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  154. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  155. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
  156. data/vendor/kreuzberg/src/pdf/bundled.rs +452 -346
  157. data/vendor/kreuzberg/src/pdf/error.rs +130 -130
  158. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  159. data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
  160. data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
  161. data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
  162. data/vendor/kreuzberg/src/pdf/table.rs +420 -420
  163. data/vendor/kreuzberg/src/pdf/text.rs +240 -240
  164. data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
  165. data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
  166. data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
  167. data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
  168. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
  169. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  170. data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
  171. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  172. data/vendor/kreuzberg/src/text/mod.rs +25 -25
  173. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  174. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
  175. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  176. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  177. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  178. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  179. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  180. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  181. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  182. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  183. data/vendor/kreuzberg/src/types.rs +1055 -1055
  184. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  185. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  186. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  187. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  188. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  189. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  190. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  191. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  192. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  193. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  194. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  195. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  196. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  198. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  199. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  200. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  201. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  202. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  203. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  204. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  205. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  206. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  207. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  208. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  209. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  210. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  211. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  212. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  213. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  214. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  215. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  216. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  217. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  218. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  219. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  220. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  221. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  222. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  223. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  224. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  225. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  226. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  227. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  228. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  229. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  230. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  231. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  232. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  233. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  234. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  235. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  236. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  237. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  238. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  239. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  240. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  241. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  242. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  243. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  244. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  245. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  246. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  247. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  248. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  249. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  250. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  251. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  252. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  253. data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
  254. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
  255. data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
  256. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  257. data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
  258. data/vendor/kreuzberg/tests/config_features.rs +612 -612
  259. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
  260. data/vendor/kreuzberg/tests/core_integration.rs +510 -510
  261. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  262. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
  263. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  264. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  265. data/vendor/kreuzberg/tests/email_integration.rs +327 -327
  266. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  267. data/vendor/kreuzberg/tests/error_handling.rs +402 -402
  268. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  269. data/vendor/kreuzberg/tests/format_integration.rs +165 -164
  270. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  271. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  272. data/vendor/kreuzberg/tests/image_integration.rs +255 -255
  273. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  274. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  275. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  276. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  277. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  278. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  279. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  280. data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
  281. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
  282. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
  283. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
  284. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  285. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
  286. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  287. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  288. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
  289. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
  290. data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
  291. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
  292. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
  293. data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
  294. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  295. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
  296. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
  297. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
  298. data/vendor/kreuzberg/tests/security_validation.rs +416 -416
  299. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  300. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
  301. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
  302. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
  303. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  304. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
  305. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
  306. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
  307. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  308. data/vendor/kreuzberg-tesseract/LICENSE +22 -22
  309. data/vendor/kreuzberg-tesseract/README.md +399 -399
  310. data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
  311. data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
  312. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
  313. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
  314. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
  315. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
  316. data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
  317. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
  318. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
  319. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
  320. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
  321. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
  322. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
  323. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
  324. data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
  325. data/vendor/rb-sys/Cargo.lock +393 -393
  326. data/vendor/rb-sys/Cargo.toml +70 -70
  327. data/vendor/rb-sys/Cargo.toml.orig +57 -57
  328. data/vendor/rb-sys/LICENSE-APACHE +190 -190
  329. data/vendor/rb-sys/LICENSE-MIT +21 -21
  330. data/vendor/rb-sys/build/features.rs +111 -111
  331. data/vendor/rb-sys/build/main.rs +286 -286
  332. data/vendor/rb-sys/build/stable_api_config.rs +155 -155
  333. data/vendor/rb-sys/build/version.rs +50 -50
  334. data/vendor/rb-sys/readme.md +36 -36
  335. data/vendor/rb-sys/src/bindings.rs +21 -21
  336. data/vendor/rb-sys/src/hidden.rs +11 -11
  337. data/vendor/rb-sys/src/lib.rs +35 -35
  338. data/vendor/rb-sys/src/macros.rs +371 -371
  339. data/vendor/rb-sys/src/memory.rs +53 -53
  340. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
  341. data/vendor/rb-sys/src/special_consts.rs +31 -31
  342. data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
  343. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
  344. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
  345. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
  346. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
  347. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
  348. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
  349. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
  350. data/vendor/rb-sys/src/stable_api.rs +260 -260
  351. data/vendor/rb-sys/src/symbol.rs +31 -31
  352. data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
  353. data/vendor/rb-sys/src/utils.rs +89 -89
  354. data/vendor/rb-sys/src/value_type.rs +7 -7
  355. metadata +81 -22
  356. data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
  357. data/vendor/kreuzberg-ffi/README.md +0 -851
  358. data/vendor/kreuzberg-ffi/build.rs +0 -176
  359. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
  360. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
  361. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  362. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
  363. data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
  364. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
  365. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  366. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  367. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  368. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  369. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
@@ -1,1260 +1,1260 @@
1
- #![allow(clippy::len_zero, clippy::unnecessary_get_then_check, clippy::single_match)]
2
- #![cfg(feature = "office")]
3
- //! Comprehensive behavioral tests for Typst extractor against Pandoc baselines.
4
- //!
5
- //! These tests expose the critical bugs found in code review:
6
- //! 1. 62% heading loss bug - only matches single `=` headings
7
- //! 2. Blockquotes not implemented
8
- //! 3. Display math not extracted
9
- //! 4. Nested table brackets cause corruption
10
- //! 5. Empty headings output (just `= ` with no text)
11
- //! 6. Regex failures silently lose metadata
12
- //!
13
- //! The tests are designed to FAIL initially, exposing real bugs that need fixing.
14
- //! They compare extracted output against Pandoc baseline outputs for behavioral parity.
15
-
16
- use kreuzberg::core::config::ExtractionConfig;
17
- use kreuzberg::core::extractor::extract_bytes;
18
- use std::{fs, path::PathBuf};
19
-
20
- fn typst_doc_root() -> PathBuf {
21
- PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/typst")
22
- }
23
-
24
- /// Load a test document from the test_documents/typst directory
25
- fn load_test_document(filename: &str) -> Vec<u8> {
26
- let path = typst_doc_root().join(filename);
27
- fs::read(&path).unwrap_or_else(|_| panic!("Failed to read test document: {}", filename))
28
- }
29
-
30
- /// Load Pandoc baseline output for comparison
31
- fn load_pandoc_baseline(filename_base: &str) -> String {
32
- let path = typst_doc_root().join(format!("{filename_base}_pandoc_baseline.txt"));
33
- fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read baseline: {}", filename_base))
34
- }
35
-
36
- /// Load Pandoc metadata JSON for comparison
37
- fn load_pandoc_metadata(filename_base: &str) -> String {
38
- let path = typst_doc_root().join(format!("{filename_base}_pandoc_meta.json"));
39
- fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read metadata: {}", filename_base))
40
- }
41
-
42
- /// Count specific heading levels (= for level 1, == for level 2, etc.)
43
- fn count_heading_level(content: &str, level: usize) -> usize {
44
- let exact_marker = format!("{} ", "=".repeat(level));
45
- content
46
- .lines()
47
- .filter(|l| l.trim_start().starts_with(&exact_marker))
48
- .count()
49
- }
50
-
51
- /// Extract all headings from content
52
- fn extract_all_headings(content: &str) -> Vec<String> {
53
- content
54
- .lines()
55
- .filter(|l| {
56
- let trimmed = l.trim_start();
57
- trimmed.starts_with('=') && !trimmed.starts_with("#set")
58
- })
59
- .map(|l| l.to_string())
60
- .collect()
61
- }
62
-
63
- /// Count lines that are pure metadata/directives (not content)
64
- fn count_directive_lines(content: &str) -> usize {
65
- content
66
- .lines()
67
- .filter(|l| {
68
- let t = l.trim();
69
- t.starts_with("#set ") || t.starts_with("#let ") || t.starts_with("#import ")
70
- })
71
- .count()
72
- }
73
-
74
- /// Count empty headings (headings with just `= ` and no text)
75
- fn count_empty_headings(content: &str) -> usize {
76
- content
77
- .lines()
78
- .filter(|l| {
79
- let trimmed = l.trim_start();
80
- trimmed == "="
81
- || trimmed == "=="
82
- || trimmed == "==="
83
- || trimmed == "===="
84
- || trimmed == "====="
85
- || trimmed == "======"
86
- })
87
- .count()
88
- }
89
-
90
- /// Extract all text between headings (content blocks)
91
- fn extract_content_blocks(content: &str) -> Vec<String> {
92
- let mut blocks = Vec::new();
93
- let mut current_block = String::new();
94
- let mut in_block = false;
95
-
96
- for line in content.lines() {
97
- let trimmed = line.trim_start();
98
- if trimmed.starts_with('=') && !trimmed.starts_with("#set") {
99
- if !current_block.is_empty() {
100
- blocks.push(current_block.trim().to_string());
101
- current_block.clear();
102
- }
103
- in_block = true;
104
- } else if in_block && !trimmed.is_empty() {
105
- current_block.push_str(line);
106
- current_block.push('\n');
107
- }
108
- }
109
-
110
- if !current_block.is_empty() {
111
- blocks.push(current_block.trim().to_string());
112
- }
113
-
114
- blocks
115
- }
116
-
117
- /// Check if content has reasonable parity with baseline (within tolerance)
118
- fn content_parity_check(extracted: &str, baseline: &str, tolerance_percent: f64) -> bool {
119
- let extracted_len = extracted.len();
120
- let baseline_len = baseline.len();
121
-
122
- if baseline_len == 0 {
123
- return extracted_len == 0;
124
- }
125
-
126
- let ratio = (extracted_len as f64) / (baseline_len as f64);
127
- let acceptable_min = 1.0 - (tolerance_percent / 100.0);
128
- let acceptable_max = 1.0 + (tolerance_percent / 100.0);
129
-
130
- ratio >= acceptable_min && ratio <= acceptable_max
131
- }
132
-
133
- // CRITICAL BUG TESTS - These expose the 45+ issues
134
-
135
- /// TEST 1: CRITICAL - 62% heading loss bug
136
- ///
137
- /// The extractor only matches single `=` headings, completely skipping
138
- /// `==`, `===`, and higher levels. This causes catastrophic data loss
139
- /// in hierarchical documents.
140
- ///
141
- /// Expected: All heading levels should be extracted
142
- /// Current behavior: Only level 1 headings extracted
143
- /// WILL FAIL: Exposing the heading loss bug
144
- #[tokio::test]
145
- async fn test_typst_all_heading_levels_not_lost() {
146
- let content = load_test_document("headings.typ");
147
- let _baseline = load_pandoc_baseline("headings");
148
- let config = ExtractionConfig::default();
149
-
150
- let result = extract_bytes(&content, "application/x-typst", &config)
151
- .await
152
- .expect("Extraction failed");
153
-
154
- let extracted_all_headings = extract_all_headings(&result.content);
155
-
156
- assert!(
157
- extracted_all_headings.len() >= 6,
158
- "CRITICAL BUG: Only extracted {} headings, should have extracted 6+ heading levels. \
159
- This is the 62% heading loss bug - extractor only matches '=' but skips '==', '===', etc.",
160
- extracted_all_headings.len()
161
- );
162
-
163
- for level in 1..=6 {
164
- let count = count_heading_level(&result.content, level);
165
- assert_eq!(
166
- count, 1,
167
- "Heading level {} should appear exactly once (found {}). \
168
- Missing heading levels cause data loss in hierarchical documents.",
169
- level, count
170
- );
171
- }
172
- }
173
-
174
- /// TEST 2: Display math not extracted
175
- ///
176
- /// Display math ($$...$$) is completely lost from extraction,
177
- /// breaking mathematical content preservation.
178
- ///
179
- /// Expected: Display math should be preserved in output
180
- /// Current behavior: Silently dropped
181
- /// WILL FAIL: Exposing display math loss
182
- #[tokio::test]
183
- async fn test_typst_display_math_preserved() {
184
- let content = load_test_document("advanced.typ");
185
- let baseline = load_pandoc_baseline("advanced");
186
- let config = ExtractionConfig::default();
187
-
188
- let result = extract_bytes(&content, "application/x-typst", &config)
189
- .await
190
- .expect("Extraction failed");
191
-
192
- let has_display_math_in_baseline =
193
- baseline.contains("²") || baseline.contains("Display math") || baseline.contains("x^2");
194
-
195
- if has_display_math_in_baseline {
196
- let our_has_math = result.content.contains("$")
197
- || result.content.contains("Display")
198
- || result.content.contains("²")
199
- || result.content.contains("²");
200
-
201
- assert!(
202
- our_has_math,
203
- "Display math should be extracted. Pandoc preserves mathematical notation, \
204
- but extractor drops it entirely. This breaks scientific/academic documents."
205
- );
206
- }
207
-
208
- let has_pythagorean = result.content.contains("^2")
209
- || result.content.contains("²")
210
- || result.content.contains("x") && result.content.contains("y") && result.content.contains("r");
211
-
212
- assert!(
213
- has_pythagorean,
214
- "Pythagorean theorem expression should be present. Display math is being dropped."
215
- );
216
- }
217
-
218
- /// TEST 3: Empty headings output
219
- ///
220
- /// When heading text is missing or malformed, extractor outputs
221
- /// just the marker like "= " with no text, polluting the output.
222
- ///
223
- /// Expected: Either full heading text or no heading at all
224
- /// Current behavior: "= " with no content
225
- /// WILL FAIL: Exposing empty heading bug
226
- #[tokio::test]
227
- async fn test_typst_no_empty_headings_output() {
228
- let content = load_test_document("headings.typ");
229
- let config = ExtractionConfig::default();
230
-
231
- let result = extract_bytes(&content, "application/x-typst", &config)
232
- .await
233
- .expect("Extraction failed");
234
-
235
- let empty_headings = count_empty_headings(&result.content);
236
-
237
- assert_eq!(
238
- empty_headings, 0,
239
- "Found {} empty heading lines (just '=' with no text). \
240
- Extractor outputs malformed headings like '= ' with no text, \
241
- corrupting the document structure.",
242
- empty_headings
243
- );
244
-
245
- for heading in extract_all_headings(&result.content) {
246
- let trimmed = heading.trim_start();
247
- let after_marker = trimmed.trim_start_matches('=').trim();
248
- assert!(
249
- !after_marker.is_empty(),
250
- "Heading '{}' has no text after marker. Should not output empty headings.",
251
- trimmed
252
- );
253
- }
254
- }
255
-
256
- /// TEST 4: Metadata extraction fails with regex silently
257
- ///
258
- /// When regex patterns fail to match metadata fields,
259
- /// the extractor silently returns None instead of logging/failing,
260
- /// causing complete metadata loss for certain formats.
261
- ///
262
- /// Expected: All metadata fields should be extracted
263
- /// Current behavior: Some formats fail silently
264
- /// WILL FAIL: Exposing metadata loss
265
- #[tokio::test]
266
- async fn test_typst_metadata_extraction_completeness() {
267
- let content = load_test_document("metadata.typ");
268
- let _baseline_meta = load_pandoc_metadata("metadata");
269
- let config = ExtractionConfig::default();
270
-
271
- let result = extract_bytes(&content, "application/x-typst", &config)
272
- .await
273
- .expect("Extraction failed");
274
-
275
- let has_title = result
276
- .metadata
277
- .additional
278
- .get("title")
279
- .map(|t| t.to_string().len() > 0)
280
- .unwrap_or(false);
281
-
282
- let has_author = result
283
- .metadata
284
- .additional
285
- .get("author")
286
- .map(|a| a.to_string().len() > 0)
287
- .unwrap_or(false);
288
-
289
- let has_keywords = result
290
- .metadata
291
- .additional
292
- .get("keywords")
293
- .map(|k| k.to_string().len() > 0)
294
- .unwrap_or(false);
295
-
296
- assert!(
297
- has_title,
298
- "Title metadata should be extracted. Regex pattern matching fails silently \
299
- and metadata is lost with no error reporting."
300
- );
301
-
302
- assert!(
303
- has_author,
304
- "Author metadata should be extracted. Some metadata formats fail silently."
305
- );
306
-
307
- assert!(
308
- has_keywords,
309
- "Keywords should be extracted. Regex failures cause silent data loss."
310
- );
311
- }
312
-
313
- /// TEST 5: Nested table brackets cause corruption
314
- ///
315
- /// Tables with nested brackets like [Name [full]] corrupt the
316
- /// table content extraction because bracket counting is naive.
317
- ///
318
- /// Expected: Table cells should be extracted correctly even with nesting
319
- /// Current behavior: Bracket nesting causes cells to be malformed
320
- /// WILL FAIL: Exposing table corruption bug
321
- #[tokio::test]
322
- async fn test_typst_tables_with_nested_brackets_not_corrupted() {
323
- let content = load_test_document("advanced.typ");
324
- let baseline = load_pandoc_baseline("advanced");
325
- let config = ExtractionConfig::default();
326
-
327
- let result = extract_bytes(&content, "application/x-typst", &config)
328
- .await
329
- .expect("Extraction failed");
330
-
331
- let has_table_in_baseline = baseline.contains("Name") && baseline.contains("Alice");
332
-
333
- if has_table_in_baseline {
334
- let table_content_extracted =
335
- result.content.contains("Name") && result.content.contains("Alice") && result.content.contains("Age");
336
-
337
- assert!(
338
- table_content_extracted,
339
- "Table content should be extracted correctly. Nested brackets cause corruption \
340
- and table cells are malformed."
341
- );
342
-
343
- let corrupted_brackets = result.content.matches("[[").count();
344
- assert_eq!(
345
- corrupted_brackets, 0,
346
- "Found corrupted bracket sequences [[. Table extraction with nested brackets \
347
- produces malformed output."
348
- );
349
- }
350
- }
351
-
352
- /// TEST 6: Content volume parity - within tolerance of Pandoc
353
- ///
354
- /// Our extractor should extract roughly the same amount of content
355
- /// as Pandoc (baseline). Large discrepancies indicate data loss or
356
- /// noise injection.
357
- ///
358
- /// Expected: Within reasonable tolerance of baseline content size
359
- /// Current behavior: Significant data loss on complex documents (e.g., advanced.typ)
360
- /// WILL FAIL: Exposing data loss on complex documents with formatting
361
- #[tokio::test]
362
- async fn test_typst_content_volume_parity_with_pandoc() {
363
- let documents = vec![("simple", 30.0), ("headings", 20.0)];
364
-
365
- for (doc_name, tolerance) in documents {
366
- let content = load_test_document(&format!("{}.typ", doc_name));
367
- let baseline = load_pandoc_baseline(doc_name);
368
- let config = ExtractionConfig::default();
369
-
370
- let result = extract_bytes(&content, "application/x-typst", &config)
371
- .await
372
- .unwrap_or_else(|_| panic!("Extraction failed for {}", doc_name));
373
-
374
- let baseline_size = baseline.len();
375
- let extracted_size = result.content.len();
376
-
377
- let is_within_tolerance = content_parity_check(&result.content, &baseline, tolerance);
378
-
379
- assert!(
380
- is_within_tolerance,
381
- "Content volume parity failed for {}: \
382
- Baseline: {} bytes, Extracted: {} bytes ({}% tolerance allowed). \
383
- Data loss indicates missing extraction features or formatting issues.",
384
- doc_name, baseline_size, extracted_size, tolerance
385
- );
386
- }
387
- }
388
-
389
- /// TEST 7: Blockquotes not implemented
390
- ///
391
- /// Blockquotes (using > syntax in other formats, typst uses #quote)
392
- /// are completely unimplemented, causing loss of semantic structure.
393
- ///
394
- /// Expected: Blockquote content should be extracted
395
- /// Current behavior: Feature not implemented
396
- /// WILL FAIL: Exposing missing blockquote support
397
- #[tokio::test]
398
- async fn test_typst_blockquote_handling() {
399
- let test_content = b"#quote[
400
- This is a blockquote.
401
- It should be extracted.
402
- ]";
403
-
404
- let config = ExtractionConfig::default();
405
- let result = extract_bytes(test_content, "application/x-typst", &config)
406
- .await
407
- .expect("Extraction failed");
408
-
409
- let has_blockquote_content =
410
- result.content.contains("blockquote") || result.content.contains("This is a blockquote");
411
-
412
- assert!(
413
- has_blockquote_content,
414
- "Blockquote content should be extracted. Blockquotes are not implemented \
415
- in the extractor, causing complete loss of quoted content."
416
- );
417
- }
418
-
419
- /// TEST 8: Inline code preservation
420
- ///
421
- /// Test that inline code blocks are properly extracted and marked.
422
- /// This ensures code snippets aren't corrupted.
423
- ///
424
- /// Expected: Inline code preserved with backticks or clearly marked
425
- /// Current behavior: May be corrupted
426
- /// WILL FAIL: If inline code is not preserved
427
- #[tokio::test]
428
- async fn test_typst_inline_code_preserved() {
429
- let content = load_test_document("advanced.typ");
430
- let baseline = load_pandoc_baseline("advanced");
431
- let config = ExtractionConfig::default();
432
-
433
- let result = extract_bytes(&content, "application/x-typst", &config)
434
- .await
435
- .expect("Extraction failed");
436
-
437
- let has_inline_code =
438
- result.content.contains("`") || (result.content.contains("code") && baseline.contains("`code`"));
439
-
440
- assert!(
441
- has_inline_code,
442
- "Inline code should be preserved with backticks or clearly marked."
443
- );
444
- }
445
-
446
- /// TEST 9: Inline math extraction
447
- ///
448
- /// Inline math (single $ delimiters) should be extracted and preserved.
449
- ///
450
- /// Expected: Inline math formulas preserved
451
- /// Current behavior: May be dropped
452
- /// WILL FAIL: If inline math is lost
453
- #[tokio::test]
454
- async fn test_typst_inline_math_preserved() {
455
- let content = load_test_document("advanced.typ");
456
- let baseline = load_pandoc_baseline("advanced");
457
- let config = ExtractionConfig::default();
458
-
459
- let result = extract_bytes(&content, "application/x-typst", &config)
460
- .await
461
- .expect("Extraction failed");
462
-
463
- let has_inline_math =
464
- result.content.contains("$") || result.content.contains("sqrt") || result.content.contains("equation");
465
-
466
- if baseline.contains("$") || baseline.contains("equation") {
467
- assert!(
468
- has_inline_math,
469
- "Inline math should be extracted. Mathematical formulas are being dropped."
470
- );
471
- }
472
- }
473
-
474
- /// TEST 10: Figures and captions
475
- ///
476
- /// Figure extraction with captions should preserve both image references
477
- /// and caption text.
478
- ///
479
- /// Expected: Figure content and captions extracted
480
- /// Current behavior: May be unimplemented
481
- #[tokio::test]
482
- async fn test_typst_figures_and_captions() {
483
- let test_content = b"#figure(
484
- image(\"example.png\"),
485
- caption: [This is a figure caption]
486
- )";
487
-
488
- let config = ExtractionConfig::default();
489
- let result = extract_bytes(test_content, "application/x-typst", &config)
490
- .await
491
- .expect("Extraction failed");
492
-
493
- let _has_caption = result.content.contains("caption") || result.content.contains("figure");
494
-
495
- println!(
496
- "Figure extraction result (feature may be unimplemented): {:?}",
497
- result.content
498
- );
499
- }
500
-
501
- /// TEST 11: Citation/reference handling
502
- ///
503
- /// Citations and references should be extracted when present.
504
- ///
505
- /// Expected: Citation markers and text preserved
506
- /// Current behavior: May be dropped
507
- #[tokio::test]
508
- async fn test_typst_citations_preserved() {
509
- let test_content = b"Here is a citation @smith2020.
510
-
511
- = References
512
-
513
- #bibliography()";
514
-
515
- let config = ExtractionConfig::default();
516
- let result = extract_bytes(test_content, "application/x-typst", &config)
517
- .await
518
- .expect("Extraction failed");
519
-
520
- let _has_citation = result.content.contains("@smith2020")
521
- || result.content.contains("smith")
522
- || result.content.contains("References");
523
-
524
- println!("Citation handling (may be limited): {:?}", result.content);
525
- }
526
-
527
- /// TEST 12: Link extraction and formatting
528
- ///
529
- /// Links should be extracted with both URL and link text.
530
- ///
531
- /// Expected: Links in markdown format [text](url)
532
- /// Current behavior: May lose URL or text
533
- #[tokio::test]
534
- async fn test_typst_link_extraction() {
535
- let content = load_test_document("advanced.typ");
536
- let _baseline = load_pandoc_baseline("advanced");
537
- let config = ExtractionConfig::default();
538
-
539
- let result = extract_bytes(&content, "application/x-typst", &config)
540
- .await
541
- .expect("Extraction failed");
542
-
543
- let has_link_content =
544
- result.content.contains("example") || result.content.contains("link") || result.content.contains("https");
545
-
546
- assert!(
547
- has_link_content,
548
- "Link content should be extracted. Links may be completely dropped."
549
- );
550
- }
551
-
552
- /// TEST 13: Unordered list extraction
553
- ///
554
- /// Both + and - list markers should be converted to standard format.
555
- ///
556
- /// Expected: All list items extracted and normalized
557
- /// Current behavior: May lose some items
558
- #[tokio::test]
559
- async fn test_typst_list_extraction() {
560
- let content = load_test_document("simple.typ");
561
- let _baseline = load_pandoc_baseline("simple");
562
- let config = ExtractionConfig::default();
563
-
564
- let result = extract_bytes(&content, "application/x-typst", &config)
565
- .await
566
- .expect("Extraction failed");
567
-
568
- let has_list_markers = result.content.contains("-") || result.content.contains("+");
569
- let has_list_content =
570
- result.content.contains("First") || result.content.contains("Second") || result.content.contains("item");
571
-
572
- assert!(
573
- has_list_markers || has_list_content,
574
- "List items should be extracted with markers or content preserved."
575
- );
576
- }
577
-
578
- /// TEST 14: Code block extraction
579
- ///
580
- /// Triple-backtick code blocks should be fully extracted with language specifiers.
581
- ///
582
- /// Expected: Code blocks with language markers preserved
583
- /// Current behavior: May be malformed
584
- #[tokio::test]
585
- async fn test_typst_code_block_extraction() {
586
- let content = load_test_document("advanced.typ");
587
- let _baseline = load_pandoc_baseline("advanced");
588
- let config = ExtractionConfig::default();
589
-
590
- let result = extract_bytes(&content, "application/x-typst", &config)
591
- .await
592
- .expect("Extraction failed");
593
-
594
- let has_code = result.content.contains("```")
595
- || result.content.contains("def")
596
- || result.content.contains("fibonacci")
597
- || result.content.contains("python");
598
-
599
- assert!(has_code, "Code blocks should be extracted with language specifiers.");
600
- }
601
-
602
- /// TEST 15: Bold and italic formatting
603
- ///
604
- /// Inline emphasis formatting should be preserved or normalized.
605
- ///
606
- /// Expected: Bold (*text*) and italic (_text_) markers present
607
- /// Current behavior: May be lost
608
- #[tokio::test]
609
- async fn test_typst_emphasis_formatting() {
610
- let content = load_test_document("advanced.typ");
611
- let config = ExtractionConfig::default();
612
-
613
- let result = extract_bytes(&content, "application/x-typst", &config)
614
- .await
615
- .expect("Extraction failed");
616
-
617
- let has_emphasis = result.content.contains("*") && result.content.contains("_");
618
-
619
- assert!(has_emphasis, "Bold and italic formatting markers should be preserved.");
620
- }
621
-
622
- /// TEST 16: Complex nested formatting
623
- ///
624
- /// Test handling of *_nested formatting_* combinations.
625
- ///
626
- /// Expected: Nested formatting preserved or flattened consistently
627
- /// Current behavior: May be malformed
628
- #[tokio::test]
629
- async fn test_typst_nested_formatting() {
630
- let test_content = b"This is *bold with _nested italic_* text.";
631
-
632
- let config = ExtractionConfig::default();
633
- let result = extract_bytes(test_content, "application/x-typst", &config)
634
- .await
635
- .expect("Extraction failed");
636
-
637
- let has_formatting = result.content.contains("*")
638
- || result.content.contains("_")
639
- || (result.content.contains("bold") && result.content.contains("italic"));
640
-
641
- assert!(
642
- has_formatting,
643
- "Nested formatting should be preserved or flattened consistently."
644
- );
645
- }
646
-
647
- /// TEST 17: Multiple paragraph handling
648
- ///
649
- /// Multiple paragraphs separated by blank lines should be preserved.
650
- ///
651
- /// Expected: Paragraph structure maintained
652
- /// Current behavior: May merge or lose paragraphs
653
- #[tokio::test]
654
- async fn test_typst_multiple_paragraphs() {
655
- let content = load_test_document("advanced.typ");
656
- let _baseline = load_pandoc_baseline("advanced");
657
- let config = ExtractionConfig::default();
658
-
659
- let result = extract_bytes(&content, "application/x-typst", &config)
660
- .await
661
- .expect("Extraction failed");
662
-
663
- let non_empty_lines: Vec<_> = result.content.lines().filter(|l| !l.trim().is_empty()).collect();
664
-
665
- assert!(
666
- non_empty_lines.len() >= 5,
667
- "Multiple paragraphs should be preserved. Found {} content lines.",
668
- non_empty_lines.len()
669
- );
670
- }
671
-
672
- /// TEST 18: Heading-content association
673
- ///
674
- /// Content should follow its heading logically in the output.
675
- ///
676
- /// Expected: Each heading followed by its content
677
- /// Current behavior: May be scrambled
678
- #[tokio::test]
679
- async fn test_typst_heading_content_association() {
680
- let content = load_test_document("advanced.typ");
681
- let config = ExtractionConfig::default();
682
-
683
- let result = extract_bytes(&content, "application/x-typst", &config)
684
- .await
685
- .expect("Extraction failed");
686
-
687
- let blocks = extract_content_blocks(&result.content);
688
-
689
- assert!(blocks.len() > 0, "Content blocks should be associated with headings.");
690
-
691
- for block in &blocks {
692
- assert!(block.len() > 0, "Content blocks should not be empty.");
693
- }
694
- }
695
-
696
- /// TEST 19: Whitespace normalization
697
- ///
698
- /// Multiple blank lines should be normalized consistently.
699
- ///
700
- /// Expected: Single blank lines between sections
701
- /// Current behavior: May have excessive whitespace
702
- #[tokio::test]
703
- async fn test_typst_whitespace_handling() {
704
- let content = load_test_document("advanced.typ");
705
- let config = ExtractionConfig::default();
706
-
707
- let result = extract_bytes(&content, "application/x-typst", &config)
708
- .await
709
- .expect("Extraction failed");
710
-
711
- let blank_line_runs: Vec<_> = result.content.split("\n\n\n").collect();
712
-
713
- assert!(
714
- blank_line_runs.len() <= 2,
715
- "Should not have excessive blank lines (triple newlines). \
716
- Found {} instances of triple newlines.",
717
- blank_line_runs.len() - 1
718
- );
719
- }
720
-
721
- /// TEST 20: Minimal document handling
722
- ///
723
- /// Even minimal documents should extract correctly.
724
- ///
725
- /// Expected: Basic content and structure
726
- /// Current behavior: May fail or lose content
727
- #[tokio::test]
728
- async fn test_typst_minimal_document() {
729
- let content = load_test_document("minimal.typ");
730
- let _baseline = load_pandoc_baseline("minimal");
731
- let config = ExtractionConfig::default();
732
-
733
- let result = extract_bytes(&content, "application/x-typst", &config)
734
- .await
735
- .expect("Extraction failed");
736
-
737
- assert!(
738
- !result.content.is_empty(),
739
- "Even minimal documents should extract some content."
740
- );
741
-
742
- assert!(
743
- result.content.len() > 0,
744
- "Minimal document should produce non-empty output."
745
- );
746
- }
747
-
748
- /// TEST 21: No directive pollution
749
- ///
750
- /// Extracted content should not contain #set, #let, #import directives.
751
- ///
752
- /// Expected: Clean extracted content without directives
753
- /// Current behavior: May include directives
754
- #[tokio::test]
755
- async fn test_typst_no_directive_pollution() {
756
- let content = load_test_document("advanced.typ");
757
- let config = ExtractionConfig::default();
758
-
759
- let result = extract_bytes(&content, "application/x-typst", &config)
760
- .await
761
- .expect("Extraction failed");
762
-
763
- let directive_count = count_directive_lines(&result.content);
764
-
765
- assert_eq!(
766
- directive_count, 0,
767
- "Extracted content should not contain directives (#set, #let, etc). \
768
- Found {} directive lines polluting the output.",
769
- directive_count
770
- );
771
- }
772
-
773
- /// TEST 22: Metadata field completeness
774
- ///
775
- /// All metadata fields from baseline should be present.
776
- ///
777
- /// Expected: Title, author, date, keywords all extracted
778
- /// Current behavior: Some fields missing
779
- #[tokio::test]
780
- async fn test_typst_metadata_field_completeness() {
781
- let content = load_test_document("advanced.typ");
782
- let config = ExtractionConfig::default();
783
-
784
- let result = extract_bytes(&content, "application/x-typst", &config)
785
- .await
786
- .expect("Extraction failed");
787
-
788
- let has_title = result.metadata.additional.get("title").is_some();
789
- let has_author = result.metadata.additional.get("author").is_some();
790
- let has_date = result.metadata.date.is_some();
791
-
792
- assert!(
793
- has_title && has_author && has_date,
794
- "All metadata fields should be extracted. \
795
- Title: {}, Author: {}, Date: {}",
796
- has_title,
797
- has_author,
798
- has_date
799
- );
800
- }
801
-
802
- /// TEST 23: Special character handling
803
- ///
804
- /// Unicode and special characters should be preserved.
805
- ///
806
- /// Expected: Special characters like ü, é, etc. preserved
807
- /// Current behavior: May be corrupted
808
- #[tokio::test]
809
- async fn test_typst_special_character_preservation() {
810
- let test_content = "Café with naïve français".as_bytes();
811
-
812
- let config = ExtractionConfig::default();
813
- let result = extract_bytes(test_content, "application/x-typst", &config)
814
- .await
815
- .expect("Extraction failed");
816
-
817
- let has_special_chars =
818
- result.content.contains("Café") || result.content.contains("naïve") || result.content.contains("français");
819
-
820
- assert!(
821
- has_special_chars,
822
- "Special characters should be preserved in extraction."
823
- );
824
- }
825
-
826
- /// TEST 24: Very long heading handling
827
- ///
828
- /// Long headings should not cause truncation or corruption.
829
- ///
830
- /// Expected: Full heading text preserved regardless of length
831
- /// Current behavior: May truncate
832
- #[tokio::test]
833
- async fn test_typst_long_heading_handling() {
834
- let test_content = b"= This is a very long heading that should be completely preserved without any truncation or corruption whatsoever";
835
-
836
- let config = ExtractionConfig::default();
837
- let result = extract_bytes(test_content, "application/x-typst", &config)
838
- .await
839
- .expect("Extraction failed");
840
-
841
- let has_heading_start = result.content.contains("very long heading");
842
-
843
- assert!(has_heading_start, "Long headings should not be truncated.");
844
- }
845
-
846
- /// TEST 25: Edge case - Empty heading recovery
847
- ///
848
- /// Even if a heading has no text, extraction should be robust.
849
- ///
850
- /// Expected: Graceful handling without crashes
851
- /// Current behavior: May panic or produce empty output
852
- #[tokio::test]
853
- async fn test_typst_empty_heading_edge_case() {
854
- let test_content = b"= \n\n== \nContent here";
855
-
856
- let config = ExtractionConfig::default();
857
- let result = extract_bytes(test_content, "application/x-typst", &config).await;
858
-
859
- match result {
860
- Ok(extraction) => {
861
- assert!(
862
- extraction.content.contains("Content"),
863
- "Should extract regular content even if some headings are empty."
864
- );
865
- }
866
- Err(_) => {}
867
- }
868
- }
869
-
870
- /// TEST 26: Regression - Basic heading extraction
871
- #[tokio::test]
872
- async fn test_typst_basic_heading_regression() {
873
- let test_content = b"= Main Heading\n\nContent here";
874
-
875
- let config = ExtractionConfig::default();
876
- let result = extract_bytes(test_content, "application/x-typst", &config)
877
- .await
878
- .expect("Extraction failed");
879
-
880
- assert!(
881
- result.content.contains("= Main Heading"),
882
- "Basic level-1 heading should be extracted."
883
- );
884
-
885
- assert!(result.content.contains("Content"), "Content should be extracted.");
886
- }
887
-
888
- /// TEST 27: Regression - Level 2 heading extraction
889
- #[tokio::test]
890
- async fn test_typst_level2_heading_regression() {
891
- let test_content = b"= Main\n\n== Subsection\n\nMore content";
892
-
893
- let config = ExtractionConfig::default();
894
- let result = extract_bytes(test_content, "application/x-typst", &config)
895
- .await
896
- .expect("Extraction failed");
897
-
898
- assert!(
899
- result.content.contains("== Subsection"),
900
- "Level 2 headings must be extracted."
901
- );
902
- }
903
-
904
- /// TEST 28: Regression - Basic metadata
905
- #[tokio::test]
906
- async fn test_typst_basic_metadata_regression() {
907
- let test_content = b"#set document(title: \"Test\", author: \"Me\")\n\n= Heading";
908
-
909
- let config = ExtractionConfig::default();
910
- let result = extract_bytes(test_content, "application/x-typst", &config)
911
- .await
912
- .expect("Extraction failed");
913
-
914
- assert!(
915
- result.metadata.additional.get("title").is_some(),
916
- "Title metadata must be extracted."
917
- );
918
-
919
- assert!(
920
- result.metadata.additional.get("author").is_some(),
921
- "Author metadata must be extracted."
922
- );
923
- }
924
-
925
- /// TEST 29: Regression - Bold formatting
926
- #[tokio::test]
927
- async fn test_typst_bold_regression() {
928
- let test_content = b"This is *bold text* here";
929
-
930
- let config = ExtractionConfig::default();
931
- let result = extract_bytes(test_content, "application/x-typst", &config)
932
- .await
933
- .expect("Extraction failed");
934
-
935
- assert!(
936
- result.content.contains("*bold*") || result.content.contains("bold"),
937
- "Bold text should be preserved."
938
- );
939
- }
940
-
941
- /// TEST 30: Regression - Inline code
942
- #[tokio::test]
943
- async fn test_typst_inline_code_regression() {
944
- let test_content = b"Use `println!(\"hello\")` in Rust";
945
-
946
- let config = ExtractionConfig::default();
947
- let result = extract_bytes(test_content, "application/x-typst", &config)
948
- .await
949
- .expect("Extraction failed");
950
-
951
- assert!(
952
- result.content.contains("`") && result.content.contains("println"),
953
- "Inline code should be preserved with backticks."
954
- );
955
- }
956
-
957
- /// TEST 31: Regression - Code blocks
958
- #[tokio::test]
959
- async fn test_typst_codeblock_regression() {
960
- let test_content = b"```rust\nfn main() {}\n```";
961
-
962
- let config = ExtractionConfig::default();
963
- let result = extract_bytes(test_content, "application/x-typst", &config)
964
- .await
965
- .expect("Extraction failed");
966
-
967
- assert!(
968
- result.content.contains("```"),
969
- "Code block delimiters should be preserved."
970
- );
971
-
972
- assert!(
973
- result.content.contains("fn main"),
974
- "Code block content should be preserved."
975
- );
976
- }
977
-
978
- /// TEST 32: Regression - List extraction
979
- #[tokio::test]
980
- async fn test_typst_list_regression() {
981
- let test_content = b"- Item 1\n+ Item 2\n- Item 3";
982
-
983
- let config = ExtractionConfig::default();
984
- let result = extract_bytes(test_content, "application/x-typst", &config)
985
- .await
986
- .expect("Extraction failed");
987
-
988
- assert!(
989
- result.content.contains("Item 1") && result.content.contains("Item 2") && result.content.contains("Item 3"),
990
- "All list items should be extracted."
991
- );
992
- }
993
-
994
- /// TEST 33: Regression - Math preservation
995
- #[tokio::test]
996
- async fn test_typst_math_regression() {
997
- let test_content = b"Formula: $E = mc^2$";
998
-
999
- let config = ExtractionConfig::default();
1000
- let result = extract_bytes(test_content, "application/x-typst", &config)
1001
- .await
1002
- .expect("Extraction failed");
1003
-
1004
- assert!(
1005
- result.content.contains("$") && (result.content.contains("mc") || result.content.contains("E")),
1006
- "Math formulas should be preserved."
1007
- );
1008
- }
1009
-
1010
- /// TEST 34: Regression - Link extraction
1011
- #[tokio::test]
1012
- async fn test_typst_link_regression() {
1013
- let test_content = b"Visit #link(\"https://example.com\")[example]";
1014
-
1015
- let config = ExtractionConfig::default();
1016
- let result = extract_bytes(test_content, "application/x-typst", &config)
1017
- .await
1018
- .expect("Extraction failed");
1019
-
1020
- assert!(
1021
- result.content.contains("example") || result.content.contains("example.com"),
1022
- "Link text or URL should be preserved."
1023
- );
1024
- }
1025
-
1026
- /// TEST 35: Regression - Table basic extraction
1027
- #[tokio::test]
1028
- async fn test_typst_table_regression() {
1029
- let test_content = b"#table(columns: 2, [A], [B], [1], [2])";
1030
-
1031
- let config = ExtractionConfig::default();
1032
- let result = extract_bytes(test_content, "application/x-typst", &config)
1033
- .await
1034
- .expect("Extraction failed");
1035
-
1036
- assert!(
1037
- result.content.contains("A") || result.content.contains("TABLE"),
1038
- "Table content should be extracted."
1039
- );
1040
- }
1041
-
1042
- /// TEST 36: Large document handling
1043
- #[tokio::test]
1044
- async fn test_typst_large_document_stress() {
1045
- let mut large_content = String::new();
1046
-
1047
- for i in 1..=50 {
1048
- large_content.push_str(&format!("= Heading {}\n\n", i));
1049
- large_content.push_str(&format!("Content for section {}.\n\n", i));
1050
- }
1051
-
1052
- let config = ExtractionConfig::default();
1053
- let result = extract_bytes(large_content.as_bytes(), "application/x-typst", &config)
1054
- .await
1055
- .expect("Extraction failed");
1056
-
1057
- let heading_count = extract_all_headings(&result.content).len();
1058
- assert!(
1059
- heading_count >= 40,
1060
- "Large documents should extract all headings. Found {} of 50.",
1061
- heading_count
1062
- );
1063
- }
1064
-
1065
- /// TEST 37: Deep nesting stress test
1066
- #[tokio::test]
1067
- async fn test_typst_deep_nesting_stress() {
1068
- let mut nested = String::new();
1069
-
1070
- for level in 1..=6 {
1071
- nested.push_str(&format!("{} Level {} Heading\n\n", "=".repeat(level), level));
1072
- nested.push_str(&format!("Content at level {}.\n\n", level));
1073
- }
1074
-
1075
- let config = ExtractionConfig::default();
1076
- let result = extract_bytes(nested.as_bytes(), "application/x-typst", &config)
1077
- .await
1078
- .expect("Extraction failed");
1079
-
1080
- for level in 1..=6 {
1081
- let count = count_heading_level(&result.content, level);
1082
- assert!(
1083
- count >= 1,
1084
- "Level {} heading should be extracted in deep nesting test.",
1085
- level
1086
- );
1087
- }
1088
- }
1089
-
1090
- /// TEST 38: Mixed formatting stress
1091
- #[tokio::test]
1092
- async fn test_typst_mixed_formatting_stress() {
1093
- let test_content = b"This text has *bold*, _italic_, `code`, and $math$ all mixed together!";
1094
-
1095
- let config = ExtractionConfig::default();
1096
- let result = extract_bytes(test_content, "application/x-typst", &config)
1097
- .await
1098
- .expect("Extraction failed");
1099
-
1100
- let has_formatting = (result.content.contains("*") || result.content.contains("bold"))
1101
- && (result.content.contains("_") || result.content.contains("italic"))
1102
- && (result.content.contains("`") || result.content.contains("code"))
1103
- && (result.content.contains("$") || result.content.contains("math"));
1104
-
1105
- assert!(has_formatting, "All mixed formatting should be preserved.");
1106
- }
1107
-
1108
- /// TEST 39: Unicode stress test
1109
- #[tokio::test]
1110
- async fn test_typst_unicode_stress() {
1111
- let test_content = "= Unicode Heading 中文 العربية\n\nContent with emojis: 🎉🚀💯\n\nGreek: α β γ δ ε ζ".as_bytes();
1112
-
1113
- let config = ExtractionConfig::default();
1114
- let result = extract_bytes(test_content, "application/x-typst", &config)
1115
- .await
1116
- .expect("Extraction failed");
1117
-
1118
- assert!(
1119
- result.content.contains("Unicode"),
1120
- "Unicode content should be preserved."
1121
- );
1122
- }
1123
-
1124
- /// TEST 40: Pathological whitespace
1125
- #[tokio::test]
1126
- async fn test_typst_pathological_whitespace() {
1127
- let test_content = b"= Heading\n\n\n\n\n\nContent with excessive blank lines\n\n\n\n\nMore content";
1128
-
1129
- let config = ExtractionConfig::default();
1130
- let result = extract_bytes(test_content, "application/x-typst", &config)
1131
- .await
1132
- .expect("Extraction failed");
1133
-
1134
- assert!(
1135
- result.content.contains("Heading") && result.content.contains("Content"),
1136
- "Should extract content even with excessive whitespace."
1137
- );
1138
- }
1139
-
1140
- /// TEST 41: Full document comparison - simple.typ
1141
- #[tokio::test]
1142
- async fn test_typst_full_simple_document_comparison() {
1143
- let content = load_test_document("simple.typ");
1144
- let _baseline = load_pandoc_baseline("simple");
1145
- let config = ExtractionConfig::default();
1146
-
1147
- let result = extract_bytes(&content, "application/x-typst", &config)
1148
- .await
1149
- .expect("Extraction failed");
1150
-
1151
- assert!(
1152
- result.content.len() > 50,
1153
- "simple.typ should extract substantial content"
1154
- );
1155
-
1156
- let heading_count = extract_all_headings(&result.content).len();
1157
- assert!(heading_count > 2, "simple.typ should have multiple sections");
1158
- }
1159
-
1160
- /// TEST 42: Full document comparison - advanced.typ
1161
- #[tokio::test]
1162
- async fn test_typst_full_advanced_document_comparison() {
1163
- let content = load_test_document("advanced.typ");
1164
- let _baseline = load_pandoc_baseline("advanced");
1165
- let config = ExtractionConfig::default();
1166
-
1167
- let result = extract_bytes(&content, "application/x-typst", &config)
1168
- .await
1169
- .expect("Extraction failed");
1170
-
1171
- assert!(
1172
- result.content.len() > 100,
1173
- "advanced.typ should extract comprehensive content"
1174
- );
1175
-
1176
- let heading_count = extract_all_headings(&result.content).len();
1177
- assert!(heading_count >= 5, "advanced.typ should preserve heading structure");
1178
- }
1179
-
1180
- /// TEST 43: MIME type consistency
1181
- ///
1182
- /// The extractor should support both standard MIME types for Typst.
1183
- /// Currently only supports application/x-typst, not text/x-typst.
1184
- #[tokio::test]
1185
- async fn test_typst_mime_type_consistency() {
1186
- let content = load_test_document("simple.typ");
1187
- let config = ExtractionConfig::default();
1188
-
1189
- let result_primary = extract_bytes(&content, "application/x-typst", &config)
1190
- .await
1191
- .expect("Primary MIME type should work");
1192
-
1193
- assert!(
1194
- result_primary.content.len() > 0,
1195
- "Primary MIME type should extract content"
1196
- );
1197
-
1198
- match extract_bytes(&content, "text/x-typst", &config).await {
1199
- Ok(result) => {
1200
- assert!(
1201
- result.content.len() > 0,
1202
- "Alternative MIME type should extract content if supported"
1203
- );
1204
- }
1205
- Err(_e) => {
1206
- println!("Note: text/x-typst is not currently supported (may be added in future)");
1207
- }
1208
- }
1209
- }
1210
-
1211
- /// TEST 44: Config parameter impact
1212
- #[tokio::test]
1213
- async fn test_typst_config_parameter_handling() {
1214
- let content = load_test_document("simple.typ");
1215
- let config = ExtractionConfig::default();
1216
-
1217
- let result = extract_bytes(&content, "application/x-typst", &config)
1218
- .await
1219
- .expect("Extraction failed");
1220
-
1221
- assert!(!result.content.is_empty(), "Extraction with default config should work");
1222
-
1223
- assert_eq!(result.mime_type, "application/x-typst", "MIME type should be preserved");
1224
- }
1225
-
1226
- /// TEST 45: Comparative heading analysis
1227
- ///
1228
- /// This final comprehensive test checks heading extraction
1229
- /// against the baseline to identify the exact scope of the heading loss bug.
1230
- #[tokio::test]
1231
- async fn test_typst_heading_loss_bug_analysis() {
1232
- let content = load_test_document("headings.typ");
1233
- let baseline = load_pandoc_baseline("headings");
1234
- let config = ExtractionConfig::default();
1235
-
1236
- let result = extract_bytes(&content, "application/x-typst", &config)
1237
- .await
1238
- .expect("Extraction failed");
1239
-
1240
- println!("\n===== HEADING EXTRACTION ANALYSIS =====");
1241
- println!("Baseline content:");
1242
- println!("{}", baseline);
1243
- println!("\nExtracted content:");
1244
- println!("{}", result.content);
1245
-
1246
- let extracted_headings = extract_all_headings(&result.content);
1247
- println!("\nExtracted headings: {}", extracted_headings.len());
1248
- for (i, h) in extracted_headings.iter().enumerate() {
1249
- println!(" {}: {}", i + 1, h);
1250
- }
1251
-
1252
- assert!(
1253
- extracted_headings.len() >= 6,
1254
- "BUG CONFIRMED: Heading loss detected. \
1255
- Expected 6 headings (1-6 levels), found {}. \
1256
- This is the 62% heading loss bug - only single '=' is matched, \
1257
- all '==' and higher are skipped entirely.",
1258
- extracted_headings.len()
1259
- );
1260
- }
1
+ #![allow(clippy::len_zero, clippy::unnecessary_get_then_check, clippy::single_match)]
2
+ #![cfg(feature = "office")]
3
+ //! Comprehensive behavioral tests for Typst extractor against Pandoc baselines.
4
+ //!
5
+ //! These tests expose the critical bugs found in code review:
6
+ //! 1. 62% heading loss bug - only matches single `=` headings
7
+ //! 2. Blockquotes not implemented
8
+ //! 3. Display math not extracted
9
+ //! 4. Nested table brackets cause corruption
10
+ //! 5. Empty headings output (just `= ` with no text)
11
+ //! 6. Regex failures silently lose metadata
12
+ //!
13
+ //! The tests are designed to FAIL initially, exposing real bugs that need fixing.
14
+ //! They compare extracted output against Pandoc baseline outputs for behavioral parity.
15
+
16
+ use kreuzberg::core::config::ExtractionConfig;
17
+ use kreuzberg::core::extractor::extract_bytes;
18
+ use std::{fs, path::PathBuf};
19
+
20
+ fn typst_doc_root() -> PathBuf {
21
+ PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/typst")
22
+ }
23
+
24
+ /// Load a test document from the test_documents/typst directory
25
+ fn load_test_document(filename: &str) -> Vec<u8> {
26
+ let path = typst_doc_root().join(filename);
27
+ fs::read(&path).unwrap_or_else(|_| panic!("Failed to read test document: {}", filename))
28
+ }
29
+
30
+ /// Load Pandoc baseline output for comparison
31
+ fn load_pandoc_baseline(filename_base: &str) -> String {
32
+ let path = typst_doc_root().join(format!("{filename_base}_pandoc_baseline.txt"));
33
+ fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read baseline: {}", filename_base))
34
+ }
35
+
36
+ /// Load Pandoc metadata JSON for comparison
37
+ fn load_pandoc_metadata(filename_base: &str) -> String {
38
+ let path = typst_doc_root().join(format!("{filename_base}_pandoc_meta.json"));
39
+ fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read metadata: {}", filename_base))
40
+ }
41
+
42
+ /// Count specific heading levels (= for level 1, == for level 2, etc.)
43
+ fn count_heading_level(content: &str, level: usize) -> usize {
44
+ let exact_marker = format!("{} ", "=".repeat(level));
45
+ content
46
+ .lines()
47
+ .filter(|l| l.trim_start().starts_with(&exact_marker))
48
+ .count()
49
+ }
50
+
51
+ /// Extract all headings from content
52
+ fn extract_all_headings(content: &str) -> Vec<String> {
53
+ content
54
+ .lines()
55
+ .filter(|l| {
56
+ let trimmed = l.trim_start();
57
+ trimmed.starts_with('=') && !trimmed.starts_with("#set")
58
+ })
59
+ .map(|l| l.to_string())
60
+ .collect()
61
+ }
62
+
63
+ /// Count lines that are pure metadata/directives (not content)
64
+ fn count_directive_lines(content: &str) -> usize {
65
+ content
66
+ .lines()
67
+ .filter(|l| {
68
+ let t = l.trim();
69
+ t.starts_with("#set ") || t.starts_with("#let ") || t.starts_with("#import ")
70
+ })
71
+ .count()
72
+ }
73
+
74
+ /// Count empty headings (headings with just `= ` and no text)
75
+ fn count_empty_headings(content: &str) -> usize {
76
+ content
77
+ .lines()
78
+ .filter(|l| {
79
+ let trimmed = l.trim_start();
80
+ trimmed == "="
81
+ || trimmed == "=="
82
+ || trimmed == "==="
83
+ || trimmed == "===="
84
+ || trimmed == "====="
85
+ || trimmed == "======"
86
+ })
87
+ .count()
88
+ }
89
+
90
+ /// Extract all text between headings (content blocks)
91
+ fn extract_content_blocks(content: &str) -> Vec<String> {
92
+ let mut blocks = Vec::new();
93
+ let mut current_block = String::new();
94
+ let mut in_block = false;
95
+
96
+ for line in content.lines() {
97
+ let trimmed = line.trim_start();
98
+ if trimmed.starts_with('=') && !trimmed.starts_with("#set") {
99
+ if !current_block.is_empty() {
100
+ blocks.push(current_block.trim().to_string());
101
+ current_block.clear();
102
+ }
103
+ in_block = true;
104
+ } else if in_block && !trimmed.is_empty() {
105
+ current_block.push_str(line);
106
+ current_block.push('\n');
107
+ }
108
+ }
109
+
110
+ if !current_block.is_empty() {
111
+ blocks.push(current_block.trim().to_string());
112
+ }
113
+
114
+ blocks
115
+ }
116
+
117
+ /// Check if content has reasonable parity with baseline (within tolerance)
118
+ fn content_parity_check(extracted: &str, baseline: &str, tolerance_percent: f64) -> bool {
119
+ let extracted_len = extracted.len();
120
+ let baseline_len = baseline.len();
121
+
122
+ if baseline_len == 0 {
123
+ return extracted_len == 0;
124
+ }
125
+
126
+ let ratio = (extracted_len as f64) / (baseline_len as f64);
127
+ let acceptable_min = 1.0 - (tolerance_percent / 100.0);
128
+ let acceptable_max = 1.0 + (tolerance_percent / 100.0);
129
+
130
+ ratio >= acceptable_min && ratio <= acceptable_max
131
+ }
132
+
133
+ // CRITICAL BUG TESTS - These expose the 45+ issues
134
+
135
+ /// TEST 1: CRITICAL - 62% heading loss bug
136
+ ///
137
+ /// The extractor only matches single `=` headings, completely skipping
138
+ /// `==`, `===`, and higher levels. This causes catastrophic data loss
139
+ /// in hierarchical documents.
140
+ ///
141
+ /// Expected: All heading levels should be extracted
142
+ /// Current behavior: Only level 1 headings extracted
143
+ /// WILL FAIL: Exposing the heading loss bug
144
+ #[tokio::test]
145
+ async fn test_typst_all_heading_levels_not_lost() {
146
+ let content = load_test_document("headings.typ");
147
+ let _baseline = load_pandoc_baseline("headings");
148
+ let config = ExtractionConfig::default();
149
+
150
+ let result = extract_bytes(&content, "application/x-typst", &config)
151
+ .await
152
+ .expect("Extraction failed");
153
+
154
+ let extracted_all_headings = extract_all_headings(&result.content);
155
+
156
+ assert!(
157
+ extracted_all_headings.len() >= 6,
158
+ "CRITICAL BUG: Only extracted {} headings, should have extracted 6+ heading levels. \
159
+ This is the 62% heading loss bug - extractor only matches '=' but skips '==', '===', etc.",
160
+ extracted_all_headings.len()
161
+ );
162
+
163
+ for level in 1..=6 {
164
+ let count = count_heading_level(&result.content, level);
165
+ assert_eq!(
166
+ count, 1,
167
+ "Heading level {} should appear exactly once (found {}). \
168
+ Missing heading levels cause data loss in hierarchical documents.",
169
+ level, count
170
+ );
171
+ }
172
+ }
173
+
174
+ /// TEST 2: Display math not extracted
175
+ ///
176
+ /// Display math ($$...$$) is completely lost from extraction,
177
+ /// breaking mathematical content preservation.
178
+ ///
179
+ /// Expected: Display math should be preserved in output
180
+ /// Current behavior: Silently dropped
181
+ /// WILL FAIL: Exposing display math loss
182
+ #[tokio::test]
183
+ async fn test_typst_display_math_preserved() {
184
+ let content = load_test_document("advanced.typ");
185
+ let baseline = load_pandoc_baseline("advanced");
186
+ let config = ExtractionConfig::default();
187
+
188
+ let result = extract_bytes(&content, "application/x-typst", &config)
189
+ .await
190
+ .expect("Extraction failed");
191
+
192
+ let has_display_math_in_baseline =
193
+ baseline.contains("²") || baseline.contains("Display math") || baseline.contains("x^2");
194
+
195
+ if has_display_math_in_baseline {
196
+ let our_has_math = result.content.contains("$")
197
+ || result.content.contains("Display")
198
+ || result.content.contains("²")
199
+ || result.content.contains("²");
200
+
201
+ assert!(
202
+ our_has_math,
203
+ "Display math should be extracted. Pandoc preserves mathematical notation, \
204
+ but extractor drops it entirely. This breaks scientific/academic documents."
205
+ );
206
+ }
207
+
208
+ let has_pythagorean = result.content.contains("^2")
209
+ || result.content.contains("²")
210
+ || result.content.contains("x") && result.content.contains("y") && result.content.contains("r");
211
+
212
+ assert!(
213
+ has_pythagorean,
214
+ "Pythagorean theorem expression should be present. Display math is being dropped."
215
+ );
216
+ }
217
+
218
+ /// TEST 3: Empty headings output
219
+ ///
220
+ /// When heading text is missing or malformed, extractor outputs
221
+ /// just the marker like "= " with no text, polluting the output.
222
+ ///
223
+ /// Expected: Either full heading text or no heading at all
224
+ /// Current behavior: "= " with no content
225
+ /// WILL FAIL: Exposing empty heading bug
226
+ #[tokio::test]
227
+ async fn test_typst_no_empty_headings_output() {
228
+ let content = load_test_document("headings.typ");
229
+ let config = ExtractionConfig::default();
230
+
231
+ let result = extract_bytes(&content, "application/x-typst", &config)
232
+ .await
233
+ .expect("Extraction failed");
234
+
235
+ let empty_headings = count_empty_headings(&result.content);
236
+
237
+ assert_eq!(
238
+ empty_headings, 0,
239
+ "Found {} empty heading lines (just '=' with no text). \
240
+ Extractor outputs malformed headings like '= ' with no text, \
241
+ corrupting the document structure.",
242
+ empty_headings
243
+ );
244
+
245
+ for heading in extract_all_headings(&result.content) {
246
+ let trimmed = heading.trim_start();
247
+ let after_marker = trimmed.trim_start_matches('=').trim();
248
+ assert!(
249
+ !after_marker.is_empty(),
250
+ "Heading '{}' has no text after marker. Should not output empty headings.",
251
+ trimmed
252
+ );
253
+ }
254
+ }
255
+
256
+ /// TEST 4: Metadata extraction fails with regex silently
257
+ ///
258
+ /// When regex patterns fail to match metadata fields,
259
+ /// the extractor silently returns None instead of logging/failing,
260
+ /// causing complete metadata loss for certain formats.
261
+ ///
262
+ /// Expected: All metadata fields should be extracted
263
+ /// Current behavior: Some formats fail silently
264
+ /// WILL FAIL: Exposing metadata loss
265
+ #[tokio::test]
266
+ async fn test_typst_metadata_extraction_completeness() {
267
+ let content = load_test_document("metadata.typ");
268
+ let _baseline_meta = load_pandoc_metadata("metadata");
269
+ let config = ExtractionConfig::default();
270
+
271
+ let result = extract_bytes(&content, "application/x-typst", &config)
272
+ .await
273
+ .expect("Extraction failed");
274
+
275
+ let has_title = result
276
+ .metadata
277
+ .additional
278
+ .get("title")
279
+ .map(|t| t.to_string().len() > 0)
280
+ .unwrap_or(false);
281
+
282
+ let has_author = result
283
+ .metadata
284
+ .additional
285
+ .get("author")
286
+ .map(|a| a.to_string().len() > 0)
287
+ .unwrap_or(false);
288
+
289
+ let has_keywords = result
290
+ .metadata
291
+ .additional
292
+ .get("keywords")
293
+ .map(|k| k.to_string().len() > 0)
294
+ .unwrap_or(false);
295
+
296
+ assert!(
297
+ has_title,
298
+ "Title metadata should be extracted. Regex pattern matching fails silently \
299
+ and metadata is lost with no error reporting."
300
+ );
301
+
302
+ assert!(
303
+ has_author,
304
+ "Author metadata should be extracted. Some metadata formats fail silently."
305
+ );
306
+
307
+ assert!(
308
+ has_keywords,
309
+ "Keywords should be extracted. Regex failures cause silent data loss."
310
+ );
311
+ }
312
+
313
+ /// TEST 5: Nested table brackets cause corruption
314
+ ///
315
+ /// Tables with nested brackets like [Name [full]] corrupt the
316
+ /// table content extraction because bracket counting is naive.
317
+ ///
318
+ /// Expected: Table cells should be extracted correctly even with nesting
319
+ /// Current behavior: Bracket nesting causes cells to be malformed
320
+ /// WILL FAIL: Exposing table corruption bug
321
+ #[tokio::test]
322
+ async fn test_typst_tables_with_nested_brackets_not_corrupted() {
323
+ let content = load_test_document("advanced.typ");
324
+ let baseline = load_pandoc_baseline("advanced");
325
+ let config = ExtractionConfig::default();
326
+
327
+ let result = extract_bytes(&content, "application/x-typst", &config)
328
+ .await
329
+ .expect("Extraction failed");
330
+
331
+ let has_table_in_baseline = baseline.contains("Name") && baseline.contains("Alice");
332
+
333
+ if has_table_in_baseline {
334
+ let table_content_extracted =
335
+ result.content.contains("Name") && result.content.contains("Alice") && result.content.contains("Age");
336
+
337
+ assert!(
338
+ table_content_extracted,
339
+ "Table content should be extracted correctly. Nested brackets cause corruption \
340
+ and table cells are malformed."
341
+ );
342
+
343
+ let corrupted_brackets = result.content.matches("[[").count();
344
+ assert_eq!(
345
+ corrupted_brackets, 0,
346
+ "Found corrupted bracket sequences [[. Table extraction with nested brackets \
347
+ produces malformed output."
348
+ );
349
+ }
350
+ }
351
+
352
+ /// TEST 6: Content volume parity - within tolerance of Pandoc
353
+ ///
354
+ /// Our extractor should extract roughly the same amount of content
355
+ /// as Pandoc (baseline). Large discrepancies indicate data loss or
356
+ /// noise injection.
357
+ ///
358
+ /// Expected: Within reasonable tolerance of baseline content size
359
+ /// Current behavior: Significant data loss on complex documents (e.g., advanced.typ)
360
+ /// WILL FAIL: Exposing data loss on complex documents with formatting
361
+ #[tokio::test]
362
+ async fn test_typst_content_volume_parity_with_pandoc() {
363
+ let documents = vec![("simple", 30.0), ("headings", 20.0)];
364
+
365
+ for (doc_name, tolerance) in documents {
366
+ let content = load_test_document(&format!("{}.typ", doc_name));
367
+ let baseline = load_pandoc_baseline(doc_name);
368
+ let config = ExtractionConfig::default();
369
+
370
+ let result = extract_bytes(&content, "application/x-typst", &config)
371
+ .await
372
+ .unwrap_or_else(|_| panic!("Extraction failed for {}", doc_name));
373
+
374
+ let baseline_size = baseline.len();
375
+ let extracted_size = result.content.len();
376
+
377
+ let is_within_tolerance = content_parity_check(&result.content, &baseline, tolerance);
378
+
379
+ assert!(
380
+ is_within_tolerance,
381
+ "Content volume parity failed for {}: \
382
+ Baseline: {} bytes, Extracted: {} bytes ({}% tolerance allowed). \
383
+ Data loss indicates missing extraction features or formatting issues.",
384
+ doc_name, baseline_size, extracted_size, tolerance
385
+ );
386
+ }
387
+ }
388
+
389
+ /// TEST 7: Blockquotes not implemented
390
+ ///
391
+ /// Blockquotes (using > syntax in other formats, typst uses #quote)
392
+ /// are completely unimplemented, causing loss of semantic structure.
393
+ ///
394
+ /// Expected: Blockquote content should be extracted
395
+ /// Current behavior: Feature not implemented
396
+ /// WILL FAIL: Exposing missing blockquote support
397
+ #[tokio::test]
398
+ async fn test_typst_blockquote_handling() {
399
+ let test_content = b"#quote[
400
+ This is a blockquote.
401
+ It should be extracted.
402
+ ]";
403
+
404
+ let config = ExtractionConfig::default();
405
+ let result = extract_bytes(test_content, "application/x-typst", &config)
406
+ .await
407
+ .expect("Extraction failed");
408
+
409
+ let has_blockquote_content =
410
+ result.content.contains("blockquote") || result.content.contains("This is a blockquote");
411
+
412
+ assert!(
413
+ has_blockquote_content,
414
+ "Blockquote content should be extracted. Blockquotes are not implemented \
415
+ in the extractor, causing complete loss of quoted content."
416
+ );
417
+ }
418
+
419
+ /// TEST 8: Inline code preservation
420
+ ///
421
+ /// Test that inline code blocks are properly extracted and marked.
422
+ /// This ensures code snippets aren't corrupted.
423
+ ///
424
+ /// Expected: Inline code preserved with backticks or clearly marked
425
+ /// Current behavior: May be corrupted
426
+ /// WILL FAIL: If inline code is not preserved
427
+ #[tokio::test]
428
+ async fn test_typst_inline_code_preserved() {
429
+ let content = load_test_document("advanced.typ");
430
+ let baseline = load_pandoc_baseline("advanced");
431
+ let config = ExtractionConfig::default();
432
+
433
+ let result = extract_bytes(&content, "application/x-typst", &config)
434
+ .await
435
+ .expect("Extraction failed");
436
+
437
+ let has_inline_code =
438
+ result.content.contains("`") || (result.content.contains("code") && baseline.contains("`code`"));
439
+
440
+ assert!(
441
+ has_inline_code,
442
+ "Inline code should be preserved with backticks or clearly marked."
443
+ );
444
+ }
445
+
446
+ /// TEST 9: Inline math extraction
447
+ ///
448
+ /// Inline math (single $ delimiters) should be extracted and preserved.
449
+ ///
450
+ /// Expected: Inline math formulas preserved
451
+ /// Current behavior: May be dropped
452
+ /// WILL FAIL: If inline math is lost
453
+ #[tokio::test]
454
+ async fn test_typst_inline_math_preserved() {
455
+ let content = load_test_document("advanced.typ");
456
+ let baseline = load_pandoc_baseline("advanced");
457
+ let config = ExtractionConfig::default();
458
+
459
+ let result = extract_bytes(&content, "application/x-typst", &config)
460
+ .await
461
+ .expect("Extraction failed");
462
+
463
+ let has_inline_math =
464
+ result.content.contains("$") || result.content.contains("sqrt") || result.content.contains("equation");
465
+
466
+ if baseline.contains("$") || baseline.contains("equation") {
467
+ assert!(
468
+ has_inline_math,
469
+ "Inline math should be extracted. Mathematical formulas are being dropped."
470
+ );
471
+ }
472
+ }
473
+
474
+ /// TEST 10: Figures and captions
475
+ ///
476
+ /// Figure extraction with captions should preserve both image references
477
+ /// and caption text.
478
+ ///
479
+ /// Expected: Figure content and captions extracted
480
+ /// Current behavior: May be unimplemented
481
+ #[tokio::test]
482
+ async fn test_typst_figures_and_captions() {
483
+ let test_content = b"#figure(
484
+ image(\"example.png\"),
485
+ caption: [This is a figure caption]
486
+ )";
487
+
488
+ let config = ExtractionConfig::default();
489
+ let result = extract_bytes(test_content, "application/x-typst", &config)
490
+ .await
491
+ .expect("Extraction failed");
492
+
493
+ let _has_caption = result.content.contains("caption") || result.content.contains("figure");
494
+
495
+ println!(
496
+ "Figure extraction result (feature may be unimplemented): {:?}",
497
+ result.content
498
+ );
499
+ }
500
+
501
+ /// TEST 11: Citation/reference handling
502
+ ///
503
+ /// Citations and references should be extracted when present.
504
+ ///
505
+ /// Expected: Citation markers and text preserved
506
+ /// Current behavior: May be dropped
507
+ #[tokio::test]
508
+ async fn test_typst_citations_preserved() {
509
+ let test_content = b"Here is a citation @smith2020.
510
+
511
+ = References
512
+
513
+ #bibliography()";
514
+
515
+ let config = ExtractionConfig::default();
516
+ let result = extract_bytes(test_content, "application/x-typst", &config)
517
+ .await
518
+ .expect("Extraction failed");
519
+
520
+ let _has_citation = result.content.contains("@smith2020")
521
+ || result.content.contains("smith")
522
+ || result.content.contains("References");
523
+
524
+ println!("Citation handling (may be limited): {:?}", result.content);
525
+ }
526
+
527
+ /// TEST 12: Link extraction and formatting
528
+ ///
529
+ /// Links should be extracted with both URL and link text.
530
+ ///
531
+ /// Expected: Links in markdown format [text](url)
532
+ /// Current behavior: May lose URL or text
533
+ #[tokio::test]
534
+ async fn test_typst_link_extraction() {
535
+ let content = load_test_document("advanced.typ");
536
+ let _baseline = load_pandoc_baseline("advanced");
537
+ let config = ExtractionConfig::default();
538
+
539
+ let result = extract_bytes(&content, "application/x-typst", &config)
540
+ .await
541
+ .expect("Extraction failed");
542
+
543
+ let has_link_content =
544
+ result.content.contains("example") || result.content.contains("link") || result.content.contains("https");
545
+
546
+ assert!(
547
+ has_link_content,
548
+ "Link content should be extracted. Links may be completely dropped."
549
+ );
550
+ }
551
+
552
+ /// TEST 13: Unordered list extraction
553
+ ///
554
+ /// Both + and - list markers should be converted to standard format.
555
+ ///
556
+ /// Expected: All list items extracted and normalized
557
+ /// Current behavior: May lose some items
558
+ #[tokio::test]
559
+ async fn test_typst_list_extraction() {
560
+ let content = load_test_document("simple.typ");
561
+ let _baseline = load_pandoc_baseline("simple");
562
+ let config = ExtractionConfig::default();
563
+
564
+ let result = extract_bytes(&content, "application/x-typst", &config)
565
+ .await
566
+ .expect("Extraction failed");
567
+
568
+ let has_list_markers = result.content.contains("-") || result.content.contains("+");
569
+ let has_list_content =
570
+ result.content.contains("First") || result.content.contains("Second") || result.content.contains("item");
571
+
572
+ assert!(
573
+ has_list_markers || has_list_content,
574
+ "List items should be extracted with markers or content preserved."
575
+ );
576
+ }
577
+
578
+ /// TEST 14: Code block extraction
579
+ ///
580
+ /// Triple-backtick code blocks should be fully extracted with language specifiers.
581
+ ///
582
+ /// Expected: Code blocks with language markers preserved
583
+ /// Current behavior: May be malformed
584
+ #[tokio::test]
585
+ async fn test_typst_code_block_extraction() {
586
+ let content = load_test_document("advanced.typ");
587
+ let _baseline = load_pandoc_baseline("advanced");
588
+ let config = ExtractionConfig::default();
589
+
590
+ let result = extract_bytes(&content, "application/x-typst", &config)
591
+ .await
592
+ .expect("Extraction failed");
593
+
594
+ let has_code = result.content.contains("```")
595
+ || result.content.contains("def")
596
+ || result.content.contains("fibonacci")
597
+ || result.content.contains("python");
598
+
599
+ assert!(has_code, "Code blocks should be extracted with language specifiers.");
600
+ }
601
+
602
+ /// TEST 15: Bold and italic formatting
603
+ ///
604
+ /// Inline emphasis formatting should be preserved or normalized.
605
+ ///
606
+ /// Expected: Bold (*text*) and italic (_text_) markers present
607
+ /// Current behavior: May be lost
608
+ #[tokio::test]
609
+ async fn test_typst_emphasis_formatting() {
610
+ let content = load_test_document("advanced.typ");
611
+ let config = ExtractionConfig::default();
612
+
613
+ let result = extract_bytes(&content, "application/x-typst", &config)
614
+ .await
615
+ .expect("Extraction failed");
616
+
617
+ let has_emphasis = result.content.contains("*") && result.content.contains("_");
618
+
619
+ assert!(has_emphasis, "Bold and italic formatting markers should be preserved.");
620
+ }
621
+
622
+ /// TEST 16: Complex nested formatting
623
+ ///
624
+ /// Test handling of *_nested formatting_* combinations.
625
+ ///
626
+ /// Expected: Nested formatting preserved or flattened consistently
627
+ /// Current behavior: May be malformed
628
+ #[tokio::test]
629
+ async fn test_typst_nested_formatting() {
630
+ let test_content = b"This is *bold with _nested italic_* text.";
631
+
632
+ let config = ExtractionConfig::default();
633
+ let result = extract_bytes(test_content, "application/x-typst", &config)
634
+ .await
635
+ .expect("Extraction failed");
636
+
637
+ let has_formatting = result.content.contains("*")
638
+ || result.content.contains("_")
639
+ || (result.content.contains("bold") && result.content.contains("italic"));
640
+
641
+ assert!(
642
+ has_formatting,
643
+ "Nested formatting should be preserved or flattened consistently."
644
+ );
645
+ }
646
+
647
+ /// TEST 17: Multiple paragraph handling
648
+ ///
649
+ /// Multiple paragraphs separated by blank lines should be preserved.
650
+ ///
651
+ /// Expected: Paragraph structure maintained
652
+ /// Current behavior: May merge or lose paragraphs
653
+ #[tokio::test]
654
+ async fn test_typst_multiple_paragraphs() {
655
+ let content = load_test_document("advanced.typ");
656
+ let _baseline = load_pandoc_baseline("advanced");
657
+ let config = ExtractionConfig::default();
658
+
659
+ let result = extract_bytes(&content, "application/x-typst", &config)
660
+ .await
661
+ .expect("Extraction failed");
662
+
663
+ let non_empty_lines: Vec<_> = result.content.lines().filter(|l| !l.trim().is_empty()).collect();
664
+
665
+ assert!(
666
+ non_empty_lines.len() >= 5,
667
+ "Multiple paragraphs should be preserved. Found {} content lines.",
668
+ non_empty_lines.len()
669
+ );
670
+ }
671
+
672
+ /// TEST 18: Heading-content association
673
+ ///
674
+ /// Content should follow its heading logically in the output.
675
+ ///
676
+ /// Expected: Each heading followed by its content
677
+ /// Current behavior: May be scrambled
678
+ #[tokio::test]
679
+ async fn test_typst_heading_content_association() {
680
+ let content = load_test_document("advanced.typ");
681
+ let config = ExtractionConfig::default();
682
+
683
+ let result = extract_bytes(&content, "application/x-typst", &config)
684
+ .await
685
+ .expect("Extraction failed");
686
+
687
+ let blocks = extract_content_blocks(&result.content);
688
+
689
+ assert!(blocks.len() > 0, "Content blocks should be associated with headings.");
690
+
691
+ for block in &blocks {
692
+ assert!(block.len() > 0, "Content blocks should not be empty.");
693
+ }
694
+ }
695
+
696
+ /// TEST 19: Whitespace normalization
697
+ ///
698
+ /// Multiple blank lines should be normalized consistently.
699
+ ///
700
+ /// Expected: Single blank lines between sections
701
+ /// Current behavior: May have excessive whitespace
702
+ #[tokio::test]
703
+ async fn test_typst_whitespace_handling() {
704
+ let content = load_test_document("advanced.typ");
705
+ let config = ExtractionConfig::default();
706
+
707
+ let result = extract_bytes(&content, "application/x-typst", &config)
708
+ .await
709
+ .expect("Extraction failed");
710
+
711
+ let blank_line_runs: Vec<_> = result.content.split("\n\n\n").collect();
712
+
713
+ assert!(
714
+ blank_line_runs.len() <= 2,
715
+ "Should not have excessive blank lines (triple newlines). \
716
+ Found {} instances of triple newlines.",
717
+ blank_line_runs.len() - 1
718
+ );
719
+ }
720
+
721
+ /// TEST 20: Minimal document handling
722
+ ///
723
+ /// Even minimal documents should extract correctly.
724
+ ///
725
+ /// Expected: Basic content and structure
726
+ /// Current behavior: May fail or lose content
727
+ #[tokio::test]
728
+ async fn test_typst_minimal_document() {
729
+ let content = load_test_document("minimal.typ");
730
+ let _baseline = load_pandoc_baseline("minimal");
731
+ let config = ExtractionConfig::default();
732
+
733
+ let result = extract_bytes(&content, "application/x-typst", &config)
734
+ .await
735
+ .expect("Extraction failed");
736
+
737
+ assert!(
738
+ !result.content.is_empty(),
739
+ "Even minimal documents should extract some content."
740
+ );
741
+
742
+ assert!(
743
+ result.content.len() > 0,
744
+ "Minimal document should produce non-empty output."
745
+ );
746
+ }
747
+
748
+ /// TEST 21: No directive pollution
749
+ ///
750
+ /// Extracted content should not contain #set, #let, #import directives.
751
+ ///
752
+ /// Expected: Clean extracted content without directives
753
+ /// Current behavior: May include directives
754
+ #[tokio::test]
755
+ async fn test_typst_no_directive_pollution() {
756
+ let content = load_test_document("advanced.typ");
757
+ let config = ExtractionConfig::default();
758
+
759
+ let result = extract_bytes(&content, "application/x-typst", &config)
760
+ .await
761
+ .expect("Extraction failed");
762
+
763
+ let directive_count = count_directive_lines(&result.content);
764
+
765
+ assert_eq!(
766
+ directive_count, 0,
767
+ "Extracted content should not contain directives (#set, #let, etc). \
768
+ Found {} directive lines polluting the output.",
769
+ directive_count
770
+ );
771
+ }
772
+
773
+ /// TEST 22: Metadata field completeness
774
+ ///
775
+ /// All metadata fields from baseline should be present.
776
+ ///
777
+ /// Expected: Title, author, date, keywords all extracted
778
+ /// Current behavior: Some fields missing
779
+ #[tokio::test]
780
+ async fn test_typst_metadata_field_completeness() {
781
+ let content = load_test_document("advanced.typ");
782
+ let config = ExtractionConfig::default();
783
+
784
+ let result = extract_bytes(&content, "application/x-typst", &config)
785
+ .await
786
+ .expect("Extraction failed");
787
+
788
+ let has_title = result.metadata.additional.get("title").is_some();
789
+ let has_author = result.metadata.additional.get("author").is_some();
790
+ let has_date = result.metadata.date.is_some();
791
+
792
+ assert!(
793
+ has_title && has_author && has_date,
794
+ "All metadata fields should be extracted. \
795
+ Title: {}, Author: {}, Date: {}",
796
+ has_title,
797
+ has_author,
798
+ has_date
799
+ );
800
+ }
801
+
802
+ /// TEST 23: Special character handling
803
+ ///
804
+ /// Unicode and special characters should be preserved.
805
+ ///
806
+ /// Expected: Special characters like ü, é, etc. preserved
807
+ /// Current behavior: May be corrupted
808
+ #[tokio::test]
809
+ async fn test_typst_special_character_preservation() {
810
+ let test_content = "Café with naïve français".as_bytes();
811
+
812
+ let config = ExtractionConfig::default();
813
+ let result = extract_bytes(test_content, "application/x-typst", &config)
814
+ .await
815
+ .expect("Extraction failed");
816
+
817
+ let has_special_chars =
818
+ result.content.contains("Café") || result.content.contains("naïve") || result.content.contains("français");
819
+
820
+ assert!(
821
+ has_special_chars,
822
+ "Special characters should be preserved in extraction."
823
+ );
824
+ }
825
+
826
+ /// TEST 24: Very long heading handling
827
+ ///
828
+ /// Long headings should not cause truncation or corruption.
829
+ ///
830
+ /// Expected: Full heading text preserved regardless of length
831
+ /// Current behavior: May truncate
832
+ #[tokio::test]
833
+ async fn test_typst_long_heading_handling() {
834
+ let test_content = b"= This is a very long heading that should be completely preserved without any truncation or corruption whatsoever";
835
+
836
+ let config = ExtractionConfig::default();
837
+ let result = extract_bytes(test_content, "application/x-typst", &config)
838
+ .await
839
+ .expect("Extraction failed");
840
+
841
+ let has_heading_start = result.content.contains("very long heading");
842
+
843
+ assert!(has_heading_start, "Long headings should not be truncated.");
844
+ }
845
+
846
+ /// TEST 25: Edge case - Empty heading recovery
847
+ ///
848
+ /// Even if a heading has no text, extraction should be robust.
849
+ ///
850
+ /// Expected: Graceful handling without crashes
851
+ /// Current behavior: May panic or produce empty output
852
+ #[tokio::test]
853
+ async fn test_typst_empty_heading_edge_case() {
854
+ let test_content = b"= \n\n== \nContent here";
855
+
856
+ let config = ExtractionConfig::default();
857
+ let result = extract_bytes(test_content, "application/x-typst", &config).await;
858
+
859
+ match result {
860
+ Ok(extraction) => {
861
+ assert!(
862
+ extraction.content.contains("Content"),
863
+ "Should extract regular content even if some headings are empty."
864
+ );
865
+ }
866
+ Err(_) => {}
867
+ }
868
+ }
869
+
870
+ /// TEST 26: Regression - Basic heading extraction
871
+ #[tokio::test]
872
+ async fn test_typst_basic_heading_regression() {
873
+ let test_content = b"= Main Heading\n\nContent here";
874
+
875
+ let config = ExtractionConfig::default();
876
+ let result = extract_bytes(test_content, "application/x-typst", &config)
877
+ .await
878
+ .expect("Extraction failed");
879
+
880
+ assert!(
881
+ result.content.contains("= Main Heading"),
882
+ "Basic level-1 heading should be extracted."
883
+ );
884
+
885
+ assert!(result.content.contains("Content"), "Content should be extracted.");
886
+ }
887
+
888
+ /// TEST 27: Regression - Level 2 heading extraction
889
+ #[tokio::test]
890
+ async fn test_typst_level2_heading_regression() {
891
+ let test_content = b"= Main\n\n== Subsection\n\nMore content";
892
+
893
+ let config = ExtractionConfig::default();
894
+ let result = extract_bytes(test_content, "application/x-typst", &config)
895
+ .await
896
+ .expect("Extraction failed");
897
+
898
+ assert!(
899
+ result.content.contains("== Subsection"),
900
+ "Level 2 headings must be extracted."
901
+ );
902
+ }
903
+
904
+ /// TEST 28: Regression - Basic metadata
905
+ #[tokio::test]
906
+ async fn test_typst_basic_metadata_regression() {
907
+ let test_content = b"#set document(title: \"Test\", author: \"Me\")\n\n= Heading";
908
+
909
+ let config = ExtractionConfig::default();
910
+ let result = extract_bytes(test_content, "application/x-typst", &config)
911
+ .await
912
+ .expect("Extraction failed");
913
+
914
+ assert!(
915
+ result.metadata.additional.get("title").is_some(),
916
+ "Title metadata must be extracted."
917
+ );
918
+
919
+ assert!(
920
+ result.metadata.additional.get("author").is_some(),
921
+ "Author metadata must be extracted."
922
+ );
923
+ }
924
+
925
+ /// TEST 29: Regression - Bold formatting
926
+ #[tokio::test]
927
+ async fn test_typst_bold_regression() {
928
+ let test_content = b"This is *bold text* here";
929
+
930
+ let config = ExtractionConfig::default();
931
+ let result = extract_bytes(test_content, "application/x-typst", &config)
932
+ .await
933
+ .expect("Extraction failed");
934
+
935
+ assert!(
936
+ result.content.contains("*bold*") || result.content.contains("bold"),
937
+ "Bold text should be preserved."
938
+ );
939
+ }
940
+
941
+ /// TEST 30: Regression - Inline code
942
+ #[tokio::test]
943
+ async fn test_typst_inline_code_regression() {
944
+ let test_content = b"Use `println!(\"hello\")` in Rust";
945
+
946
+ let config = ExtractionConfig::default();
947
+ let result = extract_bytes(test_content, "application/x-typst", &config)
948
+ .await
949
+ .expect("Extraction failed");
950
+
951
+ assert!(
952
+ result.content.contains("`") && result.content.contains("println"),
953
+ "Inline code should be preserved with backticks."
954
+ );
955
+ }
956
+
957
+ /// TEST 31: Regression - Code blocks
958
+ #[tokio::test]
959
+ async fn test_typst_codeblock_regression() {
960
+ let test_content = b"```rust\nfn main() {}\n```";
961
+
962
+ let config = ExtractionConfig::default();
963
+ let result = extract_bytes(test_content, "application/x-typst", &config)
964
+ .await
965
+ .expect("Extraction failed");
966
+
967
+ assert!(
968
+ result.content.contains("```"),
969
+ "Code block delimiters should be preserved."
970
+ );
971
+
972
+ assert!(
973
+ result.content.contains("fn main"),
974
+ "Code block content should be preserved."
975
+ );
976
+ }
977
+
978
+ /// TEST 32: Regression - List extraction
979
+ #[tokio::test]
980
+ async fn test_typst_list_regression() {
981
+ let test_content = b"- Item 1\n+ Item 2\n- Item 3";
982
+
983
+ let config = ExtractionConfig::default();
984
+ let result = extract_bytes(test_content, "application/x-typst", &config)
985
+ .await
986
+ .expect("Extraction failed");
987
+
988
+ assert!(
989
+ result.content.contains("Item 1") && result.content.contains("Item 2") && result.content.contains("Item 3"),
990
+ "All list items should be extracted."
991
+ );
992
+ }
993
+
994
+ /// TEST 33: Regression - Math preservation
995
+ #[tokio::test]
996
+ async fn test_typst_math_regression() {
997
+ let test_content = b"Formula: $E = mc^2$";
998
+
999
+ let config = ExtractionConfig::default();
1000
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1001
+ .await
1002
+ .expect("Extraction failed");
1003
+
1004
+ assert!(
1005
+ result.content.contains("$") && (result.content.contains("mc") || result.content.contains("E")),
1006
+ "Math formulas should be preserved."
1007
+ );
1008
+ }
1009
+
1010
+ /// TEST 34: Regression - Link extraction
1011
+ #[tokio::test]
1012
+ async fn test_typst_link_regression() {
1013
+ let test_content = b"Visit #link(\"https://example.com\")[example]";
1014
+
1015
+ let config = ExtractionConfig::default();
1016
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1017
+ .await
1018
+ .expect("Extraction failed");
1019
+
1020
+ assert!(
1021
+ result.content.contains("example") || result.content.contains("example.com"),
1022
+ "Link text or URL should be preserved."
1023
+ );
1024
+ }
1025
+
1026
+ /// TEST 35: Regression - Table basic extraction
1027
+ #[tokio::test]
1028
+ async fn test_typst_table_regression() {
1029
+ let test_content = b"#table(columns: 2, [A], [B], [1], [2])";
1030
+
1031
+ let config = ExtractionConfig::default();
1032
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1033
+ .await
1034
+ .expect("Extraction failed");
1035
+
1036
+ assert!(
1037
+ result.content.contains("A") || result.content.contains("TABLE"),
1038
+ "Table content should be extracted."
1039
+ );
1040
+ }
1041
+
1042
+ /// TEST 36: Large document handling
1043
+ #[tokio::test]
1044
+ async fn test_typst_large_document_stress() {
1045
+ let mut large_content = String::new();
1046
+
1047
+ for i in 1..=50 {
1048
+ large_content.push_str(&format!("= Heading {}\n\n", i));
1049
+ large_content.push_str(&format!("Content for section {}.\n\n", i));
1050
+ }
1051
+
1052
+ let config = ExtractionConfig::default();
1053
+ let result = extract_bytes(large_content.as_bytes(), "application/x-typst", &config)
1054
+ .await
1055
+ .expect("Extraction failed");
1056
+
1057
+ let heading_count = extract_all_headings(&result.content).len();
1058
+ assert!(
1059
+ heading_count >= 40,
1060
+ "Large documents should extract all headings. Found {} of 50.",
1061
+ heading_count
1062
+ );
1063
+ }
1064
+
1065
+ /// TEST 37: Deep nesting stress test
1066
+ #[tokio::test]
1067
+ async fn test_typst_deep_nesting_stress() {
1068
+ let mut nested = String::new();
1069
+
1070
+ for level in 1..=6 {
1071
+ nested.push_str(&format!("{} Level {} Heading\n\n", "=".repeat(level), level));
1072
+ nested.push_str(&format!("Content at level {}.\n\n", level));
1073
+ }
1074
+
1075
+ let config = ExtractionConfig::default();
1076
+ let result = extract_bytes(nested.as_bytes(), "application/x-typst", &config)
1077
+ .await
1078
+ .expect("Extraction failed");
1079
+
1080
+ for level in 1..=6 {
1081
+ let count = count_heading_level(&result.content, level);
1082
+ assert!(
1083
+ count >= 1,
1084
+ "Level {} heading should be extracted in deep nesting test.",
1085
+ level
1086
+ );
1087
+ }
1088
+ }
1089
+
1090
+ /// TEST 38: Mixed formatting stress
1091
+ #[tokio::test]
1092
+ async fn test_typst_mixed_formatting_stress() {
1093
+ let test_content = b"This text has *bold*, _italic_, `code`, and $math$ all mixed together!";
1094
+
1095
+ let config = ExtractionConfig::default();
1096
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1097
+ .await
1098
+ .expect("Extraction failed");
1099
+
1100
+ let has_formatting = (result.content.contains("*") || result.content.contains("bold"))
1101
+ && (result.content.contains("_") || result.content.contains("italic"))
1102
+ && (result.content.contains("`") || result.content.contains("code"))
1103
+ && (result.content.contains("$") || result.content.contains("math"));
1104
+
1105
+ assert!(has_formatting, "All mixed formatting should be preserved.");
1106
+ }
1107
+
1108
+ /// TEST 39: Unicode stress test
1109
+ #[tokio::test]
1110
+ async fn test_typst_unicode_stress() {
1111
+ let test_content = "= Unicode Heading 中文 العربية\n\nContent with emojis: 🎉🚀💯\n\nGreek: α β γ δ ε ζ".as_bytes();
1112
+
1113
+ let config = ExtractionConfig::default();
1114
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1115
+ .await
1116
+ .expect("Extraction failed");
1117
+
1118
+ assert!(
1119
+ result.content.contains("Unicode"),
1120
+ "Unicode content should be preserved."
1121
+ );
1122
+ }
1123
+
1124
+ /// TEST 40: Pathological whitespace
1125
+ #[tokio::test]
1126
+ async fn test_typst_pathological_whitespace() {
1127
+ let test_content = b"= Heading\n\n\n\n\n\nContent with excessive blank lines\n\n\n\n\nMore content";
1128
+
1129
+ let config = ExtractionConfig::default();
1130
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1131
+ .await
1132
+ .expect("Extraction failed");
1133
+
1134
+ assert!(
1135
+ result.content.contains("Heading") && result.content.contains("Content"),
1136
+ "Should extract content even with excessive whitespace."
1137
+ );
1138
+ }
1139
+
1140
+ /// TEST 41: Full document comparison - simple.typ
1141
+ #[tokio::test]
1142
+ async fn test_typst_full_simple_document_comparison() {
1143
+ let content = load_test_document("simple.typ");
1144
+ let _baseline = load_pandoc_baseline("simple");
1145
+ let config = ExtractionConfig::default();
1146
+
1147
+ let result = extract_bytes(&content, "application/x-typst", &config)
1148
+ .await
1149
+ .expect("Extraction failed");
1150
+
1151
+ assert!(
1152
+ result.content.len() > 50,
1153
+ "simple.typ should extract substantial content"
1154
+ );
1155
+
1156
+ let heading_count = extract_all_headings(&result.content).len();
1157
+ assert!(heading_count > 2, "simple.typ should have multiple sections");
1158
+ }
1159
+
1160
+ /// TEST 42: Full document comparison - advanced.typ
1161
+ #[tokio::test]
1162
+ async fn test_typst_full_advanced_document_comparison() {
1163
+ let content = load_test_document("advanced.typ");
1164
+ let _baseline = load_pandoc_baseline("advanced");
1165
+ let config = ExtractionConfig::default();
1166
+
1167
+ let result = extract_bytes(&content, "application/x-typst", &config)
1168
+ .await
1169
+ .expect("Extraction failed");
1170
+
1171
+ assert!(
1172
+ result.content.len() > 100,
1173
+ "advanced.typ should extract comprehensive content"
1174
+ );
1175
+
1176
+ let heading_count = extract_all_headings(&result.content).len();
1177
+ assert!(heading_count >= 5, "advanced.typ should preserve heading structure");
1178
+ }
1179
+
1180
+ /// TEST 43: MIME type consistency
1181
+ ///
1182
+ /// The extractor should support both standard MIME types for Typst.
1183
+ /// Currently only supports application/x-typst, not text/x-typst.
1184
+ #[tokio::test]
1185
+ async fn test_typst_mime_type_consistency() {
1186
+ let content = load_test_document("simple.typ");
1187
+ let config = ExtractionConfig::default();
1188
+
1189
+ let result_primary = extract_bytes(&content, "application/x-typst", &config)
1190
+ .await
1191
+ .expect("Primary MIME type should work");
1192
+
1193
+ assert!(
1194
+ result_primary.content.len() > 0,
1195
+ "Primary MIME type should extract content"
1196
+ );
1197
+
1198
+ match extract_bytes(&content, "text/x-typst", &config).await {
1199
+ Ok(result) => {
1200
+ assert!(
1201
+ result.content.len() > 0,
1202
+ "Alternative MIME type should extract content if supported"
1203
+ );
1204
+ }
1205
+ Err(_e) => {
1206
+ println!("Note: text/x-typst is not currently supported (may be added in future)");
1207
+ }
1208
+ }
1209
+ }
1210
+
1211
+ /// TEST 44: Config parameter impact
1212
+ #[tokio::test]
1213
+ async fn test_typst_config_parameter_handling() {
1214
+ let content = load_test_document("simple.typ");
1215
+ let config = ExtractionConfig::default();
1216
+
1217
+ let result = extract_bytes(&content, "application/x-typst", &config)
1218
+ .await
1219
+ .expect("Extraction failed");
1220
+
1221
+ assert!(!result.content.is_empty(), "Extraction with default config should work");
1222
+
1223
+ assert_eq!(result.mime_type, "application/x-typst", "MIME type should be preserved");
1224
+ }
1225
+
1226
+ /// TEST 45: Comparative heading analysis
1227
+ ///
1228
+ /// This final comprehensive test checks heading extraction
1229
+ /// against the baseline to identify the exact scope of the heading loss bug.
1230
+ #[tokio::test]
1231
+ async fn test_typst_heading_loss_bug_analysis() {
1232
+ let content = load_test_document("headings.typ");
1233
+ let baseline = load_pandoc_baseline("headings");
1234
+ let config = ExtractionConfig::default();
1235
+
1236
+ let result = extract_bytes(&content, "application/x-typst", &config)
1237
+ .await
1238
+ .expect("Extraction failed");
1239
+
1240
+ println!("\n===== HEADING EXTRACTION ANALYSIS =====");
1241
+ println!("Baseline content:");
1242
+ println!("{}", baseline);
1243
+ println!("\nExtracted content:");
1244
+ println!("{}", result.content);
1245
+
1246
+ let extracted_headings = extract_all_headings(&result.content);
1247
+ println!("\nExtracted headings: {}", extracted_headings.len());
1248
+ for (i, h) in extracted_headings.iter().enumerate() {
1249
+ println!(" {}: {}", i + 1, h);
1250
+ }
1251
+
1252
+ assert!(
1253
+ extracted_headings.len() >= 6,
1254
+ "BUG CONFIRMED: Heading loss detected. \
1255
+ Expected 6 headings (1-6 levels), found {}. \
1256
+ This is the 62% heading loss bug - only single '=' is matched, \
1257
+ all '==' and higher are skipped entirely.",
1258
+ extracted_headings.len()
1259
+ );
1260
+ }