kreuzberg 4.0.0.rc1 → 4.0.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (342) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -8
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -534
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -9
  7. data/Gemfile.lock +9 -109
  8. data/README.md +426 -421
  9. data/Rakefile +25 -25
  10. data/Steepfile +47 -47
  11. data/examples/async_patterns.rb +341 -340
  12. data/ext/kreuzberg_rb/extconf.rb +45 -35
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
  15. data/ext/kreuzberg_rb/native/README.md +425 -425
  16. data/ext/kreuzberg_rb/native/build.rs +15 -17
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
  22. data/extconf.rb +28 -28
  23. data/kreuzberg.gemspec +148 -105
  24. data/lib/kreuzberg/api_proxy.rb +142 -142
  25. data/lib/kreuzberg/cache_api.rb +46 -45
  26. data/lib/kreuzberg/cli.rb +55 -55
  27. data/lib/kreuzberg/cli_proxy.rb +127 -127
  28. data/lib/kreuzberg/config.rb +691 -684
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -50
  31. data/lib/kreuzberg/extraction_api.rb +85 -84
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -186
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -86
  35. data/lib/kreuzberg/result.rb +216 -216
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -79
  37. data/lib/kreuzberg/validator_protocol.rb +89 -89
  38. data/lib/kreuzberg/version.rb +5 -5
  39. data/lib/kreuzberg.rb +103 -82
  40. data/sig/kreuzberg/internal.rbs +184 -184
  41. data/sig/kreuzberg.rbs +520 -468
  42. data/spec/binding/cache_spec.rb +227 -227
  43. data/spec/binding/cli_proxy_spec.rb +85 -87
  44. data/spec/binding/cli_spec.rb +55 -54
  45. data/spec/binding/config_spec.rb +345 -345
  46. data/spec/binding/config_validation_spec.rb +283 -283
  47. data/spec/binding/error_handling_spec.rb +213 -213
  48. data/spec/binding/errors_spec.rb +66 -66
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  51. data/spec/binding/plugins/validator_spec.rb +274 -274
  52. data/spec/fixtures/config.toml +39 -39
  53. data/spec/fixtures/config.yaml +41 -42
  54. data/spec/fixtures/invalid_config.toml +4 -4
  55. data/spec/smoke/package_spec.rb +178 -178
  56. data/spec/spec_helper.rb +42 -42
  57. data/vendor/kreuzberg/Cargo.toml +204 -134
  58. data/vendor/kreuzberg/README.md +175 -175
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -460
  61. data/vendor/kreuzberg/src/api/error.rs +81 -81
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -199
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -79
  64. data/vendor/kreuzberg/src/api/server.rs +353 -353
  65. data/vendor/kreuzberg/src/api/types.rs +170 -170
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -1032
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
  71. data/vendor/kreuzberg/src/core/io.rs +329 -327
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -615
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -42
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -323
  76. data/vendor/kreuzberg/src/error.rs +431 -431
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -854
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -553
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -368
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -328
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -269
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -129
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -410
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -195
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -242
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
  136. data/vendor/kreuzberg/src/lib.rs +105 -102
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -122
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -420
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -161
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -19
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -697
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
  176. data/vendor/kreuzberg/src/types.rs +903 -873
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -17
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -959
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -966
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -580
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -493
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -325
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -393
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -159
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -253
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -404
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +90 -95
  331. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  332. data/spec/examples.txt +0 -104
  333. data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
  334. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
  335. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
  336. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
  337. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
  338. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
  339. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
  340. data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
  341. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
  342. data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
@@ -0,0 +1,1259 @@
1
+ #![allow(clippy::len_zero, clippy::unnecessary_get_then_check, clippy::single_match)]
2
+ //! Comprehensive behavioral tests for Typst extractor against Pandoc baselines.
3
+ //!
4
+ //! These tests expose the critical bugs found in code review:
5
+ //! 1. 62% heading loss bug - only matches single `=` headings
6
+ //! 2. Blockquotes not implemented
7
+ //! 3. Display math not extracted
8
+ //! 4. Nested table brackets cause corruption
9
+ //! 5. Empty headings output (just `= ` with no text)
10
+ //! 6. Regex failures silently lose metadata
11
+ //!
12
+ //! The tests are designed to FAIL initially, exposing real bugs that need fixing.
13
+ //! They compare extracted output against Pandoc baseline outputs for behavioral parity.
14
+
15
+ use kreuzberg::core::config::ExtractionConfig;
16
+ use kreuzberg::core::extractor::extract_bytes;
17
+ use std::{fs, path::PathBuf};
18
+
19
+ fn typst_doc_root() -> PathBuf {
20
+ PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/typst")
21
+ }
22
+
23
+ /// Load a test document from the test_documents/typst directory
24
+ fn load_test_document(filename: &str) -> Vec<u8> {
25
+ let path = typst_doc_root().join(filename);
26
+ fs::read(&path).unwrap_or_else(|_| panic!("Failed to read test document: {}", filename))
27
+ }
28
+
29
+ /// Load Pandoc baseline output for comparison
30
+ fn load_pandoc_baseline(filename_base: &str) -> String {
31
+ let path = typst_doc_root().join(format!("{filename_base}_pandoc_baseline.txt"));
32
+ fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read baseline: {}", filename_base))
33
+ }
34
+
35
+ /// Load Pandoc metadata JSON for comparison
36
+ fn load_pandoc_metadata(filename_base: &str) -> String {
37
+ let path = typst_doc_root().join(format!("{filename_base}_pandoc_meta.json"));
38
+ fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read metadata: {}", filename_base))
39
+ }
40
+
41
/// Count the lines that are headings of exactly `level` (`= ` is level 1, `== ` is level 2, ...).
///
/// A line counts when, after leading whitespace is removed, it starts with
/// `level` equals signs followed immediately by a space. Deeper headings are
/// not counted because their next character is another `=`, not a space.
fn count_heading_level(content: &str, level: usize) -> usize {
    let marker = "=".repeat(level);
    let mut total = 0;
    for line in content.lines() {
        if let Some(rest) = line.trim_start().strip_prefix(marker.as_str()) {
            if rest.starts_with(' ') {
                total += 1;
            }
        }
    }
    total
}
49
+
50
/// Collect every line whose first non-whitespace character is `=`.
///
/// Lines are returned unmodified (leading whitespace included), in document order.
fn extract_all_headings(content: &str) -> Vec<String> {
    let mut headings = Vec::new();
    for line in content.lines() {
        // A line that starts with '=' cannot also start with "#set",
        // so this single prefix test is the whole condition.
        if line.trim_start().starts_with('=') {
            headings.push(line.to_owned());
        }
    }
    headings
}
61
+
62
/// Count the lines that are Typst directives rather than content.
///
/// A line counts when, after surrounding whitespace is trimmed, it starts
/// with `#set `, `#let ` or `#import ` (each including the trailing space).
fn count_directive_lines(content: &str) -> usize {
    const DIRECTIVE_PREFIXES: [&str; 3] = ["#set ", "#let ", "#import "];

    content
        .lines()
        .map(str::trim)
        .filter(|line| DIRECTIVE_PREFIXES.iter().any(|prefix| line.starts_with(*prefix)))
        .count()
}
72
+
73
/// Count empty headings: lines made up solely of a heading marker (one to six
/// `=` characters) with no heading text.
///
/// Whitespace on BOTH sides is ignored, so `"= "` and `"== \t"` count as
/// empty headings. Trimming only the leading side would leave the trailing
/// space in place and miss `"= "`, which is the typical shape of an empty
/// heading in extractor output.
fn count_empty_headings(content: &str) -> usize {
    content
        .lines()
        .filter(|line| {
            let marker = line.trim();
            // Only heading levels 1 through 6 are treated as markers.
            (1..=6).contains(&marker.len()) && marker.bytes().all(|b| b == b'=')
        })
        .count()
}
88
+
89
/// Split `content` into the text blocks that follow each heading.
///
/// A heading is any line whose first non-whitespace character is `=`.
/// Text before the first heading is ignored, blank lines are skipped, and
/// each returned block is the trimmed, newline-joined run of non-blank lines
/// between two headings (or between the last heading and the end of input).
/// Headings with no text after them produce no block.
fn extract_content_blocks(content: &str) -> Vec<String> {
    /// Move the accumulated text, if any, into `blocks` and reset the buffer.
    fn flush(pending: &mut String, blocks: &mut Vec<String>) {
        if !pending.is_empty() {
            blocks.push(pending.trim().to_string());
            pending.clear();
        }
    }

    let mut blocks = Vec::new();
    let mut pending = String::new();
    let mut seen_heading = false;

    for line in content.lines() {
        let body = line.trim_start();
        if body.starts_with('=') {
            // A new heading closes whatever block was being collected.
            flush(&mut pending, &mut blocks);
            seen_heading = true;
            continue;
        }
        if seen_heading && !body.is_empty() {
            pending.push_str(line);
            pending.push('\n');
        }
    }

    flush(&mut pending, &mut blocks);
    blocks
}
115
+
116
/// Report whether `extracted` is within `tolerance_percent` of `baseline` by byte length.
///
/// The length ratio `extracted / baseline` must fall inside
/// `[1 - tolerance, 1 + tolerance]`, where `tolerance = tolerance_percent / 100`.
/// An empty baseline matches only an empty extraction.
fn content_parity_check(extracted: &str, baseline: &str, tolerance_percent: f64) -> bool {
    if baseline.is_empty() {
        return extracted.is_empty();
    }

    let ratio = (extracted.len() as f64) / (baseline.len() as f64);
    let lower = 1.0 - (tolerance_percent / 100.0);
    let upper = 1.0 + (tolerance_percent / 100.0);

    (lower..=upper).contains(&ratio)
}
131
+
132
+ // CRITICAL BUG TESTS - These expose the 45+ issues
133
+
134
+ /// TEST 1: CRITICAL - 62% heading loss bug
135
+ ///
136
+ /// The extractor only matches single `=` headings, completely skipping
137
+ /// `==`, `===`, and higher levels. This causes catastrophic data loss
138
+ /// in hierarchical documents.
139
+ ///
140
+ /// Expected: All heading levels should be extracted
141
+ /// Current behavior: Only level 1 headings extracted
142
+ /// WILL FAIL: Exposing the heading loss bug
143
+ #[tokio::test]
144
+ async fn test_typst_all_heading_levels_not_lost() {
145
+ let content = load_test_document("headings.typ");
146
+ let _baseline = load_pandoc_baseline("headings");
147
+ let config = ExtractionConfig::default();
148
+
149
+ let result = extract_bytes(&content, "application/x-typst", &config)
150
+ .await
151
+ .expect("Extraction failed");
152
+
153
+ let extracted_all_headings = extract_all_headings(&result.content);
154
+
155
+ assert!(
156
+ extracted_all_headings.len() >= 6,
157
+ "CRITICAL BUG: Only extracted {} headings, should have extracted 6+ heading levels. \
158
+ This is the 62% heading loss bug - extractor only matches '=' but skips '==', '===', etc.",
159
+ extracted_all_headings.len()
160
+ );
161
+
162
+ for level in 1..=6 {
163
+ let count = count_heading_level(&result.content, level);
164
+ assert_eq!(
165
+ count, 1,
166
+ "Heading level {} should appear exactly once (found {}). \
167
+ Missing heading levels cause data loss in hierarchical documents.",
168
+ level, count
169
+ );
170
+ }
171
+ }
172
+
173
+ /// TEST 2: Display math not extracted
174
+ ///
175
+ /// Display math ($$...$$) is completely lost from extraction,
176
+ /// breaking mathematical content preservation.
177
+ ///
178
+ /// Expected: Display math should be preserved in output
179
+ /// Current behavior: Silently dropped
180
+ /// WILL FAIL: Exposing display math loss
181
+ #[tokio::test]
182
+ async fn test_typst_display_math_preserved() {
183
+ let content = load_test_document("advanced.typ");
184
+ let baseline = load_pandoc_baseline("advanced");
185
+ let config = ExtractionConfig::default();
186
+
187
+ let result = extract_bytes(&content, "application/x-typst", &config)
188
+ .await
189
+ .expect("Extraction failed");
190
+
191
+ let has_display_math_in_baseline =
192
+ baseline.contains("²") || baseline.contains("Display math") || baseline.contains("x^2");
193
+
194
+ if has_display_math_in_baseline {
195
+ let our_has_math = result.content.contains("$")
196
+ || result.content.contains("Display")
197
+ || result.content.contains("²")
198
+ || result.content.contains("²");
199
+
200
+ assert!(
201
+ our_has_math,
202
+ "Display math should be extracted. Pandoc preserves mathematical notation, \
203
+ but extractor drops it entirely. This breaks scientific/academic documents."
204
+ );
205
+ }
206
+
207
+ let has_pythagorean = result.content.contains("^2")
208
+ || result.content.contains("²")
209
+ || result.content.contains("x") && result.content.contains("y") && result.content.contains("r");
210
+
211
+ assert!(
212
+ has_pythagorean,
213
+ "Pythagorean theorem expression should be present. Display math is being dropped."
214
+ );
215
+ }
216
+
217
+ /// TEST 3: Empty headings output
218
+ ///
219
+ /// When heading text is missing or malformed, extractor outputs
220
+ /// just the marker like "= " with no text, polluting the output.
221
+ ///
222
+ /// Expected: Either full heading text or no heading at all
223
+ /// Current behavior: "= " with no content
224
+ /// WILL FAIL: Exposing empty heading bug
225
+ #[tokio::test]
226
+ async fn test_typst_no_empty_headings_output() {
227
+ let content = load_test_document("headings.typ");
228
+ let config = ExtractionConfig::default();
229
+
230
+ let result = extract_bytes(&content, "application/x-typst", &config)
231
+ .await
232
+ .expect("Extraction failed");
233
+
234
+ let empty_headings = count_empty_headings(&result.content);
235
+
236
+ assert_eq!(
237
+ empty_headings, 0,
238
+ "Found {} empty heading lines (just '=' with no text). \
239
+ Extractor outputs malformed headings like '= ' with no text, \
240
+ corrupting the document structure.",
241
+ empty_headings
242
+ );
243
+
244
+ for heading in extract_all_headings(&result.content) {
245
+ let trimmed = heading.trim_start();
246
+ let after_marker = trimmed.trim_start_matches('=').trim();
247
+ assert!(
248
+ !after_marker.is_empty(),
249
+ "Heading '{}' has no text after marker. Should not output empty headings.",
250
+ trimmed
251
+ );
252
+ }
253
+ }
254
+
255
+ /// TEST 4: Metadata extraction fails with regex silently
256
+ ///
257
+ /// When regex patterns fail to match metadata fields,
258
+ /// the extractor silently returns None instead of logging/failing,
259
+ /// causing complete metadata loss for certain formats.
260
+ ///
261
+ /// Expected: All metadata fields should be extracted
262
+ /// Current behavior: Some formats fail silently
263
+ /// WILL FAIL: Exposing metadata loss
264
+ #[tokio::test]
265
+ async fn test_typst_metadata_extraction_completeness() {
266
+ let content = load_test_document("metadata.typ");
267
+ let _baseline_meta = load_pandoc_metadata("metadata");
268
+ let config = ExtractionConfig::default();
269
+
270
+ let result = extract_bytes(&content, "application/x-typst", &config)
271
+ .await
272
+ .expect("Extraction failed");
273
+
274
+ let has_title = result
275
+ .metadata
276
+ .additional
277
+ .get("title")
278
+ .map(|t| t.to_string().len() > 0)
279
+ .unwrap_or(false);
280
+
281
+ let has_author = result
282
+ .metadata
283
+ .additional
284
+ .get("author")
285
+ .map(|a| a.to_string().len() > 0)
286
+ .unwrap_or(false);
287
+
288
+ let has_keywords = result
289
+ .metadata
290
+ .additional
291
+ .get("keywords")
292
+ .map(|k| k.to_string().len() > 0)
293
+ .unwrap_or(false);
294
+
295
+ assert!(
296
+ has_title,
297
+ "Title metadata should be extracted. Regex pattern matching fails silently \
298
+ and metadata is lost with no error reporting."
299
+ );
300
+
301
+ assert!(
302
+ has_author,
303
+ "Author metadata should be extracted. Some metadata formats fail silently."
304
+ );
305
+
306
+ assert!(
307
+ has_keywords,
308
+ "Keywords should be extracted. Regex failures cause silent data loss."
309
+ );
310
+ }
311
+
312
+ /// TEST 5: Nested table brackets cause corruption
313
+ ///
314
+ /// Tables with nested brackets like [Name [full]] corrupt the
315
+ /// table content extraction because bracket counting is naive.
316
+ ///
317
+ /// Expected: Table cells should be extracted correctly even with nesting
318
+ /// Current behavior: Bracket nesting causes cells to be malformed
319
+ /// WILL FAIL: Exposing table corruption bug
320
+ #[tokio::test]
321
+ async fn test_typst_tables_with_nested_brackets_not_corrupted() {
322
+ let content = load_test_document("advanced.typ");
323
+ let baseline = load_pandoc_baseline("advanced");
324
+ let config = ExtractionConfig::default();
325
+
326
+ let result = extract_bytes(&content, "application/x-typst", &config)
327
+ .await
328
+ .expect("Extraction failed");
329
+
330
+ let has_table_in_baseline = baseline.contains("Name") && baseline.contains("Alice");
331
+
332
+ if has_table_in_baseline {
333
+ let table_content_extracted =
334
+ result.content.contains("Name") && result.content.contains("Alice") && result.content.contains("Age");
335
+
336
+ assert!(
337
+ table_content_extracted,
338
+ "Table content should be extracted correctly. Nested brackets cause corruption \
339
+ and table cells are malformed."
340
+ );
341
+
342
+ let corrupted_brackets = result.content.matches("[[").count();
343
+ assert_eq!(
344
+ corrupted_brackets, 0,
345
+ "Found corrupted bracket sequences [[. Table extraction with nested brackets \
346
+ produces malformed output."
347
+ );
348
+ }
349
+ }
350
+
351
+ /// TEST 6: Content volume parity - within tolerance of Pandoc
352
+ ///
353
+ /// Our extractor should extract roughly the same amount of content
354
+ /// as Pandoc (baseline). Large discrepancies indicate data loss or
355
+ /// noise injection.
356
+ ///
357
+ /// Expected: Within reasonable tolerance of baseline content size
358
+ /// Current behavior: Significant data loss on complex documents (e.g., advanced.typ)
359
+ /// WILL FAIL: Exposing data loss on complex documents with formatting
360
+ #[tokio::test]
361
+ async fn test_typst_content_volume_parity_with_pandoc() {
362
+ let documents = vec![("simple", 30.0), ("headings", 20.0)];
363
+
364
+ for (doc_name, tolerance) in documents {
365
+ let content = load_test_document(&format!("{}.typ", doc_name));
366
+ let baseline = load_pandoc_baseline(doc_name);
367
+ let config = ExtractionConfig::default();
368
+
369
+ let result = extract_bytes(&content, "application/x-typst", &config)
370
+ .await
371
+ .unwrap_or_else(|_| panic!("Extraction failed for {}", doc_name));
372
+
373
+ let baseline_size = baseline.len();
374
+ let extracted_size = result.content.len();
375
+
376
+ let is_within_tolerance = content_parity_check(&result.content, &baseline, tolerance);
377
+
378
+ assert!(
379
+ is_within_tolerance,
380
+ "Content volume parity failed for {}: \
381
+ Baseline: {} bytes, Extracted: {} bytes ({}% tolerance allowed). \
382
+ Data loss indicates missing extraction features or formatting issues.",
383
+ doc_name, baseline_size, extracted_size, tolerance
384
+ );
385
+ }
386
+ }
387
+
388
+ /// TEST 7: Blockquotes not implemented
389
+ ///
390
+ /// Blockquotes (using > syntax in other formats, typst uses #quote)
391
+ /// are completely unimplemented, causing loss of semantic structure.
392
+ ///
393
+ /// Expected: Blockquote content should be extracted
394
+ /// Current behavior: Feature not implemented
395
+ /// WILL FAIL: Exposing missing blockquote support
396
+ #[tokio::test]
397
+ async fn test_typst_blockquote_handling() {
398
+ let test_content = b"#quote[
399
+ This is a blockquote.
400
+ It should be extracted.
401
+ ]";
402
+
403
+ let config = ExtractionConfig::default();
404
+ let result = extract_bytes(test_content, "application/x-typst", &config)
405
+ .await
406
+ .expect("Extraction failed");
407
+
408
+ let has_blockquote_content =
409
+ result.content.contains("blockquote") || result.content.contains("This is a blockquote");
410
+
411
+ assert!(
412
+ has_blockquote_content,
413
+ "Blockquote content should be extracted. Blockquotes are not implemented \
414
+ in the extractor, causing complete loss of quoted content."
415
+ );
416
+ }
417
+
418
+ /// TEST 8: Inline code preservation
419
+ ///
420
+ /// Test that inline code blocks are properly extracted and marked.
421
+ /// This ensures code snippets aren't corrupted.
422
+ ///
423
+ /// Expected: Inline code preserved with backticks or clearly marked
424
+ /// Current behavior: May be corrupted
425
+ /// WILL FAIL: If inline code is not preserved
426
+ #[tokio::test]
427
+ async fn test_typst_inline_code_preserved() {
428
+ let content = load_test_document("advanced.typ");
429
+ let baseline = load_pandoc_baseline("advanced");
430
+ let config = ExtractionConfig::default();
431
+
432
+ let result = extract_bytes(&content, "application/x-typst", &config)
433
+ .await
434
+ .expect("Extraction failed");
435
+
436
+ let has_inline_code =
437
+ result.content.contains("`") || (result.content.contains("code") && baseline.contains("`code`"));
438
+
439
+ assert!(
440
+ has_inline_code,
441
+ "Inline code should be preserved with backticks or clearly marked."
442
+ );
443
+ }
444
+
445
+ /// TEST 9: Inline math extraction
446
+ ///
447
+ /// Inline math (single $ delimiters) should be extracted and preserved.
448
+ ///
449
+ /// Expected: Inline math formulas preserved
450
+ /// Current behavior: May be dropped
451
+ /// WILL FAIL: If inline math is lost
452
+ #[tokio::test]
453
+ async fn test_typst_inline_math_preserved() {
454
+ let content = load_test_document("advanced.typ");
455
+ let baseline = load_pandoc_baseline("advanced");
456
+ let config = ExtractionConfig::default();
457
+
458
+ let result = extract_bytes(&content, "application/x-typst", &config)
459
+ .await
460
+ .expect("Extraction failed");
461
+
462
+ let has_inline_math =
463
+ result.content.contains("$") || result.content.contains("sqrt") || result.content.contains("equation");
464
+
465
+ if baseline.contains("$") || baseline.contains("equation") {
466
+ assert!(
467
+ has_inline_math,
468
+ "Inline math should be extracted. Mathematical formulas are being dropped."
469
+ );
470
+ }
471
+ }
472
+
473
+ /// TEST 10: Figures and captions
474
+ ///
475
+ /// Figure extraction with captions should preserve both image references
476
+ /// and caption text.
477
+ ///
478
+ /// Expected: Figure content and captions extracted
479
+ /// Current behavior: May be unimplemented
480
+ #[tokio::test]
481
+ async fn test_typst_figures_and_captions() {
482
+ let test_content = b"#figure(
483
+ image(\"example.png\"),
484
+ caption: [This is a figure caption]
485
+ )";
486
+
487
+ let config = ExtractionConfig::default();
488
+ let result = extract_bytes(test_content, "application/x-typst", &config)
489
+ .await
490
+ .expect("Extraction failed");
491
+
492
+ let _has_caption = result.content.contains("caption") || result.content.contains("figure");
493
+
494
+ println!(
495
+ "Figure extraction result (feature may be unimplemented): {:?}",
496
+ result.content
497
+ );
498
+ }
499
+
500
+ /// TEST 11: Citation/reference handling
501
+ ///
502
+ /// Citations and references should be extracted when present.
503
+ ///
504
+ /// Expected: Citation markers and text preserved
505
+ /// Current behavior: May be dropped
506
+ #[tokio::test]
507
+ async fn test_typst_citations_preserved() {
508
+ let test_content = b"Here is a citation @smith2020.
509
+
510
+ = References
511
+
512
+ #bibliography()";
513
+
514
+ let config = ExtractionConfig::default();
515
+ let result = extract_bytes(test_content, "application/x-typst", &config)
516
+ .await
517
+ .expect("Extraction failed");
518
+
519
+ let _has_citation = result.content.contains("@smith2020")
520
+ || result.content.contains("smith")
521
+ || result.content.contains("References");
522
+
523
+ println!("Citation handling (may be limited): {:?}", result.content);
524
+ }
525
+
526
+ /// TEST 12: Link extraction and formatting
527
+ ///
528
+ /// Links should be extracted with both URL and link text.
529
+ ///
530
+ /// Expected: Links in markdown format [text](url)
531
+ /// Current behavior: May lose URL or text
532
+ #[tokio::test]
533
+ async fn test_typst_link_extraction() {
534
+ let content = load_test_document("advanced.typ");
535
+ let _baseline = load_pandoc_baseline("advanced");
536
+ let config = ExtractionConfig::default();
537
+
538
+ let result = extract_bytes(&content, "application/x-typst", &config)
539
+ .await
540
+ .expect("Extraction failed");
541
+
542
+ let has_link_content =
543
+ result.content.contains("example") || result.content.contains("link") || result.content.contains("https");
544
+
545
+ assert!(
546
+ has_link_content,
547
+ "Link content should be extracted. Links may be completely dropped."
548
+ );
549
+ }
550
+
551
+ /// TEST 13: Unordered list extraction
552
+ ///
553
+ /// Both + and - list markers should be converted to standard format.
554
+ ///
555
+ /// Expected: All list items extracted and normalized
556
+ /// Current behavior: May lose some items
557
+ #[tokio::test]
558
+ async fn test_typst_list_extraction() {
559
+ let content = load_test_document("simple.typ");
560
+ let _baseline = load_pandoc_baseline("simple");
561
+ let config = ExtractionConfig::default();
562
+
563
+ let result = extract_bytes(&content, "application/x-typst", &config)
564
+ .await
565
+ .expect("Extraction failed");
566
+
567
+ let has_list_markers = result.content.contains("-") || result.content.contains("+");
568
+ let has_list_content =
569
+ result.content.contains("First") || result.content.contains("Second") || result.content.contains("item");
570
+
571
+ assert!(
572
+ has_list_markers || has_list_content,
573
+ "List items should be extracted with markers or content preserved."
574
+ );
575
+ }
576
+
577
+ /// TEST 14: Code block extraction
578
+ ///
579
+ /// Triple-backtick code blocks should be fully extracted with language specifiers.
580
+ ///
581
+ /// Expected: Code blocks with language markers preserved
582
+ /// Current behavior: May be malformed
583
+ #[tokio::test]
584
+ async fn test_typst_code_block_extraction() {
585
+ let content = load_test_document("advanced.typ");
586
+ let _baseline = load_pandoc_baseline("advanced");
587
+ let config = ExtractionConfig::default();
588
+
589
+ let result = extract_bytes(&content, "application/x-typst", &config)
590
+ .await
591
+ .expect("Extraction failed");
592
+
593
+ let has_code = result.content.contains("```")
594
+ || result.content.contains("def")
595
+ || result.content.contains("fibonacci")
596
+ || result.content.contains("python");
597
+
598
+ assert!(has_code, "Code blocks should be extracted with language specifiers.");
599
+ }
600
+
601
+ /// TEST 15: Bold and italic formatting
602
+ ///
603
+ /// Inline emphasis formatting should be preserved or normalized.
604
+ ///
605
+ /// Expected: Bold (*text*) and italic (_text_) markers present
606
+ /// Current behavior: May be lost
607
+ #[tokio::test]
608
+ async fn test_typst_emphasis_formatting() {
609
+ let content = load_test_document("advanced.typ");
610
+ let config = ExtractionConfig::default();
611
+
612
+ let result = extract_bytes(&content, "application/x-typst", &config)
613
+ .await
614
+ .expect("Extraction failed");
615
+
616
+ let has_emphasis = result.content.contains("*") && result.content.contains("_");
617
+
618
+ assert!(has_emphasis, "Bold and italic formatting markers should be preserved.");
619
+ }
620
+
621
+ /// TEST 16: Complex nested formatting
622
+ ///
623
+ /// Test handling of *_nested formatting_* combinations.
624
+ ///
625
+ /// Expected: Nested formatting preserved or flattened consistently
626
+ /// Current behavior: May be malformed
627
+ #[tokio::test]
628
+ async fn test_typst_nested_formatting() {
629
+ let test_content = b"This is *bold with _nested italic_* text.";
630
+
631
+ let config = ExtractionConfig::default();
632
+ let result = extract_bytes(test_content, "application/x-typst", &config)
633
+ .await
634
+ .expect("Extraction failed");
635
+
636
+ let has_formatting = result.content.contains("*")
637
+ || result.content.contains("_")
638
+ || (result.content.contains("bold") && result.content.contains("italic"));
639
+
640
+ assert!(
641
+ has_formatting,
642
+ "Nested formatting should be preserved or flattened consistently."
643
+ );
644
+ }
645
+
646
+ /// TEST 17: Multiple paragraph handling
647
+ ///
648
+ /// Multiple paragraphs separated by blank lines should be preserved.
649
+ ///
650
+ /// Expected: Paragraph structure maintained
651
+ /// Current behavior: May merge or lose paragraphs
652
+ #[tokio::test]
653
+ async fn test_typst_multiple_paragraphs() {
654
+ let content = load_test_document("advanced.typ");
655
+ let _baseline = load_pandoc_baseline("advanced");
656
+ let config = ExtractionConfig::default();
657
+
658
+ let result = extract_bytes(&content, "application/x-typst", &config)
659
+ .await
660
+ .expect("Extraction failed");
661
+
662
+ let non_empty_lines: Vec<_> = result.content.lines().filter(|l| !l.trim().is_empty()).collect();
663
+
664
+ assert!(
665
+ non_empty_lines.len() >= 5,
666
+ "Multiple paragraphs should be preserved. Found {} content lines.",
667
+ non_empty_lines.len()
668
+ );
669
+ }
670
+
671
+ /// TEST 18: Heading-content association
672
+ ///
673
+ /// Content should follow its heading logically in the output.
674
+ ///
675
+ /// Expected: Each heading followed by its content
676
+ /// Current behavior: May be scrambled
677
+ #[tokio::test]
678
+ async fn test_typst_heading_content_association() {
679
+ let content = load_test_document("advanced.typ");
680
+ let config = ExtractionConfig::default();
681
+
682
+ let result = extract_bytes(&content, "application/x-typst", &config)
683
+ .await
684
+ .expect("Extraction failed");
685
+
686
+ let blocks = extract_content_blocks(&result.content);
687
+
688
+ assert!(blocks.len() > 0, "Content blocks should be associated with headings.");
689
+
690
+ for block in &blocks {
691
+ assert!(block.len() > 0, "Content blocks should not be empty.");
692
+ }
693
+ }
694
+
695
+ /// TEST 19: Whitespace normalization
696
+ ///
697
+ /// Multiple blank lines should be normalized consistently.
698
+ ///
699
+ /// Expected: Single blank lines between sections
700
+ /// Current behavior: May have excessive whitespace
701
+ #[tokio::test]
702
+ async fn test_typst_whitespace_handling() {
703
+ let content = load_test_document("advanced.typ");
704
+ let config = ExtractionConfig::default();
705
+
706
+ let result = extract_bytes(&content, "application/x-typst", &config)
707
+ .await
708
+ .expect("Extraction failed");
709
+
710
+ let blank_line_runs: Vec<_> = result.content.split("\n\n\n").collect();
711
+
712
+ assert!(
713
+ blank_line_runs.len() <= 2,
714
+ "Should not have excessive blank lines (triple newlines). \
715
+ Found {} instances of triple newlines.",
716
+ blank_line_runs.len() - 1
717
+ );
718
+ }
719
+
720
+ /// TEST 20: Minimal document handling
721
+ ///
722
+ /// Even minimal documents should extract correctly.
723
+ ///
724
+ /// Expected: Basic content and structure
725
+ /// Current behavior: May fail or lose content
726
+ #[tokio::test]
727
+ async fn test_typst_minimal_document() {
728
+ let content = load_test_document("minimal.typ");
729
+ let _baseline = load_pandoc_baseline("minimal");
730
+ let config = ExtractionConfig::default();
731
+
732
+ let result = extract_bytes(&content, "application/x-typst", &config)
733
+ .await
734
+ .expect("Extraction failed");
735
+
736
+ assert!(
737
+ !result.content.is_empty(),
738
+ "Even minimal documents should extract some content."
739
+ );
740
+
741
+ assert!(
742
+ result.content.len() > 0,
743
+ "Minimal document should produce non-empty output."
744
+ );
745
+ }
746
+
747
+ /// TEST 21: No directive pollution
748
+ ///
749
+ /// Extracted content should not contain #set, #let, #import directives.
750
+ ///
751
+ /// Expected: Clean extracted content without directives
752
+ /// Current behavior: May include directives
753
+ #[tokio::test]
754
+ async fn test_typst_no_directive_pollution() {
755
+ let content = load_test_document("advanced.typ");
756
+ let config = ExtractionConfig::default();
757
+
758
+ let result = extract_bytes(&content, "application/x-typst", &config)
759
+ .await
760
+ .expect("Extraction failed");
761
+
762
+ let directive_count = count_directive_lines(&result.content);
763
+
764
+ assert_eq!(
765
+ directive_count, 0,
766
+ "Extracted content should not contain directives (#set, #let, etc). \
767
+ Found {} directive lines polluting the output.",
768
+ directive_count
769
+ );
770
+ }
771
+
772
+ /// TEST 22: Metadata field completeness
773
+ ///
774
+ /// All metadata fields from baseline should be present.
775
+ ///
776
+ /// Expected: Title, author, date, keywords all extracted
777
+ /// Current behavior: Some fields missing
778
+ #[tokio::test]
779
+ async fn test_typst_metadata_field_completeness() {
780
+ let content = load_test_document("advanced.typ");
781
+ let config = ExtractionConfig::default();
782
+
783
+ let result = extract_bytes(&content, "application/x-typst", &config)
784
+ .await
785
+ .expect("Extraction failed");
786
+
787
+ let has_title = result.metadata.additional.get("title").is_some();
788
+ let has_author = result.metadata.additional.get("author").is_some();
789
+ let has_date = result.metadata.date.is_some();
790
+
791
+ assert!(
792
+ has_title && has_author && has_date,
793
+ "All metadata fields should be extracted. \
794
+ Title: {}, Author: {}, Date: {}",
795
+ has_title,
796
+ has_author,
797
+ has_date
798
+ );
799
+ }
800
+
801
+ /// TEST 23: Special character handling
802
+ ///
803
+ /// Unicode and special characters should be preserved.
804
+ ///
805
+ /// Expected: Special characters like ü, é, etc. preserved
806
+ /// Current behavior: May be corrupted
807
+ #[tokio::test]
808
+ async fn test_typst_special_character_preservation() {
809
+ let test_content = "Café with naïve français".as_bytes();
810
+
811
+ let config = ExtractionConfig::default();
812
+ let result = extract_bytes(test_content, "application/x-typst", &config)
813
+ .await
814
+ .expect("Extraction failed");
815
+
816
+ let has_special_chars =
817
+ result.content.contains("Café") || result.content.contains("naïve") || result.content.contains("français");
818
+
819
+ assert!(
820
+ has_special_chars,
821
+ "Special characters should be preserved in extraction."
822
+ );
823
+ }
824
+
825
+ /// TEST 24: Very long heading handling
826
+ ///
827
+ /// Long headings should not cause truncation or corruption.
828
+ ///
829
+ /// Expected: Full heading text preserved regardless of length
830
+ /// Current behavior: May truncate
831
+ #[tokio::test]
832
+ async fn test_typst_long_heading_handling() {
833
+ let test_content = b"= This is a very long heading that should be completely preserved without any truncation or corruption whatsoever";
834
+
835
+ let config = ExtractionConfig::default();
836
+ let result = extract_bytes(test_content, "application/x-typst", &config)
837
+ .await
838
+ .expect("Extraction failed");
839
+
840
+ let has_heading_start = result.content.contains("very long heading");
841
+
842
+ assert!(has_heading_start, "Long headings should not be truncated.");
843
+ }
844
+
845
+ /// TEST 25: Edge case - Empty heading recovery
846
+ ///
847
+ /// Even if a heading has no text, extraction should be robust.
848
+ ///
849
+ /// Expected: Graceful handling without crashes
850
+ /// Current behavior: May panic or produce empty output
851
+ #[tokio::test]
852
+ async fn test_typst_empty_heading_edge_case() {
853
+ let test_content = b"= \n\n== \nContent here";
854
+
855
+ let config = ExtractionConfig::default();
856
+ let result = extract_bytes(test_content, "application/x-typst", &config).await;
857
+
858
+ match result {
859
+ Ok(extraction) => {
860
+ assert!(
861
+ extraction.content.contains("Content"),
862
+ "Should extract regular content even if some headings are empty."
863
+ );
864
+ }
865
+ Err(_) => {}
866
+ }
867
+ }
868
+
869
+ /// TEST 26: Regression - Basic heading extraction
870
+ #[tokio::test]
871
+ async fn test_typst_basic_heading_regression() {
872
+ let test_content = b"= Main Heading\n\nContent here";
873
+
874
+ let config = ExtractionConfig::default();
875
+ let result = extract_bytes(test_content, "application/x-typst", &config)
876
+ .await
877
+ .expect("Extraction failed");
878
+
879
+ assert!(
880
+ result.content.contains("= Main Heading"),
881
+ "Basic level-1 heading should be extracted."
882
+ );
883
+
884
+ assert!(result.content.contains("Content"), "Content should be extracted.");
885
+ }
886
+
887
+ /// TEST 27: Regression - Level 2 heading extraction
888
+ #[tokio::test]
889
+ async fn test_typst_level2_heading_regression() {
890
+ let test_content = b"= Main\n\n== Subsection\n\nMore content";
891
+
892
+ let config = ExtractionConfig::default();
893
+ let result = extract_bytes(test_content, "application/x-typst", &config)
894
+ .await
895
+ .expect("Extraction failed");
896
+
897
+ assert!(
898
+ result.content.contains("== Subsection"),
899
+ "Level 2 headings must be extracted."
900
+ );
901
+ }
902
+
903
+ /// TEST 28: Regression - Basic metadata
904
+ #[tokio::test]
905
+ async fn test_typst_basic_metadata_regression() {
906
+ let test_content = b"#set document(title: \"Test\", author: \"Me\")\n\n= Heading";
907
+
908
+ let config = ExtractionConfig::default();
909
+ let result = extract_bytes(test_content, "application/x-typst", &config)
910
+ .await
911
+ .expect("Extraction failed");
912
+
913
+ assert!(
914
+ result.metadata.additional.get("title").is_some(),
915
+ "Title metadata must be extracted."
916
+ );
917
+
918
+ assert!(
919
+ result.metadata.additional.get("author").is_some(),
920
+ "Author metadata must be extracted."
921
+ );
922
+ }
923
+
924
+ /// TEST 29: Regression - Bold formatting
925
+ #[tokio::test]
926
+ async fn test_typst_bold_regression() {
927
+ let test_content = b"This is *bold text* here";
928
+
929
+ let config = ExtractionConfig::default();
930
+ let result = extract_bytes(test_content, "application/x-typst", &config)
931
+ .await
932
+ .expect("Extraction failed");
933
+
934
+ assert!(
935
+ result.content.contains("*bold*") || result.content.contains("bold"),
936
+ "Bold text should be preserved."
937
+ );
938
+ }
939
+
940
+ /// TEST 30: Regression - Inline code
941
+ #[tokio::test]
942
+ async fn test_typst_inline_code_regression() {
943
+ let test_content = b"Use `println!(\"hello\")` in Rust";
944
+
945
+ let config = ExtractionConfig::default();
946
+ let result = extract_bytes(test_content, "application/x-typst", &config)
947
+ .await
948
+ .expect("Extraction failed");
949
+
950
+ assert!(
951
+ result.content.contains("`") && result.content.contains("println"),
952
+ "Inline code should be preserved with backticks."
953
+ );
954
+ }
955
+
956
+ /// TEST 31: Regression - Code blocks
957
+ #[tokio::test]
958
+ async fn test_typst_codeblock_regression() {
959
+ let test_content = b"```rust\nfn main() {}\n```";
960
+
961
+ let config = ExtractionConfig::default();
962
+ let result = extract_bytes(test_content, "application/x-typst", &config)
963
+ .await
964
+ .expect("Extraction failed");
965
+
966
+ assert!(
967
+ result.content.contains("```"),
968
+ "Code block delimiters should be preserved."
969
+ );
970
+
971
+ assert!(
972
+ result.content.contains("fn main"),
973
+ "Code block content should be preserved."
974
+ );
975
+ }
976
+
977
+ /// TEST 32: Regression - List extraction
978
+ #[tokio::test]
979
+ async fn test_typst_list_regression() {
980
+ let test_content = b"- Item 1\n+ Item 2\n- Item 3";
981
+
982
+ let config = ExtractionConfig::default();
983
+ let result = extract_bytes(test_content, "application/x-typst", &config)
984
+ .await
985
+ .expect("Extraction failed");
986
+
987
+ assert!(
988
+ result.content.contains("Item 1") && result.content.contains("Item 2") && result.content.contains("Item 3"),
989
+ "All list items should be extracted."
990
+ );
991
+ }
992
+
993
+ /// TEST 33: Regression - Math preservation
994
+ #[tokio::test]
995
+ async fn test_typst_math_regression() {
996
+ let test_content = b"Formula: $E = mc^2$";
997
+
998
+ let config = ExtractionConfig::default();
999
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1000
+ .await
1001
+ .expect("Extraction failed");
1002
+
1003
+ assert!(
1004
+ result.content.contains("$") && (result.content.contains("mc") || result.content.contains("E")),
1005
+ "Math formulas should be preserved."
1006
+ );
1007
+ }
1008
+
1009
+ /// TEST 34: Regression - Link extraction
1010
+ #[tokio::test]
1011
+ async fn test_typst_link_regression() {
1012
+ let test_content = b"Visit #link(\"https://example.com\")[example]";
1013
+
1014
+ let config = ExtractionConfig::default();
1015
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1016
+ .await
1017
+ .expect("Extraction failed");
1018
+
1019
+ assert!(
1020
+ result.content.contains("example") || result.content.contains("example.com"),
1021
+ "Link text or URL should be preserved."
1022
+ );
1023
+ }
1024
+
1025
+ /// TEST 35: Regression - Table basic extraction
1026
+ #[tokio::test]
1027
+ async fn test_typst_table_regression() {
1028
+ let test_content = b"#table(columns: 2, [A], [B], [1], [2])";
1029
+
1030
+ let config = ExtractionConfig::default();
1031
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1032
+ .await
1033
+ .expect("Extraction failed");
1034
+
1035
+ assert!(
1036
+ result.content.contains("A") || result.content.contains("TABLE"),
1037
+ "Table content should be extracted."
1038
+ );
1039
+ }
1040
+
1041
+ /// TEST 36: Large document handling
1042
+ #[tokio::test]
1043
+ async fn test_typst_large_document_stress() {
1044
+ let mut large_content = String::new();
1045
+
1046
+ for i in 1..=50 {
1047
+ large_content.push_str(&format!("= Heading {}\n\n", i));
1048
+ large_content.push_str(&format!("Content for section {}.\n\n", i));
1049
+ }
1050
+
1051
+ let config = ExtractionConfig::default();
1052
+ let result = extract_bytes(large_content.as_bytes(), "application/x-typst", &config)
1053
+ .await
1054
+ .expect("Extraction failed");
1055
+
1056
+ let heading_count = extract_all_headings(&result.content).len();
1057
+ assert!(
1058
+ heading_count >= 40,
1059
+ "Large documents should extract all headings. Found {} of 50.",
1060
+ heading_count
1061
+ );
1062
+ }
1063
+
1064
+ /// TEST 37: Deep nesting stress test
1065
+ #[tokio::test]
1066
+ async fn test_typst_deep_nesting_stress() {
1067
+ let mut nested = String::new();
1068
+
1069
+ for level in 1..=6 {
1070
+ nested.push_str(&format!("{} Level {} Heading\n\n", "=".repeat(level), level));
1071
+ nested.push_str(&format!("Content at level {}.\n\n", level));
1072
+ }
1073
+
1074
+ let config = ExtractionConfig::default();
1075
+ let result = extract_bytes(nested.as_bytes(), "application/x-typst", &config)
1076
+ .await
1077
+ .expect("Extraction failed");
1078
+
1079
+ for level in 1..=6 {
1080
+ let count = count_heading_level(&result.content, level);
1081
+ assert!(
1082
+ count >= 1,
1083
+ "Level {} heading should be extracted in deep nesting test.",
1084
+ level
1085
+ );
1086
+ }
1087
+ }
1088
+
1089
+ /// TEST 38: Mixed formatting stress
1090
+ #[tokio::test]
1091
+ async fn test_typst_mixed_formatting_stress() {
1092
+ let test_content = b"This text has *bold*, _italic_, `code`, and $math$ all mixed together!";
1093
+
1094
+ let config = ExtractionConfig::default();
1095
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1096
+ .await
1097
+ .expect("Extraction failed");
1098
+
1099
+ let has_formatting = (result.content.contains("*") || result.content.contains("bold"))
1100
+ && (result.content.contains("_") || result.content.contains("italic"))
1101
+ && (result.content.contains("`") || result.content.contains("code"))
1102
+ && (result.content.contains("$") || result.content.contains("math"));
1103
+
1104
+ assert!(has_formatting, "All mixed formatting should be preserved.");
1105
+ }
1106
+
1107
+ /// TEST 39: Unicode stress test
1108
+ #[tokio::test]
1109
+ async fn test_typst_unicode_stress() {
1110
+ let test_content = "= Unicode Heading 中文 العربية\n\nContent with emojis: 🎉🚀💯\n\nGreek: α β γ δ ε ζ".as_bytes();
1111
+
1112
+ let config = ExtractionConfig::default();
1113
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1114
+ .await
1115
+ .expect("Extraction failed");
1116
+
1117
+ assert!(
1118
+ result.content.contains("Unicode"),
1119
+ "Unicode content should be preserved."
1120
+ );
1121
+ }
1122
+
1123
+ /// TEST 40: Pathological whitespace
1124
+ #[tokio::test]
1125
+ async fn test_typst_pathological_whitespace() {
1126
+ let test_content = b"= Heading\n\n\n\n\n\nContent with excessive blank lines\n\n\n\n\nMore content";
1127
+
1128
+ let config = ExtractionConfig::default();
1129
+ let result = extract_bytes(test_content, "application/x-typst", &config)
1130
+ .await
1131
+ .expect("Extraction failed");
1132
+
1133
+ assert!(
1134
+ result.content.contains("Heading") && result.content.contains("Content"),
1135
+ "Should extract content even with excessive whitespace."
1136
+ );
1137
+ }
1138
+
1139
+ /// TEST 41: Full document comparison - simple.typ
1140
+ #[tokio::test]
1141
+ async fn test_typst_full_simple_document_comparison() {
1142
+ let content = load_test_document("simple.typ");
1143
+ let _baseline = load_pandoc_baseline("simple");
1144
+ let config = ExtractionConfig::default();
1145
+
1146
+ let result = extract_bytes(&content, "application/x-typst", &config)
1147
+ .await
1148
+ .expect("Extraction failed");
1149
+
1150
+ assert!(
1151
+ result.content.len() > 50,
1152
+ "simple.typ should extract substantial content"
1153
+ );
1154
+
1155
+ let heading_count = extract_all_headings(&result.content).len();
1156
+ assert!(heading_count > 2, "simple.typ should have multiple sections");
1157
+ }
1158
+
1159
+ /// TEST 42: Full document comparison - advanced.typ
1160
+ #[tokio::test]
1161
+ async fn test_typst_full_advanced_document_comparison() {
1162
+ let content = load_test_document("advanced.typ");
1163
+ let _baseline = load_pandoc_baseline("advanced");
1164
+ let config = ExtractionConfig::default();
1165
+
1166
+ let result = extract_bytes(&content, "application/x-typst", &config)
1167
+ .await
1168
+ .expect("Extraction failed");
1169
+
1170
+ assert!(
1171
+ result.content.len() > 100,
1172
+ "advanced.typ should extract comprehensive content"
1173
+ );
1174
+
1175
+ let heading_count = extract_all_headings(&result.content).len();
1176
+ assert!(heading_count >= 5, "advanced.typ should preserve heading structure");
1177
+ }
1178
+
1179
+ /// TEST 43: MIME type consistency
1180
+ ///
1181
+ /// The extractor should support both standard MIME types for Typst.
1182
+ /// Currently only supports application/x-typst, not text/x-typst.
1183
+ #[tokio::test]
1184
+ async fn test_typst_mime_type_consistency() {
1185
+ let content = load_test_document("simple.typ");
1186
+ let config = ExtractionConfig::default();
1187
+
1188
+ let result_primary = extract_bytes(&content, "application/x-typst", &config)
1189
+ .await
1190
+ .expect("Primary MIME type should work");
1191
+
1192
+ assert!(
1193
+ result_primary.content.len() > 0,
1194
+ "Primary MIME type should extract content"
1195
+ );
1196
+
1197
+ match extract_bytes(&content, "text/x-typst", &config).await {
1198
+ Ok(result) => {
1199
+ assert!(
1200
+ result.content.len() > 0,
1201
+ "Alternative MIME type should extract content if supported"
1202
+ );
1203
+ }
1204
+ Err(_e) => {
1205
+ println!("Note: text/x-typst is not currently supported (may be added in future)");
1206
+ }
1207
+ }
1208
+ }
1209
+
1210
+ /// TEST 44: Config parameter impact
1211
+ #[tokio::test]
1212
+ async fn test_typst_config_parameter_handling() {
1213
+ let content = load_test_document("simple.typ");
1214
+ let config = ExtractionConfig::default();
1215
+
1216
+ let result = extract_bytes(&content, "application/x-typst", &config)
1217
+ .await
1218
+ .expect("Extraction failed");
1219
+
1220
+ assert!(!result.content.is_empty(), "Extraction with default config should work");
1221
+
1222
+ assert_eq!(result.mime_type, "application/x-typst", "MIME type should be preserved");
1223
+ }
1224
+
1225
+ /// TEST 45: Comparative heading analysis
1226
+ ///
1227
+ /// This final comprehensive test checks heading extraction
1228
+ /// against the baseline to identify the exact scope of the heading loss bug.
1229
+ #[tokio::test]
1230
+ async fn test_typst_heading_loss_bug_analysis() {
1231
+ let content = load_test_document("headings.typ");
1232
+ let baseline = load_pandoc_baseline("headings");
1233
+ let config = ExtractionConfig::default();
1234
+
1235
+ let result = extract_bytes(&content, "application/x-typst", &config)
1236
+ .await
1237
+ .expect("Extraction failed");
1238
+
1239
+ println!("\n===== HEADING EXTRACTION ANALYSIS =====");
1240
+ println!("Baseline content:");
1241
+ println!("{}", baseline);
1242
+ println!("\nExtracted content:");
1243
+ println!("{}", result.content);
1244
+
1245
+ let extracted_headings = extract_all_headings(&result.content);
1246
+ println!("\nExtracted headings: {}", extracted_headings.len());
1247
+ for (i, h) in extracted_headings.iter().enumerate() {
1248
+ println!(" {}: {}", i + 1, h);
1249
+ }
1250
+
1251
+ assert!(
1252
+ extracted_headings.len() >= 6,
1253
+ "BUG CONFIRMED: Heading loss detected. \
1254
+ Expected 6 headings (1-6 levels), found {}. \
1255
+ This is the 62% heading loss bug - only single '=' is matched, \
1256
+ all '==' and higher are skipped entirely.",
1257
+ extracted_headings.len()
1258
+ );
1259
+ }