kreuzberg 4.0.0.rc2 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +391 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,647 +1,648 @@
1
- //! Comprehensive TDD test suite for Typst document extraction.
2
- //!
3
- //! This test suite validates Typst document extraction against expected outputs.
4
- //! The tests verify:
5
- //! - Document metadata extraction (title, author, date, keywords)
6
- //! - Heading hierarchy parsing (=, ==, ===, etc.)
7
- //! - Inline formatting (bold, italic, code)
8
- //! - Table extraction and parsing
9
- //! - List handling (ordered and unordered)
10
- //! - Link extraction
11
- //! - Mathematical notation preservation
12
- //!
13
- //! Each test document is extracted and validated for correct content extraction.
14
-
15
- use kreuzberg::core::config::ExtractionConfig;
16
- use kreuzberg::core::extractor::extract_bytes;
17
- use std::{fs, path::PathBuf};
18
-
19
- fn typst_fixture(name: &str) -> PathBuf {
20
- PathBuf::from(env!("CARGO_MANIFEST_DIR"))
21
- .join("../../test_documents/typst")
22
- .join(name)
23
- }
24
-
25
- /// Test simple.typ - Basic Typst document with fundamental formatting
26
- ///
27
- /// Document contains:
28
- /// - Document metadata: title, author, date
29
- /// - Level 1 heading: "Introduction"
30
- /// - Level 2 headings: "Subsection", "Features", "Lists", "Code", "Tables", "Links", "Conclusion"
31
- /// - Inline formatting: *bold*, _italic_, `inline code`
32
- /// - Unordered list with 3 items
33
- /// - Code snippet
34
- /// - 2x2 table with headers
35
- /// - Link to Typst website
36
- ///
37
- /// Expected: Document should extract text, preserve headings, metadata, and formatting markers
38
- #[tokio::test]
39
- async fn test_simple_typst_document_extraction() {
40
- let config = ExtractionConfig::default();
41
-
42
- let doc_path = typst_fixture("simple.typ");
43
- let content = match fs::read(doc_path) {
44
- Ok(c) => c,
45
- Err(e) => {
46
- eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
47
- return;
48
- }
49
- };
50
-
51
- let result = extract_bytes(&content, "text/x-typst", &config).await;
52
- if result.is_err() {
53
- println!("Skipping test: Typst extractor may not be available");
54
- return;
55
- }
56
-
57
- let extraction = result.unwrap();
58
-
59
- assert_eq!(extraction.mime_type, "text/x-typst", "MIME type should be preserved");
60
-
61
- assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
62
-
63
- assert!(
64
- extraction.metadata.additional.contains_key("title"),
65
- "Document title should be extracted from #set document()"
66
- );
67
-
68
- assert!(
69
- extraction.metadata.additional.contains_key("author"),
70
- "Document author should be extracted"
71
- );
72
-
73
- assert!(
74
- extraction.content.contains("Introduction"),
75
- "Should extract 'Introduction' heading"
76
- );
77
- assert!(
78
- extraction.content.contains("Features"),
79
- "Should extract 'Features' heading"
80
- );
81
- assert!(
82
- extraction.content.contains("Conclusion"),
83
- "Should extract 'Conclusion' heading"
84
- );
85
-
86
- let intro_count = extraction.content.matches("= Introduction").count();
87
- let subsection_count = extraction.content.matches("== Subsection").count();
88
- let features_count = extraction.content.matches("= Features").count();
89
- let lists_count = extraction.content.matches("== Lists").count();
90
- let code_count = extraction.content.matches("== Code").count();
91
- let tables_count = extraction.content.matches("== Tables").count();
92
- let links_count = extraction.content.matches("== Links").count();
93
- let conclusion_count = extraction.content.matches("= Conclusion").count();
94
-
95
- assert_eq!(intro_count, 1, "Should extract 'Introduction' (level 1)");
96
- assert_eq!(subsection_count, 1, "Should extract 'Subsection' (level 2)");
97
- assert_eq!(features_count, 1, "Should extract 'Features' (level 1)");
98
- assert_eq!(lists_count, 1, "Should extract 'Lists' (level 2)");
99
- assert_eq!(code_count, 1, "Should extract 'Code' (level 2)");
100
- assert_eq!(tables_count, 1, "Should extract 'Tables' (level 2)");
101
- assert_eq!(links_count, 1, "Should extract 'Links' (level 2)");
102
- assert_eq!(conclusion_count, 1, "Should extract 'Conclusion' (level 1)");
103
-
104
- assert!(
105
- extraction.content.contains("*") || extraction.content.contains("bold"),
106
- "Should preserve bold formatting or text"
107
- );
108
-
109
- assert!(
110
- extraction.content.contains("-") || extraction.content.contains("First") || extraction.content.contains("item"),
111
- "Should extract list content"
112
- );
113
-
114
- println!(
115
- "✓ simple.typ: Successfully extracted {} characters with all 8 headings",
116
- extraction.content.len()
117
- );
118
- }
119
-
120
- /// Test minimal.typ - Minimal Typst document
121
- ///
122
- /// Document contains:
123
- /// - Single level 1 heading: "Hello World"
124
- /// - Simple text content
125
- ///
126
- /// Expected: Basic heading and content extraction
127
- #[tokio::test]
128
- async fn test_minimal_typst_document_extraction() {
129
- let config = ExtractionConfig::default();
130
-
131
- let doc_path = typst_fixture("minimal.typ");
132
- let content = match fs::read(doc_path) {
133
- Ok(c) => c,
134
- Err(e) => {
135
- eprintln!("Warning: Could not read minimal.typ: {}. Skipping test.", e);
136
- return;
137
- }
138
- };
139
-
140
- let result = extract_bytes(&content, "application/x-typst", &config).await;
141
- if result.is_err() {
142
- println!("Skipping test: Typst extractor may not be available");
143
- return;
144
- }
145
-
146
- let extraction = result.unwrap();
147
-
148
- assert!(
149
- !extraction.content.is_empty(),
150
- "Minimal document should extract content"
151
- );
152
-
153
- assert!(
154
- extraction.content.contains("Hello") || extraction.content.contains("World"),
155
- "Should extract heading content"
156
- );
157
-
158
- println!(
159
- "✓ minimal.typ: Successfully extracted {} characters",
160
- extraction.content.len()
161
- );
162
- }
163
-
164
- /// Test headings.typ - Document focusing on heading hierarchy
165
- ///
166
- /// Document contains:
167
- /// - 6 heading levels (=, ==, ===, ====, =====, ======)
168
- /// - Content under each heading level
169
- ///
170
- /// Expected: Heading structure should be preserved with level information
171
- #[tokio::test]
172
- async fn test_heading_hierarchy_extraction() {
173
- let config = ExtractionConfig::default();
174
-
175
- let doc_path = typst_fixture("headings.typ");
176
- let content = match fs::read(doc_path) {
177
- Ok(c) => c,
178
- Err(e) => {
179
- eprintln!("Warning: Could not read headings.typ: {}. Skipping test.", e);
180
- return;
181
- }
182
- };
183
-
184
- let result = extract_bytes(&content, "text/x-typst", &config).await;
185
- if result.is_err() {
186
- println!("Skipping test: Typst extractor may not be available");
187
- return;
188
- }
189
-
190
- let extraction = result.unwrap();
191
-
192
- assert!(!extraction.content.is_empty(), "Document should extract content");
193
-
194
- assert!(
195
- extraction.content.contains("= Level 1") || extraction.content.contains("Level 1 Heading"),
196
- "Should extract level 1 heading"
197
- );
198
-
199
- assert!(
200
- extraction.content.contains("== Level 2") || extraction.content.contains("Level 2 Heading"),
201
- "Should extract level 2 heading"
202
- );
203
-
204
- assert!(
205
- extraction.content.contains("=== Level 3") || extraction.content.contains("Level 3 Heading"),
206
- "Should extract level 3 heading"
207
- );
208
-
209
- assert!(
210
- extraction.content.contains("==== Level 4") || extraction.content.contains("Level 4 Heading"),
211
- "Should extract level 4 heading"
212
- );
213
-
214
- assert!(
215
- extraction.content.contains("===== Level 5") || extraction.content.contains("Level 5 Heading"),
216
- "Should extract level 5 heading"
217
- );
218
-
219
- assert!(
220
- extraction.content.contains("====== Level 6") || extraction.content.contains("Level 6 Heading"),
221
- "Should extract level 6 heading"
222
- );
223
-
224
- let level_1_count = extraction.content.matches("= Level 1").count();
225
- let level_2_count = extraction.content.matches("== Level 2").count();
226
- let level_3_count = extraction.content.matches("=== Level 3").count();
227
- let level_4_count = extraction.content.matches("==== Level 4").count();
228
- let level_5_count = extraction.content.matches("===== Level 5").count();
229
- let level_6_count = extraction.content.matches("====== Level 6").count();
230
-
231
- assert_eq!(level_1_count, 1, "Should extract exactly one level 1 heading");
232
- assert_eq!(level_2_count, 1, "Should extract exactly one level 2 heading");
233
- assert_eq!(level_3_count, 1, "Should extract exactly one level 3 heading");
234
- assert_eq!(level_4_count, 1, "Should extract exactly one level 4 heading");
235
- assert_eq!(level_5_count, 1, "Should extract exactly one level 5 heading");
236
- assert_eq!(level_6_count, 1, "Should extract exactly one level 6 heading");
237
-
238
- println!(
239
- "✓ headings.typ: Successfully extracted {} characters with heading structure",
240
- extraction.content.len()
241
- );
242
- }
243
-
244
- /// Test metadata.typ - Document with comprehensive metadata
245
- ///
246
- /// Document contains:
247
- /// - #set document() with: title, author, subject, keywords
248
- /// - Content sections
249
- ///
250
- /// Expected: All metadata fields should be extracted correctly
251
- #[tokio::test]
252
- async fn test_metadata_extraction() {
253
- let config = ExtractionConfig::default();
254
-
255
- let doc_path = typst_fixture("metadata.typ");
256
- let content = match fs::read(doc_path) {
257
- Ok(c) => c,
258
- Err(e) => {
259
- eprintln!("Warning: Could not read metadata.typ: {}. Skipping test.", e);
260
- return;
261
- }
262
- };
263
-
264
- let result = extract_bytes(&content, "application/x-typst", &config).await;
265
- if result.is_err() {
266
- println!("Skipping test: Typst extractor may not be available");
267
- return;
268
- }
269
-
270
- let extraction = result.unwrap();
271
-
272
- if let Some(title) = extraction.metadata.additional.get("title") {
273
- assert!(
274
- title.to_string().contains("Metadata") || title.to_string().contains("Example"),
275
- "Title should contain expected text"
276
- );
277
- }
278
-
279
- if let Some(author) = extraction.metadata.additional.get("author") {
280
- assert!(
281
- author.to_string().contains("John") || author.to_string().contains("Doe"),
282
- "Author should contain expected text"
283
- );
284
- }
285
-
286
- if let Some(keywords) = extraction.metadata.additional.get("keywords") {
287
- assert!(!keywords.to_string().is_empty(), "Keywords should be present");
288
- }
289
-
290
- assert!(!extraction.content.is_empty(), "Document should extract content");
291
-
292
- println!(
293
- "✓ metadata.typ: Successfully extracted metadata and {} characters of content",
294
- extraction.content.len()
295
- );
296
- }
297
-
298
- /// Test advanced.typ - Complex Typst document with multiple features
299
- ///
300
- /// Document contains:
301
- /// - Metadata: title, author, keywords, date
302
- /// - Heading numbering configuration
303
- /// - Mathematical notation (inline and display)
304
- /// - Nested heading levels (level 1, 2, 3, 4)
305
- /// - Code blocks (Python example)
306
- /// - Complex tables with 3 columns and 4 rows
307
- /// - Multiple paragraph sections
308
- /// - Links with text
309
- /// - Multiple formatting combinations
310
- ///
311
- /// Expected: Comprehensive extraction of all document elements
312
- #[tokio::test]
313
- async fn test_advanced_typst_document_extraction() {
314
- let config = ExtractionConfig::default();
315
-
316
- let doc_path = typst_fixture("advanced.typ");
317
- let content = match fs::read(doc_path) {
318
- Ok(c) => c,
319
- Err(e) => {
320
- eprintln!("Warning: Could not read advanced.typ: {}. Skipping test.", e);
321
- return;
322
- }
323
- };
324
-
325
- let result = extract_bytes(&content, "text/x-typst", &config).await;
326
- if result.is_err() {
327
- println!("Skipping test: Typst extractor may not be available");
328
- return;
329
- }
330
-
331
- let extraction = result.unwrap();
332
-
333
- assert!(
334
- extraction.metadata.additional.contains_key("title"),
335
- "Title should be extracted"
336
- );
337
-
338
- assert!(
339
- !extraction.content.is_empty(),
340
- "Advanced document should extract content"
341
- );
342
-
343
- assert!(
344
- extraction.content.contains("$")
345
- || extraction.content.contains("equation")
346
- || extraction.content.contains("math"),
347
- "Should extract or preserve mathematical notation"
348
- );
349
-
350
- assert!(
351
- extraction.content.contains("Mathematical")
352
- || extraction.content.contains("Formatting")
353
- || extraction.content.contains("Features"),
354
- "Should extract section headings"
355
- );
356
-
357
- assert!(
358
- extraction.content.contains("python")
359
- || extraction.content.contains("def")
360
- || extraction.content.contains("fibonacci")
361
- || extraction.content.contains("```"),
362
- "Should extract code block content"
363
- );
364
-
365
- let level_count = extraction.content.matches("=").count();
366
- assert!(level_count >= 3, "Should preserve nested heading hierarchy");
367
-
368
- assert!(
369
- extraction.content.contains("Name")
370
- || extraction.content.contains("Alice")
371
- || extraction.content.contains("Table"),
372
- "Should extract table content"
373
- );
374
-
375
- assert!(
376
- extraction.content.contains("example")
377
- || extraction.content.contains("link")
378
- || extraction.content.contains("http"),
379
- "Should extract link content"
380
- );
381
-
382
- println!(
383
- "✓ advanced.typ: Successfully extracted {} characters with complex formatting",
384
- extraction.content.len()
385
- );
386
- }
387
-
388
- /// Test typst-reader.typ - Pandoc test file
389
- ///
390
- /// Document from Pandoc test suite demonstrating Typst reader functionality
391
- ///
392
- /// Expected: Proper extraction of Typst-specific syntax
393
- #[tokio::test]
394
- async fn test_typst_reader_extraction() {
395
- let config = ExtractionConfig::default();
396
-
397
- let doc_path = typst_fixture("typst-reader.typ");
398
- let content = match fs::read(doc_path) {
399
- Ok(c) => c,
400
- Err(e) => {
401
- eprintln!("Warning: Could not read typst-reader.typ: {}. Skipping test.", e);
402
- return;
403
- }
404
- };
405
-
406
- let result = extract_bytes(&content, "application/x-typst", &config).await;
407
- if result.is_err() {
408
- println!("Skipping test: Typst extractor may not be available");
409
- return;
410
- }
411
-
412
- let extraction = result.unwrap();
413
-
414
- assert!(
415
- !extraction.content.is_empty(),
416
- "Should extract content from Pandoc test file"
417
- );
418
-
419
- assert!(
420
- extraction.content.contains("=") || extraction.content.contains("Fibonacci"),
421
- "Should extract heading or content from test file"
422
- );
423
-
424
- println!(
425
- "✓ typst-reader.typ: Successfully extracted {} characters",
426
- extraction.content.len()
427
- );
428
- }
429
-
430
- /// Test undergradmath.typ - Pandoc test file with complex math
431
- ///
432
- /// Document from Pandoc test suite with extensive mathematical notation
433
- /// and complex formatting
434
- ///
435
- /// Expected: Handling of complex Typst syntax with metadata and content
436
- #[tokio::test]
437
- async fn test_undergradmath_extraction() {
438
- let config = ExtractionConfig::default();
439
-
440
- let doc_path = typst_fixture("undergradmath.typ");
441
- let content = match fs::read(doc_path) {
442
- Ok(c) => c,
443
- Err(e) => {
444
- eprintln!("Warning: Could not read undergradmath.typ: {}. Skipping test.", e);
445
- return;
446
- }
447
- };
448
-
449
- let result = extract_bytes(&content, "text/x-typst", &config).await;
450
- if result.is_err() {
451
- println!("Skipping test: Typst extractor may not be available");
452
- return;
453
- }
454
-
455
- let extraction = result.unwrap();
456
-
457
- assert!(
458
- !extraction.content.is_empty(),
459
- "Should extract content from complex math document"
460
- );
461
-
462
- if let Some(title) = extraction.metadata.additional.get("title") {
463
- assert!(!title.to_string().is_empty(), "Title should be extracted");
464
- }
465
-
466
- assert!(
467
- extraction.content.contains("=") || extraction.content.contains("Typst") || extraction.content.len() > 100,
468
- "Should extract document structure or content"
469
- );
470
-
471
- println!(
472
- "✓ undergradmath.typ: Successfully extracted {} characters from math document",
473
- extraction.content.len()
474
- );
475
- }
476
-
477
- /// Test MIME type detection and fallback
478
- ///
479
- /// Verifies that Typst documents can be extracted with different MIME type specifications
480
- #[tokio::test]
481
- async fn test_typst_mime_type_variants() {
482
- let config = ExtractionConfig::default();
483
-
484
- let doc_path = typst_fixture("simple.typ");
485
- let content = match fs::read(doc_path) {
486
- Ok(c) => c,
487
- Err(e) => {
488
- eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
489
- return;
490
- }
491
- };
492
-
493
- let mime_types = vec!["application/x-typst", "text/x-typst", "text/plain"];
494
-
495
- for mime_type in mime_types {
496
- let result = extract_bytes(&content, mime_type, &config).await;
497
-
498
- if result.is_ok() {
499
- let extraction = result.unwrap();
500
- assert!(
501
- !extraction.content.is_empty(),
502
- "Should extract content with MIME type: {}",
503
- mime_type
504
- );
505
- println!(
506
- "✓ MIME type '{}': Successfully extracted {} characters",
507
- mime_type,
508
- extraction.content.len()
509
- );
510
- }
511
- }
512
- }
513
-
514
- /// Test formatting preservation
515
- ///
516
- /// Validates that inline formatting markers are preserved in extracted content
517
- #[tokio::test]
518
- async fn test_formatting_preservation() {
519
- let config = ExtractionConfig::default();
520
-
521
- let doc_path = typst_fixture("simple.typ");
522
- let content = match fs::read(doc_path) {
523
- Ok(c) => c,
524
- Err(e) => {
525
- eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
526
- return;
527
- }
528
- };
529
-
530
- let result = extract_bytes(&content, "text/x-typst", &config).await;
531
- if result.is_err() {
532
- println!("Skipping test: Typst extractor may not be available");
533
- return;
534
- }
535
-
536
- let extraction = result.unwrap();
537
-
538
- assert!(
539
- extraction.content.contains("*") || extraction.content.contains("bold"),
540
- "Should preserve bold formatting or text"
541
- );
542
-
543
- assert!(
544
- extraction.content.contains("_") || extraction.content.contains("italic"),
545
- "Should preserve italic formatting or text"
546
- );
547
-
548
- assert!(
549
- extraction.content.contains("`") || extraction.content.contains("code"),
550
- "Should preserve code formatting or text"
551
- );
552
-
553
- println!("✓ Formatting preservation: All markers/content found in extracted text");
554
- }
555
-
556
- /// Test large document handling
557
- ///
558
- /// Validates extraction of the large undergradmath document
559
- #[tokio::test]
560
- async fn test_large_document_extraction() {
561
- let config = ExtractionConfig::default();
562
-
563
- let doc_path = typst_fixture("undergradmath.typ");
564
- let content = match fs::read(doc_path) {
565
- Ok(c) => c,
566
- Err(e) => {
567
- eprintln!("Warning: Could not read undergradmath.typ: {}. Skipping test.", e);
568
- return;
569
- }
570
- };
571
-
572
- let result = extract_bytes(&content, "text/x-typst", &config).await;
573
- if result.is_err() {
574
- println!("Skipping test: Typst extractor may not be available");
575
- return;
576
- }
577
-
578
- let extraction = result.unwrap();
579
-
580
- assert!(
581
- !extraction.content.is_empty(),
582
- "Should extract content from large document"
583
- );
584
-
585
- println!(
586
- "✓ Large document: Extracted {} bytes of content from source file",
587
- extraction.content.len()
588
- );
589
- }
590
-
591
- /// Test empty/whitespace handling
592
- ///
593
- /// Validates graceful handling of edge cases
594
- #[tokio::test]
595
- async fn test_empty_content_handling() {
596
- let config = ExtractionConfig::default();
597
-
598
- let empty_content = b"";
599
- let result = extract_bytes(empty_content, "text/x-typst", &config).await;
600
-
601
- match result {
602
- Ok(extraction) => {
603
- println!(
604
- "✓ Empty content: Handled gracefully, extracted {} bytes",
605
- extraction.content.len()
606
- );
607
- }
608
- Err(e) => {
609
- println!("✓ Empty content: Resulted in expected error: {}", e);
610
- }
611
- }
612
- }
613
-
614
- /// Test MIME type priority
615
- ///
616
- /// Validates that Typst extractor has correct priority (50)
617
- #[tokio::test]
618
- async fn test_typst_extractor_priority() {
619
- use kreuzberg::extractors::TypstExtractor;
620
- use kreuzberg::plugins::DocumentExtractor;
621
-
622
- let extractor = TypstExtractor::new();
623
- let priority = extractor.priority();
624
-
625
- assert_eq!(priority, 50, "Typst extractor should have priority 50");
626
- println!("Typst extractor priority: {}", priority);
627
- }
628
-
629
- /// Test supported MIME types
630
- ///
631
- /// Validates that extractor claims to support Typst MIME types
632
- #[tokio::test]
633
- async fn test_supported_mime_types() {
634
- use kreuzberg::extractors::TypstExtractor;
635
- use kreuzberg::plugins::DocumentExtractor;
636
-
637
- let extractor = TypstExtractor::new();
638
- let mime_types = extractor.supported_mime_types();
639
-
640
- assert!(
641
- mime_types.contains(&"application/x-typst"),
642
- "Should support application/x-typst"
643
- );
644
- assert!(mime_types.contains(&"text/x-typst"), "Should support text/x-typst");
645
-
646
- println!("✓ Supported MIME types: {:?}", mime_types);
647
- }
1
+ //! Comprehensive TDD test suite for Typst document extraction.
2
+ //!
3
+ //! This test suite validates Typst document extraction against expected outputs.
4
+ //! The tests verify:
5
+ //! - Document metadata extraction (title, author, date, keywords)
6
+ //! - Heading hierarchy parsing (=, ==, ===, etc.)
7
+ //! - Inline formatting (bold, italic, code)
8
+ //! - Table extraction and parsing
9
+ //! - List handling (ordered and unordered)
10
+ //! - Link extraction
11
+ //! - Mathematical notation preservation
12
+ //!
13
+ //! Each test document is extracted and validated for correct content extraction.
14
+
15
+ #![cfg(feature = "office")]
16
+
17
+ use kreuzberg::core::config::ExtractionConfig;
18
+ use kreuzberg::core::extractor::extract_bytes;
19
+ use std::{fs, path::PathBuf};
20
+
21
+ fn typst_fixture(name: &str) -> PathBuf {
22
+ PathBuf::from(env!("CARGO_MANIFEST_DIR"))
23
+ .join("../../test_documents/typst")
24
+ .join(name)
25
+ }
26
+
27
+ /// Test simple.typ - Basic Typst document with fundamental formatting
28
+ ///
29
+ /// Document contains:
30
+ /// - Document metadata: title, author, date
31
+ /// - Level 1 heading: "Introduction"
32
+ /// - Level 2 headings: "Subsection", "Features", "Lists", "Code", "Tables", "Links", "Conclusion"
33
+ /// - Inline formatting: *bold*, _italic_, `inline code`
34
+ /// - Unordered list with 3 items
35
+ /// - Code snippet
36
+ /// - 2x2 table with headers
37
+ /// - Link to Typst website
38
+ ///
39
+ /// Expected: Document should extract text, preserve headings, metadata, and formatting markers
40
+ #[tokio::test]
41
+ async fn test_simple_typst_document_extraction() {
42
+ let config = ExtractionConfig::default();
43
+
44
+ let doc_path = typst_fixture("simple.typ");
45
+ let content = match fs::read(doc_path) {
46
+ Ok(c) => c,
47
+ Err(e) => {
48
+ eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
49
+ return;
50
+ }
51
+ };
52
+
53
+ let result = extract_bytes(&content, "text/x-typst", &config).await;
54
+ if result.is_err() {
55
+ println!("Skipping test: Typst extractor may not be available");
56
+ return;
57
+ }
58
+
59
+ let extraction = result.unwrap();
60
+
61
+ assert_eq!(extraction.mime_type, "text/x-typst", "MIME type should be preserved");
62
+
63
+ assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
64
+
65
+ assert!(
66
+ extraction.metadata.additional.contains_key("title"),
67
+ "Document title should be extracted from #set document()"
68
+ );
69
+
70
+ assert!(
71
+ extraction.metadata.additional.contains_key("author"),
72
+ "Document author should be extracted"
73
+ );
74
+
75
+ assert!(
76
+ extraction.content.contains("Introduction"),
77
+ "Should extract 'Introduction' heading"
78
+ );
79
+ assert!(
80
+ extraction.content.contains("Features"),
81
+ "Should extract 'Features' heading"
82
+ );
83
+ assert!(
84
+ extraction.content.contains("Conclusion"),
85
+ "Should extract 'Conclusion' heading"
86
+ );
87
+
88
+ let intro_count = extraction.content.matches("= Introduction").count();
89
+ let subsection_count = extraction.content.matches("== Subsection").count();
90
+ let features_count = extraction.content.matches("= Features").count();
91
+ let lists_count = extraction.content.matches("== Lists").count();
92
+ let code_count = extraction.content.matches("== Code").count();
93
+ let tables_count = extraction.content.matches("== Tables").count();
94
+ let links_count = extraction.content.matches("== Links").count();
95
+ let conclusion_count = extraction.content.matches("= Conclusion").count();
96
+
97
+ assert_eq!(intro_count, 1, "Should extract 'Introduction' (level 1)");
98
+ assert_eq!(subsection_count, 1, "Should extract 'Subsection' (level 2)");
99
+ assert_eq!(features_count, 1, "Should extract 'Features' (level 1)");
100
+ assert_eq!(lists_count, 1, "Should extract 'Lists' (level 2)");
101
+ assert_eq!(code_count, 1, "Should extract 'Code' (level 2)");
102
+ assert_eq!(tables_count, 1, "Should extract 'Tables' (level 2)");
103
+ assert_eq!(links_count, 1, "Should extract 'Links' (level 2)");
104
+ assert_eq!(conclusion_count, 1, "Should extract 'Conclusion' (level 1)");
105
+
106
+ assert!(
107
+ extraction.content.contains("*") || extraction.content.contains("bold"),
108
+ "Should preserve bold formatting or text"
109
+ );
110
+
111
+ assert!(
112
+ extraction.content.contains("-") || extraction.content.contains("First") || extraction.content.contains("item"),
113
+ "Should extract list content"
114
+ );
115
+
116
+ println!(
117
+ "✓ simple.typ: Successfully extracted {} characters with all 8 headings",
118
+ extraction.content.len()
119
+ );
120
+ }
121
+
122
+ /// Test minimal.typ - Minimal Typst document
123
+ ///
124
+ /// Document contains:
125
+ /// - Single level 1 heading: "Hello World"
126
+ /// - Simple text content
127
+ ///
128
+ /// Expected: Basic heading and content extraction
129
+ #[tokio::test]
130
+ async fn test_minimal_typst_document_extraction() {
131
+ let config = ExtractionConfig::default();
132
+
133
+ let doc_path = typst_fixture("minimal.typ");
134
+ let content = match fs::read(doc_path) {
135
+ Ok(c) => c,
136
+ Err(e) => {
137
+ eprintln!("Warning: Could not read minimal.typ: {}. Skipping test.", e);
138
+ return;
139
+ }
140
+ };
141
+
142
+ let result = extract_bytes(&content, "application/x-typst", &config).await;
143
+ if result.is_err() {
144
+ println!("Skipping test: Typst extractor may not be available");
145
+ return;
146
+ }
147
+
148
+ let extraction = result.unwrap();
149
+
150
+ assert!(
151
+ !extraction.content.is_empty(),
152
+ "Minimal document should extract content"
153
+ );
154
+
155
+ assert!(
156
+ extraction.content.contains("Hello") || extraction.content.contains("World"),
157
+ "Should extract heading content"
158
+ );
159
+
160
+ println!(
161
+ "✓ minimal.typ: Successfully extracted {} characters",
162
+ extraction.content.len()
163
+ );
164
+ }
165
+
166
+ /// Test headings.typ - Document focusing on heading hierarchy
167
+ ///
168
+ /// Document contains:
169
+ /// - 6 heading levels (=, ==, ===, ====, =====, ======)
170
+ /// - Content under each heading level
171
+ ///
172
+ /// Expected: Heading structure should be preserved with level information
173
+ #[tokio::test]
174
+ async fn test_heading_hierarchy_extraction() {
175
+ let config = ExtractionConfig::default();
176
+
177
+ let doc_path = typst_fixture("headings.typ");
178
+ let content = match fs::read(doc_path) {
179
+ Ok(c) => c,
180
+ Err(e) => {
181
+ eprintln!("Warning: Could not read headings.typ: {}. Skipping test.", e);
182
+ return;
183
+ }
184
+ };
185
+
186
+ let result = extract_bytes(&content, "text/x-typst", &config).await;
187
+ if result.is_err() {
188
+ println!("Skipping test: Typst extractor may not be available");
189
+ return;
190
+ }
191
+
192
+ let extraction = result.unwrap();
193
+
194
+ assert!(!extraction.content.is_empty(), "Document should extract content");
195
+
196
+ assert!(
197
+ extraction.content.contains("= Level 1") || extraction.content.contains("Level 1 Heading"),
198
+ "Should extract level 1 heading"
199
+ );
200
+
201
+ assert!(
202
+ extraction.content.contains("== Level 2") || extraction.content.contains("Level 2 Heading"),
203
+ "Should extract level 2 heading"
204
+ );
205
+
206
+ assert!(
207
+ extraction.content.contains("=== Level 3") || extraction.content.contains("Level 3 Heading"),
208
+ "Should extract level 3 heading"
209
+ );
210
+
211
+ assert!(
212
+ extraction.content.contains("==== Level 4") || extraction.content.contains("Level 4 Heading"),
213
+ "Should extract level 4 heading"
214
+ );
215
+
216
+ assert!(
217
+ extraction.content.contains("===== Level 5") || extraction.content.contains("Level 5 Heading"),
218
+ "Should extract level 5 heading"
219
+ );
220
+
221
+ assert!(
222
+ extraction.content.contains("====== Level 6") || extraction.content.contains("Level 6 Heading"),
223
+ "Should extract level 6 heading"
224
+ );
225
+
226
+ let level_1_count = extraction.content.matches("= Level 1").count();
227
+ let level_2_count = extraction.content.matches("== Level 2").count();
228
+ let level_3_count = extraction.content.matches("=== Level 3").count();
229
+ let level_4_count = extraction.content.matches("==== Level 4").count();
230
+ let level_5_count = extraction.content.matches("===== Level 5").count();
231
+ let level_6_count = extraction.content.matches("====== Level 6").count();
232
+
233
+ assert_eq!(level_1_count, 1, "Should extract exactly one level 1 heading");
234
+ assert_eq!(level_2_count, 1, "Should extract exactly one level 2 heading");
235
+ assert_eq!(level_3_count, 1, "Should extract exactly one level 3 heading");
236
+ assert_eq!(level_4_count, 1, "Should extract exactly one level 4 heading");
237
+ assert_eq!(level_5_count, 1, "Should extract exactly one level 5 heading");
238
+ assert_eq!(level_6_count, 1, "Should extract exactly one level 6 heading");
239
+
240
+ println!(
241
+ "✓ headings.typ: Successfully extracted {} characters with heading structure",
242
+ extraction.content.len()
243
+ );
244
+ }
245
+
246
+ /// Test metadata.typ - Document with comprehensive metadata
247
+ ///
248
+ /// Document contains:
249
+ /// - #set document() with: title, author, subject, keywords
250
+ /// - Content sections
251
+ ///
252
+ /// Expected: All metadata fields should be extracted correctly
253
+ #[tokio::test]
254
+ async fn test_metadata_extraction() {
255
+ let config = ExtractionConfig::default();
256
+
257
+ let doc_path = typst_fixture("metadata.typ");
258
+ let content = match fs::read(doc_path) {
259
+ Ok(c) => c,
260
+ Err(e) => {
261
+ eprintln!("Warning: Could not read metadata.typ: {}. Skipping test.", e);
262
+ return;
263
+ }
264
+ };
265
+
266
+ let result = extract_bytes(&content, "application/x-typst", &config).await;
267
+ if result.is_err() {
268
+ println!("Skipping test: Typst extractor may not be available");
269
+ return;
270
+ }
271
+
272
+ let extraction = result.unwrap();
273
+
274
+ if let Some(title) = extraction.metadata.additional.get("title") {
275
+ assert!(
276
+ title.to_string().contains("Metadata") || title.to_string().contains("Example"),
277
+ "Title should contain expected text"
278
+ );
279
+ }
280
+
281
+ if let Some(author) = extraction.metadata.additional.get("author") {
282
+ assert!(
283
+ author.to_string().contains("John") || author.to_string().contains("Doe"),
284
+ "Author should contain expected text"
285
+ );
286
+ }
287
+
288
+ if let Some(keywords) = extraction.metadata.additional.get("keywords") {
289
+ assert!(!keywords.to_string().is_empty(), "Keywords should be present");
290
+ }
291
+
292
+ assert!(!extraction.content.is_empty(), "Document should extract content");
293
+
294
+ println!(
295
+ "✓ metadata.typ: Successfully extracted metadata and {} characters of content",
296
+ extraction.content.len()
297
+ );
298
+ }
299
+
300
+ /// Test advanced.typ - Complex Typst document with multiple features
301
+ ///
302
+ /// Document contains:
303
+ /// - Metadata: title, author, keywords, date
304
+ /// - Heading numbering configuration
305
+ /// - Mathematical notation (inline and display)
306
+ /// - Nested heading levels (level 1, 2, 3, 4)
307
+ /// - Code blocks (Python example)
308
+ /// - Complex tables with 3 columns and 4 rows
309
+ /// - Multiple paragraph sections
310
+ /// - Links with text
311
+ /// - Multiple formatting combinations
312
+ ///
313
+ /// Expected: Comprehensive extraction of all document elements
314
+ #[tokio::test]
315
+ async fn test_advanced_typst_document_extraction() {
316
+ let config = ExtractionConfig::default();
317
+
318
+ let doc_path = typst_fixture("advanced.typ");
319
+ let content = match fs::read(doc_path) {
320
+ Ok(c) => c,
321
+ Err(e) => {
322
+ eprintln!("Warning: Could not read advanced.typ: {}. Skipping test.", e);
323
+ return;
324
+ }
325
+ };
326
+
327
+ let result = extract_bytes(&content, "text/x-typst", &config).await;
328
+ if result.is_err() {
329
+ println!("Skipping test: Typst extractor may not be available");
330
+ return;
331
+ }
332
+
333
+ let extraction = result.unwrap();
334
+
335
+ assert!(
336
+ extraction.metadata.additional.contains_key("title"),
337
+ "Title should be extracted"
338
+ );
339
+
340
+ assert!(
341
+ !extraction.content.is_empty(),
342
+ "Advanced document should extract content"
343
+ );
344
+
345
+ assert!(
346
+ extraction.content.contains("$")
347
+ || extraction.content.contains("equation")
348
+ || extraction.content.contains("math"),
349
+ "Should extract or preserve mathematical notation"
350
+ );
351
+
352
+ assert!(
353
+ extraction.content.contains("Mathematical")
354
+ || extraction.content.contains("Formatting")
355
+ || extraction.content.contains("Features"),
356
+ "Should extract section headings"
357
+ );
358
+
359
+ assert!(
360
+ extraction.content.contains("python")
361
+ || extraction.content.contains("def")
362
+ || extraction.content.contains("fibonacci")
363
+ || extraction.content.contains("```"),
364
+ "Should extract code block content"
365
+ );
366
+
367
+ let level_count = extraction.content.matches("=").count();
368
+ assert!(level_count >= 3, "Should preserve nested heading hierarchy");
369
+
370
+ assert!(
371
+ extraction.content.contains("Name")
372
+ || extraction.content.contains("Alice")
373
+ || extraction.content.contains("Table"),
374
+ "Should extract table content"
375
+ );
376
+
377
+ assert!(
378
+ extraction.content.contains("example")
379
+ || extraction.content.contains("link")
380
+ || extraction.content.contains("http"),
381
+ "Should extract link content"
382
+ );
383
+
384
+ println!(
385
+ "✓ advanced.typ: Successfully extracted {} characters with complex formatting",
386
+ extraction.content.len()
387
+ );
388
+ }
389
+
390
+ /// Test typst-reader.typ - Pandoc test file
391
+ ///
392
+ /// Document from Pandoc test suite demonstrating Typst reader functionality
393
+ ///
394
+ /// Expected: Proper extraction of Typst-specific syntax
395
+ #[tokio::test]
396
+ async fn test_typst_reader_extraction() {
397
+ let config = ExtractionConfig::default();
398
+
399
+ let doc_path = typst_fixture("typst-reader.typ");
400
+ let content = match fs::read(doc_path) {
401
+ Ok(c) => c,
402
+ Err(e) => {
403
+ eprintln!("Warning: Could not read typst-reader.typ: {}. Skipping test.", e);
404
+ return;
405
+ }
406
+ };
407
+
408
+ let result = extract_bytes(&content, "application/x-typst", &config).await;
409
+ if result.is_err() {
410
+ println!("Skipping test: Typst extractor may not be available");
411
+ return;
412
+ }
413
+
414
+ let extraction = result.unwrap();
415
+
416
+ assert!(
417
+ !extraction.content.is_empty(),
418
+ "Should extract content from Pandoc test file"
419
+ );
420
+
421
+ assert!(
422
+ extraction.content.contains("=") || extraction.content.contains("Fibonacci"),
423
+ "Should extract heading or content from test file"
424
+ );
425
+
426
+ println!(
427
+ "✓ typst-reader.typ: Successfully extracted {} characters",
428
+ extraction.content.len()
429
+ );
430
+ }
431
+
432
+ /// Test undergradmath.typ - Pandoc test file with complex math
433
+ ///
434
+ /// Document from Pandoc test suite with extensive mathematical notation
435
+ /// and complex formatting
436
+ ///
437
+ /// Expected: Handling of complex Typst syntax with metadata and content
438
+ #[tokio::test]
439
+ async fn test_undergradmath_extraction() {
440
+ let config = ExtractionConfig::default();
441
+
442
+ let doc_path = typst_fixture("undergradmath.typ");
443
+ let content = match fs::read(doc_path) {
444
+ Ok(c) => c,
445
+ Err(e) => {
446
+ eprintln!("Warning: Could not read undergradmath.typ: {}. Skipping test.", e);
447
+ return;
448
+ }
449
+ };
450
+
451
+ let result = extract_bytes(&content, "text/x-typst", &config).await;
452
+ if result.is_err() {
453
+ println!("Skipping test: Typst extractor may not be available");
454
+ return;
455
+ }
456
+
457
+ let extraction = result.unwrap();
458
+
459
+ assert!(
460
+ !extraction.content.is_empty(),
461
+ "Should extract content from complex math document"
462
+ );
463
+
464
+ if let Some(title) = extraction.metadata.additional.get("title") {
465
+ assert!(!title.to_string().is_empty(), "Title should be extracted");
466
+ }
467
+
468
+ assert!(
469
+ extraction.content.contains("=") || extraction.content.contains("Typst") || extraction.content.len() > 100,
470
+ "Should extract document structure or content"
471
+ );
472
+
473
+ println!(
474
+ "✓ undergradmath.typ: Successfully extracted {} characters from math document",
475
+ extraction.content.len()
476
+ );
477
+ }
478
+
479
+ /// Test MIME type detection and fallback
480
+ ///
481
+ /// Verifies that Typst documents can be extracted with different MIME type specifications
482
+ #[tokio::test]
483
+ async fn test_typst_mime_type_variants() {
484
+ let config = ExtractionConfig::default();
485
+
486
+ let doc_path = typst_fixture("simple.typ");
487
+ let content = match fs::read(doc_path) {
488
+ Ok(c) => c,
489
+ Err(e) => {
490
+ eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
491
+ return;
492
+ }
493
+ };
494
+
495
+ let mime_types = vec!["application/x-typst", "text/x-typst", "text/plain"];
496
+
497
+ for mime_type in mime_types {
498
+ let result = extract_bytes(&content, mime_type, &config).await;
499
+
500
+ if let Ok(extraction) = result {
501
+ assert!(
502
+ !extraction.content.is_empty(),
503
+ "Should extract content with MIME type: {}",
504
+ mime_type
505
+ );
506
+ println!(
507
+ "✓ MIME type '{}': Successfully extracted {} characters",
508
+ mime_type,
509
+ extraction.content.len()
510
+ );
511
+ }
512
+ }
513
+ }
514
+
515
+ /// Test formatting preservation
516
+ ///
517
+ /// Validates that inline formatting markers are preserved in extracted content
518
+ #[tokio::test]
519
+ async fn test_formatting_preservation() {
520
+ let config = ExtractionConfig::default();
521
+
522
+ let doc_path = typst_fixture("simple.typ");
523
+ let content = match fs::read(doc_path) {
524
+ Ok(c) => c,
525
+ Err(e) => {
526
+ eprintln!("Warning: Could not read simple.typ: {}. Skipping test.", e);
527
+ return;
528
+ }
529
+ };
530
+
531
+ let result = extract_bytes(&content, "text/x-typst", &config).await;
532
+ if result.is_err() {
533
+ println!("Skipping test: Typst extractor may not be available");
534
+ return;
535
+ }
536
+
537
+ let extraction = result.unwrap();
538
+
539
+ assert!(
540
+ extraction.content.contains("*") || extraction.content.contains("bold"),
541
+ "Should preserve bold formatting or text"
542
+ );
543
+
544
+ assert!(
545
+ extraction.content.contains("_") || extraction.content.contains("italic"),
546
+ "Should preserve italic formatting or text"
547
+ );
548
+
549
+ assert!(
550
+ extraction.content.contains("`") || extraction.content.contains("code"),
551
+ "Should preserve code formatting or text"
552
+ );
553
+
554
+ println!("✓ Formatting preservation: All markers/content found in extracted text");
555
+ }
556
+
557
+ /// Test large document handling
558
+ ///
559
+ /// Validates extraction of the large undergradmath document
560
+ #[tokio::test]
561
+ async fn test_large_document_extraction() {
562
+ let config = ExtractionConfig::default();
563
+
564
+ let doc_path = typst_fixture("undergradmath.typ");
565
+ let content = match fs::read(doc_path) {
566
+ Ok(c) => c,
567
+ Err(e) => {
568
+ eprintln!("Warning: Could not read undergradmath.typ: {}. Skipping test.", e);
569
+ return;
570
+ }
571
+ };
572
+
573
+ let result = extract_bytes(&content, "text/x-typst", &config).await;
574
+ if result.is_err() {
575
+ println!("Skipping test: Typst extractor may not be available");
576
+ return;
577
+ }
578
+
579
+ let extraction = result.unwrap();
580
+
581
+ assert!(
582
+ !extraction.content.is_empty(),
583
+ "Should extract content from large document"
584
+ );
585
+
586
+ println!(
587
+ "✓ Large document: Extracted {} bytes of content from source file",
588
+ extraction.content.len()
589
+ );
590
+ }
591
+
592
+ /// Test empty/whitespace handling
593
+ ///
594
+ /// Validates graceful handling of edge cases
595
+ #[tokio::test]
596
+ async fn test_empty_content_handling() {
597
+ let config = ExtractionConfig::default();
598
+
599
+ let empty_content = b"";
600
+ let result = extract_bytes(empty_content, "text/x-typst", &config).await;
601
+
602
+ match result {
603
+ Ok(extraction) => {
604
+ println!(
605
+ "✓ Empty content: Handled gracefully, extracted {} bytes",
606
+ extraction.content.len()
607
+ );
608
+ }
609
+ Err(e) => {
610
+ println!("✓ Empty content: Resulted in expected error: {}", e);
611
+ }
612
+ }
613
+ }
614
+
615
+ /// Test MIME type priority
616
+ ///
617
+ /// Validates that Typst extractor has correct priority (50)
618
+ #[tokio::test]
619
+ async fn test_typst_extractor_priority() {
620
+ use kreuzberg::extractors::TypstExtractor;
621
+ use kreuzberg::plugins::DocumentExtractor;
622
+
623
+ let extractor = TypstExtractor::new();
624
+ let priority = extractor.priority();
625
+
626
+ assert_eq!(priority, 50, "Typst extractor should have priority 50");
627
+ println!("✓ Typst extractor priority: {}", priority);
628
+ }
629
+
630
+ /// Test supported MIME types
631
+ ///
632
+ /// Validates that extractor claims to support Typst MIME types
633
+ #[tokio::test]
634
+ async fn test_supported_mime_types() {
635
+ use kreuzberg::extractors::TypstExtractor;
636
+ use kreuzberg::plugins::DocumentExtractor;
637
+
638
+ let extractor = TypstExtractor::new();
639
+ let mime_types = extractor.supported_mime_types();
640
+
641
+ assert!(
642
+ mime_types.contains(&"application/x-typst"),
643
+ "Should support application/x-typst"
644
+ );
645
+ assert!(mime_types.contains(&"text/x-typst"), "Should support text/x-typst");
646
+
647
+ println!("✓ Supported MIME types: {:?}", mime_types);
648
+ }