kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,692 @@
1
+ //! Comprehensive TDD test suite for RST (reStructuredText) extraction
2
+ //!
3
+ //! Tests RST extraction using Pandoc as the baseline for quality validation.
4
+ //! The test documents are derived from the Pandoc test suite and provide
5
+ //! comprehensive coverage of RST-specific features including:
6
+ //! - Metadata extraction from field lists (:Author:, :Date:, etc.)
7
+ //! - Directive handling (.. code-block::, .. image::, .. math::, etc.)
8
+ //! - Section structure and heading levels
9
+ //! - Table extraction (simple and grid tables)
10
+ //! - Reference links and images
11
+ //! - Comments and special blocks
12
+ //! - Content quality validation
13
+
14
+ use kreuzberg::core::config::ExtractionConfig;
15
+ use kreuzberg::core::extractor::extract_bytes;
16
+
17
+ mod helpers;
18
+
19
+ const RST_FIXTURE: &str = include_str!("../../../test_documents/rst/rst-reader.rst");
20
+
21
+ fn rst_fixture_bytes() -> Vec<u8> {
22
+ RST_FIXTURE.as_bytes().to_vec()
23
+ }
24
+
25
+ /// Test extraction of document title from RST file structure
26
+ #[tokio::test]
27
+ async fn test_rst_title_extraction() {
28
+ let content = rst_fixture_bytes();
29
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
30
+ .await
31
+ .expect("Should extract RST successfully");
32
+
33
+ assert!(
34
+ result.content.to_lowercase().contains("pandoc test suite"),
35
+ "Should contain document title 'Pandoc Test Suite'"
36
+ );
37
+
38
+ assert!(
39
+ result.content.contains("Level one header") || result.content.contains("header"),
40
+ "Should contain document headers"
41
+ );
42
+
43
+ println!("✅ RST title extraction test passed!");
44
+ }
45
+
46
+ /// Test field list metadata extraction (:Authors:, :Date:, :Revision:)
47
+ #[tokio::test]
48
+ async fn test_rst_field_list_metadata_extraction() {
49
+ let content = rst_fixture_bytes();
50
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
51
+ .await
52
+ .expect("Should extract RST successfully");
53
+
54
+ println!(
55
+ "Content excerpt (first 500 chars): {}",
56
+ &result.content[..std::cmp::min(500, result.content.len())]
57
+ );
58
+
59
+ assert!(
60
+ result.content.contains("John MacFarlane")
61
+ || result.content.contains("July 17")
62
+ || result.content.contains("Pandoc Test Suite"),
63
+ "Should contain metadata information or title"
64
+ );
65
+
66
+ println!("✅ RST field list metadata extraction test passed!");
67
+ }
68
+
69
+ /// Test extraction of multiple heading levels
70
+ #[tokio::test]
71
+ async fn test_rst_section_hierarchy() {
72
+ let content = rst_fixture_bytes();
73
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
74
+ .await
75
+ .expect("Should extract RST successfully");
76
+
77
+ let headings = vec![
78
+ "Level one header",
79
+ "Level two header",
80
+ "Level three",
81
+ "Paragraphs",
82
+ "Block Quotes",
83
+ "Code Blocks",
84
+ "Lists",
85
+ "Field Lists",
86
+ "HTML Blocks",
87
+ "LaTeX Block",
88
+ "Images",
89
+ "Tables",
90
+ ];
91
+
92
+ for heading in headings {
93
+ assert!(
94
+ result.content.contains(heading),
95
+ "Should contain heading: '{}'",
96
+ heading
97
+ );
98
+ }
99
+
100
+ println!("✅ RST section hierarchy test passed!");
101
+ }
102
+
103
+ /// Test that emphasis in headings is preserved
104
+ #[tokio::test]
105
+ async fn test_rst_heading_with_inline_markup() {
106
+ let content = rst_fixture_bytes();
107
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
108
+ .await
109
+ .expect("Should extract RST successfully");
110
+
111
+ assert!(
112
+ result.content.contains("emphasis") || result.content.contains("Level four"),
113
+ "Should contain heading with emphasis"
114
+ );
115
+
116
+ println!("✅ RST heading with inline markup test passed!");
117
+ }
118
+
119
+ /// Test code block extraction with language specification
120
+ #[tokio::test]
121
+ async fn test_rst_code_block_extraction() {
122
+ let content = rst_fixture_bytes();
123
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
124
+ .await
125
+ .expect("Should extract RST successfully");
126
+
127
+ assert!(
128
+ result.content.contains("def my_function") || result.content.contains("python"),
129
+ "Should contain Python code block or language specification"
130
+ );
131
+
132
+ assert!(
133
+ result.content.contains("return x + 1") || result.content.contains("my_function"),
134
+ "Should contain Python function code"
135
+ );
136
+
137
+ println!("✅ RST code block extraction test passed!");
138
+ }
139
+
140
+ /// Test Haskell code blocks with highlight directive
141
+ #[tokio::test]
142
+ async fn test_rst_highlight_directive_code_blocks() {
143
+ let content = rst_fixture_bytes();
144
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
145
+ .await
146
+ .expect("Should extract RST successfully");
147
+
148
+ assert!(
149
+ result.content.contains("haskell") || result.content.contains("Tree") || result.content.contains("data Tree"),
150
+ "Should contain Haskell code blocks"
151
+ );
152
+
153
+ assert!(
154
+ result.content.contains("Leaf") || result.content.contains("Node"),
155
+ "Should contain Haskell data constructors"
156
+ );
157
+
158
+ println!("✅ RST highlight directive code blocks test passed!");
159
+ }
160
+
161
+ /// Test JavaScript code blocks
162
+ #[tokio::test]
163
+ async fn test_rst_javascript_code_blocks() {
164
+ let content = rst_fixture_bytes();
165
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
166
+ .await
167
+ .expect("Should extract RST successfully");
168
+
169
+ assert!(
170
+ result.content.contains("javascript") || result.content.contains("=>") || result.content.contains("let f"),
171
+ "Should contain JavaScript code"
172
+ );
173
+
174
+ println!("✅ RST JavaScript code blocks test passed!");
175
+ }
176
+
177
+ /// Test unordered list extraction
178
+ #[tokio::test]
179
+ async fn test_rst_unordered_lists() {
180
+ let content = rst_fixture_bytes();
181
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
182
+ .await
183
+ .expect("Should extract RST successfully");
184
+
185
+ let list_items = vec![
186
+ "asterisk 1",
187
+ "asterisk 2",
188
+ "asterisk 3",
189
+ "Plus 1",
190
+ "Plus 2",
191
+ "Plus 3",
192
+ "Minus 1",
193
+ "Minus 2",
194
+ "Minus 3",
195
+ ];
196
+
197
+ for item in list_items {
198
+ assert!(result.content.contains(item), "Should contain list item: '{}'", item);
199
+ }
200
+
201
+ println!("✅ RST unordered lists test passed!");
202
+ }
203
+
204
+ /// Test ordered list extraction
205
+ #[tokio::test]
206
+ async fn test_rst_ordered_lists() {
207
+ let content = rst_fixture_bytes();
208
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
209
+ .await
210
+ .expect("Should extract RST successfully");
211
+
212
+ let list_items = vec!["First", "Second", "Third"];
213
+
214
+ for item in list_items {
215
+ assert!(
216
+ result.content.contains(item),
217
+ "Should contain ordered list item: '{}'",
218
+ item
219
+ );
220
+ }
221
+
222
+ println!("✅ RST ordered lists test passed!");
223
+ }
224
+
225
+ /// Test nested lists extraction
226
+ #[tokio::test]
227
+ async fn test_rst_nested_lists() {
228
+ let content = rst_fixture_bytes();
229
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
230
+ .await
231
+ .expect("Should extract RST successfully");
232
+
233
+ assert!(
234
+ result.content.contains("First")
235
+ || result.content.contains("Second")
236
+ || result.content.contains("Third")
237
+ || result.content.contains("Definition"),
238
+ "Should contain nested or definition list content"
239
+ );
240
+
241
+ println!("✅ RST nested lists test passed!");
242
+ }
243
+
244
+ /// Test simple table extraction
245
+ #[tokio::test]
246
+ async fn test_rst_simple_table_extraction() {
247
+ let content = rst_fixture_bytes();
248
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
249
+ .await
250
+ .expect("Should extract RST successfully");
251
+
252
+ assert!(
253
+ result.content.contains("Simple Tables")
254
+ || result.content.contains("col")
255
+ || (result.content.contains("r1") && result.content.contains("r2")),
256
+ "Should contain simple table content"
257
+ );
258
+
259
+ println!("✅ RST simple table extraction test passed!");
260
+ }
261
+
262
+ /// Test grid table extraction
263
+ #[tokio::test]
264
+ async fn test_rst_grid_table_extraction() {
265
+ let content = rst_fixture_bytes();
266
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
267
+ .await
268
+ .expect("Should extract RST successfully");
269
+
270
+ assert!(
271
+ result.content.contains("Grid Tables")
272
+ || result.content.contains("r1 a")
273
+ || (result.content.contains("r1") && result.content.contains("r2")),
274
+ "Should contain grid table content"
275
+ );
276
+
277
+ println!("✅ RST grid table extraction test passed!");
278
+ }
279
+
280
+ /// Test table with complex structure (multiple rows/columns spanning)
281
+ #[tokio::test]
282
+ async fn test_rst_complex_table_with_spanning() {
283
+ let content = rst_fixture_bytes();
284
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
285
+ .await
286
+ .expect("Should extract RST successfully");
287
+
288
+ assert!(
289
+ result.content.contains("Table with cells")
290
+ || result.content.contains("Property")
291
+ || result.content.contains("min")
292
+ || result.content.contains("°C"),
293
+ "Should contain complex table content"
294
+ );
295
+
296
+ println!("✅ RST complex table with spanning test passed!");
297
+ }
298
+
299
+ /// Test emphasis and strong markup
300
+ #[tokio::test]
301
+ async fn test_rst_emphasis_and_strong() {
302
+ let content = rst_fixture_bytes();
303
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
304
+ .await
305
+ .expect("Should extract RST successfully");
306
+
307
+ assert!(
308
+ result.content.contains("emphasized") || result.content.contains("strong"),
309
+ "Should contain emphasis markers or converted text"
310
+ );
311
+
312
+ println!("✅ RST emphasis and strong test passed!");
313
+ }
314
+
315
+ /// Test inline code extraction
316
+ #[tokio::test]
317
+ async fn test_rst_inline_code() {
318
+ let content = rst_fixture_bytes();
319
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
320
+ .await
321
+ .expect("Should extract RST successfully");
322
+
323
+ assert!(
324
+ result.content.contains(">") || result.content.contains("code"),
325
+ "Should contain inline code or code markers"
326
+ );
327
+
328
+ println!("✅ RST inline code test passed!");
329
+ }
330
+
331
+ /// Test subscript and superscript
332
+ #[tokio::test]
333
+ async fn test_rst_subscript_superscript() {
334
+ let content = rst_fixture_bytes();
335
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
336
+ .await
337
+ .expect("Should extract RST successfully");
338
+
339
+ assert!(
340
+ result.content.contains("subscript") || result.content.contains("superscript"),
341
+ "Should contain subscript/superscript text"
342
+ );
343
+
344
+ println!("✅ RST subscript/superscript test passed!");
345
+ }
346
+
347
+ /// Test explicit links extraction
348
+ #[tokio::test]
349
+ async fn test_rst_explicit_links() {
350
+ let content = rst_fixture_bytes();
351
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
352
+ .await
353
+ .expect("Should extract RST successfully");
354
+
355
+ assert!(
356
+ result.content.contains("/url") || result.content.contains("URL"),
357
+ "Should contain link URLs"
358
+ );
359
+
360
+ assert!(
361
+ result.content.contains("link"),
362
+ "Should contain link references or text"
363
+ );
364
+
365
+ println!("✅ RST explicit links test passed!");
366
+ }
367
+
368
+ /// Test reference links
369
+ #[tokio::test]
370
+ async fn test_rst_reference_links() {
371
+ let content = rst_fixture_bytes();
372
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
373
+ .await
374
+ .expect("Should extract RST successfully");
375
+
376
+ assert!(
377
+ result.content.contains("link1") || result.content.contains("link2") || result.content.contains("link"),
378
+ "Should contain resolved reference links"
379
+ );
380
+
381
+ println!("✅ RST reference links test passed!");
382
+ }
383
+
384
+ /// Test autolinks (bare URLs and email addresses)
385
+ #[tokio::test]
386
+ async fn test_rst_autolinks() {
387
+ let content = rst_fixture_bytes();
388
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
389
+ .await
390
+ .expect("Should extract RST successfully");
391
+
392
+ assert!(
393
+ result.content.contains("example.com") || result.content.contains("http"),
394
+ "Should contain URLs from autolinks"
395
+ );
396
+
397
+ assert!(
398
+ result.content.contains("nowhere") || result.content.contains("@"),
399
+ "Should contain email references"
400
+ );
401
+
402
+ println!("✅ RST autolinks test passed!");
403
+ }
404
+
405
+ /// Test image directive extraction
406
+ #[tokio::test]
407
+ async fn test_rst_image_directive() {
408
+ let content = rst_fixture_bytes();
409
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
410
+ .await
411
+ .expect("Should extract RST successfully");
412
+
413
+ assert!(
414
+ result.content.contains("image") || result.content.contains("lalune") || result.content.contains("movie"),
415
+ "Should contain image directives or references"
416
+ );
417
+
418
+ assert!(
419
+ result.content.contains("Voyage") || result.content.contains("Melies"),
420
+ "Should contain image descriptions"
421
+ );
422
+
423
+ println!("✅ RST image directive test passed!");
424
+ }
425
+
426
+ /// Test raw HTML block extraction
427
+ #[tokio::test]
428
+ async fn test_rst_raw_html_blocks() {
429
+ let content = rst_fixture_bytes();
430
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
431
+ .await
432
+ .expect("Should extract RST successfully");
433
+
434
+ assert!(
435
+ result.content.contains("div") || result.content.contains("foo"),
436
+ "Should contain HTML block content"
437
+ );
438
+
439
+ println!("✅ RST raw HTML blocks test passed!");
440
+ }
441
+
442
+ /// Test LaTeX block extraction
443
+ #[tokio::test]
444
+ async fn test_rst_latex_blocks() {
445
+ let content = rst_fixture_bytes();
446
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
447
+ .await
448
+ .expect("Should extract RST successfully");
449
+
450
+ assert!(
451
+ result.content.contains("LaTeX Block")
452
+ || result.content.contains("begin{tabular}")
453
+ || result.content.contains("Animal")
454
+ || result.content.contains("Dog"),
455
+ "Should contain LaTeX block or content"
456
+ );
457
+
458
+ println!("✅ RST LaTeX blocks test passed!");
459
+ }
460
+
461
+ /// Test math directive extraction
462
+ #[tokio::test]
463
+ async fn test_rst_math_directive() {
464
+ let content = rst_fixture_bytes();
465
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
466
+ .await
467
+ .expect("Should extract RST successfully");
468
+
469
+ assert!(
470
+ result.content.contains("E=mc^2")
471
+ || result.content.contains("E = mc")
472
+ || result.content.contains("alpha")
473
+ || result.content.contains("Math"),
474
+ "Should contain math formulas"
475
+ );
476
+
477
+ println!("✅ RST math directive test passed!");
478
+ }
479
+
480
+ /// Test comment blocks are excluded from output
481
+ #[tokio::test]
482
+ async fn test_rst_comment_blocks_excluded() {
483
+ let content = rst_fixture_bytes();
484
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
485
+ .await
486
+ .expect("Should extract RST successfully");
487
+
488
+ assert!(
489
+ !result.content.contains("should not appear"),
490
+ "Comments should be excluded from output"
491
+ );
492
+
493
+ assert!(
494
+ result.content.contains("First paragraph") || result.content.contains("paragraph"),
495
+ "Non-comment content should be present"
496
+ );
497
+
498
+ println!("✅ RST comment blocks excluded test passed!");
499
+ }
500
+
501
+ /// Test line blocks extraction
502
+ #[tokio::test]
503
+ async fn test_rst_line_blocks() {
504
+ let content = rst_fixture_bytes();
505
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
506
+ .await
507
+ .expect("Should extract RST successfully");
508
+
509
+ assert!(
510
+ result.content.contains("Line blocks")
511
+ || result.content.contains("bee")
512
+ || result.content.contains("entire bee"),
513
+ "Should contain line block content or heading"
514
+ );
515
+
516
+ println!("✅ RST line blocks test passed!");
517
+ }
518
+
519
+ /// Test unicode character preservation
520
+ #[tokio::test]
521
+ async fn test_rst_unicode_characters() {
522
+ let content = rst_fixture_bytes();
523
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
524
+ .await
525
+ .expect("Should extract RST successfully");
526
+
527
+ assert!(
528
+ result.content.contains("©")
529
+ || result.content.contains("copyright")
530
+ || result.content.contains("umlaut")
531
+ || result.content.contains("unicode"),
532
+ "Should contain unicode characters or references"
533
+ );
534
+
535
+ println!("✅ RST unicode characters test passed!");
536
+ }
537
+
538
+ /// Test escaped characters
539
+ #[tokio::test]
540
+ async fn test_rst_escaped_characters() {
541
+ let content = rst_fixture_bytes();
542
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
543
+ .await
544
+ .expect("Should extract RST successfully");
545
+
546
+ assert!(
547
+ result.content.contains("Backslash")
548
+ || result.content.contains("Backtick")
549
+ || result.content.contains("Asterisk"),
550
+ "Should contain escaped special character sections"
551
+ );
552
+
553
+ println!("✅ RST escaped characters test passed!");
554
+ }
555
+
556
+ // SECTION 12: FOOTNOTES AND REFERENCES
557
+
558
+ /// Test footnote extraction
559
+ #[tokio::test]
560
+ async fn test_rst_footnotes() {
561
+ let content = rst_fixture_bytes();
562
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
563
+ .await
564
+ .expect("Should extract RST successfully");
565
+
566
+ assert!(
567
+ result.content.contains("Note") || result.content.contains("continuation"),
568
+ "Should contain footnote content"
569
+ );
570
+
571
+ println!("✅ RST footnotes test passed!");
572
+ }
573
+
574
+ /// Test block quote extraction
575
+ #[tokio::test]
576
+ async fn test_rst_block_quotes() {
577
+ let content = rst_fixture_bytes();
578
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
579
+ .await
580
+ .expect("Should extract RST successfully");
581
+
582
+ assert!(
583
+ result.content.contains("block quote") || result.content.contains("pretty short"),
584
+ "Should contain block quote content"
585
+ );
586
+
587
+ println!("✅ RST block quotes test passed!");
588
+ }
589
+
590
+ /// Test overall content extraction volume
591
+ #[tokio::test]
592
+ async fn test_rst_content_extraction_volume() {
593
+ let content = rst_fixture_bytes();
594
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
595
+ .await
596
+ .expect("Should extract RST successfully");
597
+
598
+ let content_length = result.content.len();
599
+ println!("Extracted content length: {} bytes", content_length);
600
+
601
+ assert!(
602
+ content_length > 1000,
603
+ "Extracted content should be substantial (> 1000 bytes), got {} bytes",
604
+ content_length
605
+ );
606
+
607
+ assert_eq!(result.mime_type, "text/x-rst", "MIME type should be preserved");
608
+
609
+ println!("✅ RST content extraction volume test passed!");
610
+ println!(" Extracted {} bytes from RST file", content_length);
611
+ }
612
+
613
+ /// Test extracted content contains all major sections
614
+ #[tokio::test]
615
+ async fn test_rst_all_major_sections_present() {
616
+ let content = rst_fixture_bytes();
617
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
618
+ .await
619
+ .expect("Should extract RST successfully");
620
+
621
+ let major_sections = vec![
622
+ "Paragraphs",
623
+ "Block Quotes",
624
+ "Code Blocks",
625
+ "Lists",
626
+ "Field Lists",
627
+ "HTML Blocks",
628
+ "LaTeX Block",
629
+ "Inline Markup",
630
+ "Special Characters",
631
+ "Links",
632
+ "Images",
633
+ "Comments",
634
+ "Tables",
635
+ "Math",
636
+ ];
637
+
638
+ let content_lower = result.content.to_lowercase();
639
+ let mut found_count = 0;
640
+
641
+ for section in major_sections {
642
+ if content_lower.contains(&section.to_lowercase()) {
643
+ found_count += 1;
644
+ println!("✓ Found section: {}", section);
645
+ } else {
646
+ println!("✗ Missing section: {}", section);
647
+ }
648
+ }
649
+
650
+ assert!(
651
+ found_count >= 10,
652
+ "Should find at least 10 major sections, found {}",
653
+ found_count
654
+ );
655
+
656
+ println!("✅ RST all major sections present test passed!");
657
+ println!(" Found {}/14 major sections", found_count);
658
+ }
659
+
660
+ /// Test MIME type detection
661
+ #[tokio::test]
662
+ async fn test_rst_mime_type_detection() {
663
+ let content = rst_fixture_bytes();
664
+
665
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default())
666
+ .await
667
+ .expect("Should extract with text/x-rst MIME type");
668
+
669
+ assert_eq!(result.mime_type, "text/x-rst");
670
+
671
+ println!("✅ RST MIME type detection test passed!");
672
+ }
673
+
674
+ /// Test that no extraction errors occur on valid RST file
675
+ #[tokio::test]
676
+ async fn test_rst_extraction_no_errors() {
677
+ let content = rst_fixture_bytes();
678
+
679
+ let result = extract_bytes(&content, "text/x-rst", &ExtractionConfig::default()).await;
680
+
681
+ assert!(
682
+ result.is_ok(),
683
+ "RST extraction should succeed without errors: {:?}",
684
+ result.err()
685
+ );
686
+
687
+ let extraction = result.unwrap();
688
+
689
+ assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
690
+
691
+ println!("✅ RST extraction no errors test passed!");
692
+ }