kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,498 @@
1
+ //! Comprehensive tests for DocBook extractor supporting both 4.x and 5.x versions.
2
+
3
+ use kreuzberg::core::config::ExtractionConfig;
4
+ use kreuzberg::plugins::{DocumentExtractor, Plugin};
5
+ use std::path::PathBuf;
6
+
7
+ /// Helper to get absolute path to test documents
8
+ fn test_file_path(filename: &str) -> PathBuf {
9
+ let manifest_dir = env!("CARGO_MANIFEST_DIR");
10
+ PathBuf::from(manifest_dir)
11
+ .parent()
12
+ .unwrap()
13
+ .parent()
14
+ .unwrap()
15
+ .join("test_documents")
16
+ .join("docbook")
17
+ .join(filename)
18
+ }
19
+
20
+ /// DocBook 4.x extractor test helper
21
+ async fn extract_docbook4_file(filename: &str) -> kreuzberg::Result<kreuzberg::types::ExtractionResult> {
22
+ let extractor = kreuzberg::extractors::DocbookExtractor::new();
23
+ let path = test_file_path(filename);
24
+ let config = ExtractionConfig::default();
25
+ extractor.extract_file(&path, "application/docbook+xml", &config).await
26
+ }
27
+
28
+ /// DocBook 5.x extractor test helper
29
+ async fn extract_docbook5_file(filename: &str) -> kreuzberg::Result<kreuzberg::types::ExtractionResult> {
30
+ let extractor = kreuzberg::extractors::DocbookExtractor::new();
31
+ let path = test_file_path(filename);
32
+ let config = ExtractionConfig::default();
33
+ extractor.extract_file(&path, "application/docbook+xml", &config).await
34
+ }
35
+
36
+ /// Helper to extract bytes directly
37
+ async fn extract_docbook_bytes(
38
+ content: &[u8],
39
+ mime_type: &str,
40
+ ) -> kreuzberg::Result<kreuzberg::types::ExtractionResult> {
41
+ let extractor = kreuzberg::extractors::DocbookExtractor::new();
42
+ let config = ExtractionConfig::default();
43
+ extractor.extract_bytes(content, mime_type, &config).await
44
+ }
45
+
46
/// Checks the extractor's reported name and that `initialize()` and
/// `shutdown()` both return `Ok`.
#[test]
fn test_docbook_extractor_plugin_interface() {
    let plugin = kreuzberg::extractors::DocbookExtractor::new();

    let reported_name = plugin.name();
    assert_eq!(reported_name, "docbook-extractor");

    let started = plugin.initialize();
    assert!(started.is_ok());

    let stopped = plugin.shutdown();
    assert!(stopped.is_ok());
}
53
+
54
/// Checks that the extractor advertises both DocBook MIME types used by this
/// test file: `application/docbook+xml` and `text/docbook`.
#[test]
fn test_docbook_extractor_supported_mime_types() {
    let docbook = kreuzberg::extractors::DocbookExtractor::new();
    let supported = docbook.supported_mime_types();

    let has_xml_mime = supported.contains(&"application/docbook+xml");
    assert!(has_xml_mime);

    let has_text_mime = supported.contains(&"text/docbook");
    assert!(has_text_mime);
}
61
+
62
/// Checks that the extractor reports a priority of `50`.
#[test]
fn test_docbook_extractor_priority() {
    let reported = kreuzberg::extractors::DocbookExtractor::new().priority();
    assert_eq!(reported, 50);
}
67
+
68
+ #[tokio::test]
69
+ async fn test_docbook4_chapter_extraction() {
70
+ let result = extract_docbook4_file("docbook-chapter.docbook").await;
71
+ assert!(result.is_ok(), "Failed to extract DocBook 4 chapter");
72
+
73
+ let result = result.unwrap();
74
+ assert!(!result.content.is_empty(), "Content should not be empty");
75
+ assert!(
76
+ result.content.contains("Test Chapter"),
77
+ "Content should contain chapter title"
78
+ );
79
+ assert!(
80
+ result.content.contains("Like a Sect1"),
81
+ "Content should contain section titles"
82
+ );
83
+ }
84
+
85
+ #[tokio::test]
86
+ async fn test_docbook5_reader_extraction() {
87
+ let result = extract_docbook5_file("docbook-reader.docbook").await;
88
+ assert!(result.is_ok(), "Failed to extract DocBook 5 file");
89
+
90
+ let result = result.unwrap();
91
+ assert!(!result.content.is_empty(), "Content should not be empty");
92
+ assert!(
93
+ result.content.contains("Pandoc Test Suite"),
94
+ "Content should contain article title"
95
+ );
96
+ }
97
+
98
+ #[tokio::test]
99
+ async fn test_docbook_xref_extraction() {
100
+ let result = extract_docbook4_file("docbook-xref.docbook").await;
101
+ assert!(result.is_ok(), "Failed to extract DocBook with xref elements");
102
+
103
+ let result = result.unwrap();
104
+ assert!(!result.content.is_empty(), "Content should not be empty");
105
+ assert!(
106
+ result.content.contains("An Example Book"),
107
+ "Content should contain book title"
108
+ );
109
+ assert!(
110
+ result.content.contains("XRef Samples"),
111
+ "Content should contain xref chapter"
112
+ );
113
+ }
114
+
115
+ #[tokio::test]
116
+ async fn test_docbook_tables_extraction() {
117
+ let result = extract_docbook4_file("tables.docbook4").await;
118
+ assert!(result.is_ok(), "Failed to extract DocBook with tables");
119
+
120
+ let result = result.unwrap();
121
+ assert!(!result.content.is_empty(), "Content should not be empty");
122
+ assert!(!result.tables.is_empty(), "Should extract tables from DocBook");
123
+ }
124
+
125
+ #[tokio::test]
126
+ async fn test_docbook5_tables_extraction() {
127
+ let result = extract_docbook5_file("tables.docbook5").await;
128
+ assert!(result.is_ok(), "Failed to extract DocBook 5 with tables");
129
+
130
+ let result = result.unwrap();
131
+ assert!(!result.content.is_empty(), "Content should not be empty");
132
+ assert!(!result.tables.is_empty(), "Should extract tables from DocBook 5");
133
+ }
134
+
135
+ #[tokio::test]
136
+ async fn test_docbook_metadata_extraction() {
137
+ let result = extract_docbook5_file("docbook-reader.docbook").await;
138
+ assert!(result.is_ok());
139
+
140
+ let result = result.unwrap();
141
+ assert!(!result.content.is_empty());
142
+ }
143
+
144
+ #[tokio::test]
145
+ async fn test_docbook_section_hierarchy() {
146
+ let result = extract_docbook4_file("docbook-chapter.docbook").await;
147
+ assert!(result.is_ok());
148
+
149
+ let result = result.unwrap();
150
+ let content = &result.content;
151
+
152
+ assert!(content.contains("Like a Sect1"));
153
+ assert!(content.contains("Like a Sect2"));
154
+ assert!(content.contains("Like a Sect3"));
155
+ assert!(content.contains("Like a Sect4"));
156
+ }
157
+
158
+ #[tokio::test]
159
+ async fn test_docbook_paragraph_extraction() {
160
+ let result = extract_docbook4_file("docbook-chapter.docbook").await;
161
+ assert!(result.is_ok());
162
+
163
+ let result = result.unwrap();
164
+ assert!(
165
+ result.content.contains("This chapter uses recursive sections"),
166
+ "Should extract paragraph content"
167
+ );
168
+ }
169
+
170
+ #[tokio::test]
171
+ async fn test_docbook_paragraph_content() {
172
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
173
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
174
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
175
+ <article>
176
+ <title>Test Article</title>
177
+ <para>This is a test paragraph.</para>
178
+ <para>This is another paragraph with <emphasis>emphasized</emphasis> text.</para>
179
+ </article>"#;
180
+
181
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
182
+ assert!(result.is_ok());
183
+
184
+ let result = result.unwrap();
185
+ assert!(result.content.contains("Test Article"));
186
+ assert!(result.content.contains("This is a test paragraph"));
187
+ assert!(result.content.contains("another paragraph"));
188
+ }
189
+
190
+ #[tokio::test]
191
+ async fn test_docbook_code_block_extraction() {
192
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
193
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
194
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
195
+ <article>
196
+ <para>Here is code:</para>
197
+ <programlisting>
198
+ def hello():
199
+ print("world")
200
+ </programlisting>
201
+ </article>"#;
202
+
203
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
204
+ assert!(result.is_ok());
205
+
206
+ let result = result.unwrap();
207
+ assert!(result.content.contains("def hello"));
208
+ assert!(result.content.contains("print"));
209
+ }
210
+
211
+ #[tokio::test]
212
+ async fn test_docbook_mixed_content() {
213
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
214
+ <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
215
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
216
+ <book>
217
+ <title>Test Book</title>
218
+ <chapter>
219
+ <title>Chapter 1</title>
220
+ <section>
221
+ <title>Section 1.1</title>
222
+ <para>Paragraph in section.</para>
223
+ </section>
224
+ </chapter>
225
+ </book>"#;
226
+
227
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
228
+ assert!(result.is_ok());
229
+
230
+ let result = result.unwrap();
231
+ assert!(result.content.contains("Test Book"));
232
+ assert!(result.content.contains("Chapter 1"));
233
+ assert!(result.content.contains("Section 1.1"));
234
+ assert!(result.content.contains("Paragraph in section"));
235
+ }
236
+
237
+ #[tokio::test]
238
+ async fn test_docbook_namespaced_5x_parsing() {
239
+ let docbook5 = r#"<?xml version="1.0" encoding="UTF-8"?>
240
+ <article xmlns="http://docbook.org/ns/docbook">
241
+ <info>
242
+ <title>DocBook 5 Article</title>
243
+ <author>
244
+ <personname>
245
+ <firstname>John</firstname>
246
+ <surname>Doe</surname>
247
+ </personname>
248
+ </author>
249
+ <date>2024-01-01</date>
250
+ </info>
251
+ <section>
252
+ <title>Introduction</title>
253
+ <para>Welcome to DocBook 5.</para>
254
+ </section>
255
+ </article>"#;
256
+
257
+ let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml").await;
258
+ assert!(result.is_ok());
259
+
260
+ let result = result.unwrap();
261
+ assert!(result.content.contains("DocBook 5 Article"));
262
+ assert!(result.content.contains("Welcome to DocBook 5"));
263
+ }
264
+
265
+ #[tokio::test]
266
+ async fn test_docbook_link_handling() {
267
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
268
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
269
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
270
+ <article>
271
+ <title>Links Test</title>
272
+ <para>See <link xlink:href="http://example.com">example site</link>.</para>
273
+ </article>"#;
274
+
275
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
276
+ assert!(result.is_ok());
277
+
278
+ let result = result.unwrap();
279
+ assert!(result.content.contains("example"));
280
+ }
281
+
282
+ #[tokio::test]
283
+ async fn test_docbook_mime_type_detection() {
284
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
285
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
286
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
287
+ <article>
288
+ <title>Test</title>
289
+ </article>"#;
290
+
291
+ let result1 = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
292
+ assert!(result1.is_ok());
293
+
294
+ let result2 = extract_docbook_bytes(docbook.as_bytes(), "text/docbook").await;
295
+ assert!(result2.is_ok());
296
+ }
297
+
298
+ #[tokio::test]
299
+ async fn test_docbook_empty_sections() {
300
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
301
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
302
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
303
+ <article>
304
+ <title>Empty Sections</title>
305
+ <section>
306
+ <title>Empty Section</title>
307
+ </section>
308
+ <section>
309
+ <title>Section with Content</title>
310
+ <para>Content here</para>
311
+ </section>
312
+ </article>"#;
313
+
314
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
315
+ assert!(result.is_ok());
316
+
317
+ let result = result.unwrap();
318
+ assert!(result.content.contains("Empty Section"));
319
+ assert!(result.content.contains("Section with Content"));
320
+ assert!(result.content.contains("Content here"));
321
+ }
322
+
323
+ #[tokio::test]
324
+ async fn test_docbook_itemized_list() {
325
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
326
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
327
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
328
+ <article>
329
+ <title>List Test</title>
330
+ <itemizedlist>
331
+ <listitem>
332
+ <para>First item</para>
333
+ </listitem>
334
+ <listitem>
335
+ <para>Second item</para>
336
+ </listitem>
337
+ <listitem>
338
+ <para>Third item</para>
339
+ </listitem>
340
+ </itemizedlist>
341
+ </article>"#;
342
+
343
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
344
+ assert!(result.is_ok());
345
+
346
+ let result = result.unwrap();
347
+ assert!(result.content.contains("First item"));
348
+ assert!(result.content.contains("Second item"));
349
+ assert!(result.content.contains("Third item"));
350
+ assert!(result.content.contains("- "), "Should contain bullet points");
351
+ }
352
+
353
+ #[tokio::test]
354
+ async fn test_docbook_ordered_list() {
355
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
356
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
357
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
358
+ <article>
359
+ <title>Ordered List Test</title>
360
+ <orderedlist>
361
+ <listitem>
362
+ <para>First step</para>
363
+ </listitem>
364
+ <listitem>
365
+ <para>Second step</para>
366
+ </listitem>
367
+ <listitem>
368
+ <para>Third step</para>
369
+ </listitem>
370
+ </orderedlist>
371
+ </article>"#;
372
+
373
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
374
+ assert!(result.is_ok());
375
+
376
+ let result = result.unwrap();
377
+ assert!(result.content.contains("First step"));
378
+ assert!(result.content.contains("Second step"));
379
+ assert!(result.content.contains("Third step"));
380
+ assert!(result.content.contains("1. "), "Should contain numbered list");
381
+ }
382
+
383
+ #[tokio::test]
384
+ async fn test_docbook_blockquote() {
385
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
386
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
387
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
388
+ <article>
389
+ <title>Blockquote Test</title>
390
+ <blockquote>
391
+ <para>This is a quoted passage.</para>
392
+ </blockquote>
393
+ </article>"#;
394
+
395
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
396
+ assert!(result.is_ok());
397
+
398
+ let result = result.unwrap();
399
+ assert!(result.content.contains("quoted passage"));
400
+ assert!(result.content.contains("> "), "Should contain blockquote marker");
401
+ }
402
+
403
+ #[tokio::test]
404
+ async fn test_docbook_figure() {
405
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
406
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
407
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
408
+ <article>
409
+ <title>Figure Test</title>
410
+ <figure>
411
+ <title>Sample Figure</title>
412
+ <para>This is a figure description.</para>
413
+ </figure>
414
+ </article>"#;
415
+
416
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
417
+ assert!(result.is_ok());
418
+
419
+ let result = result.unwrap();
420
+ assert!(result.content.contains("Figure"));
421
+ }
422
+
423
+ #[tokio::test]
424
+ async fn test_docbook_footnote() {
425
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
426
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
427
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
428
+ <article>
429
+ <title>Footnote Test</title>
430
+ <para>Here is some text with a footnote<footnote><para>This is the footnote content</para></footnote>.</para>
431
+ </article>"#;
432
+
433
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
434
+ assert!(result.is_ok());
435
+
436
+ let result = result.unwrap();
437
+ assert!(result.content.contains("text with a footnote"));
438
+ assert!(result.content.contains("footnote content"));
439
+ }
440
+
441
+ #[tokio::test]
442
+ async fn test_docbook_mixed_content_with_lists() {
443
+ let docbook = r#"<?xml version="1.0" encoding="UTF-8"?>
444
+ <!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
445
+ "http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd">
446
+ <article>
447
+ <title>Mixed Content</title>
448
+ <para>Introduction paragraph.</para>
449
+ <itemizedlist>
450
+ <listitem>
451
+ <para>List item 1</para>
452
+ </listitem>
453
+ <listitem>
454
+ <para>List item 2</para>
455
+ </listitem>
456
+ </itemizedlist>
457
+ <para>Conclusion paragraph.</para>
458
+ <programlisting>
459
+ code example
460
+ </programlisting>
461
+ </article>"#;
462
+
463
+ let result = extract_docbook_bytes(docbook.as_bytes(), "application/docbook+xml").await;
464
+ assert!(result.is_ok());
465
+
466
+ let result = result.unwrap();
467
+ assert!(result.content.contains("Introduction paragraph"));
468
+ assert!(result.content.contains("List item 1"));
469
+ assert!(result.content.contains("List item 2"));
470
+ assert!(result.content.contains("Conclusion paragraph"));
471
+ assert!(result.content.contains("code example"));
472
+ }
473
+
474
+ #[tokio::test]
475
+ async fn test_docbook_namespaced_lists() {
476
+ let docbook5 = r#"<?xml version="1.0" encoding="UTF-8"?>
477
+ <article xmlns="http://docbook.org/ns/docbook">
478
+ <info>
479
+ <title>Lists in DocBook 5</title>
480
+ </info>
481
+ <itemizedlist>
482
+ <listitem>
483
+ <para>Namespaced item 1</para>
484
+ </listitem>
485
+ <listitem>
486
+ <para>Namespaced item 2</para>
487
+ </listitem>
488
+ </itemizedlist>
489
+ </article>"#;
490
+
491
+ let result = extract_docbook_bytes(docbook5.as_bytes(), "application/docbook+xml").await;
492
+ assert!(result.is_ok());
493
+
494
+ let result = result.unwrap();
495
+ assert!(result.content.contains("Namespaced item 1"));
496
+ assert!(result.content.contains("Namespaced item 2"));
497
+ assert!(result.content.contains("- "));
498
+ }
@@ -0,0 +1,122 @@
1
+ //! End-to-end integration test for DOCX metadata extraction
2
+
3
+ #![cfg(feature = "office")]
4
+
5
+ use kreuzberg::{ExtractionConfig, extract_file};
6
+
7
+ #[tokio::test]
8
+ async fn test_docx_full_metadata_extraction() {
9
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
10
+ .parent()
11
+ .unwrap()
12
+ .parent()
13
+ .unwrap();
14
+ let test_file = workspace_root.join("test_documents/documents/word_sample.docx");
15
+
16
+ if !test_file.exists() {
17
+ println!("Skipping test: Test file not found at {:?}", test_file);
18
+ return;
19
+ }
20
+
21
+ let result = extract_file(&test_file, None, &ExtractionConfig::default())
22
+ .await
23
+ .expect("Should extract DOCX successfully");
24
+
25
+ assert!(!result.content.is_empty(), "Content should not be empty");
26
+ assert!(
27
+ result.content.to_lowercase().contains("swim"),
28
+ "Content should contain 'swim'"
29
+ );
30
+
31
+ assert_eq!(
32
+ result.metadata.additional.get("created_by").and_then(|v| v.as_str()),
33
+ Some("Christoph Auer"),
34
+ "Should have correct creator"
35
+ );
36
+ assert_eq!(
37
+ result.metadata.additional.get("modified_by").and_then(|v| v.as_str()),
38
+ Some("Maxim Lysak"),
39
+ "Should have correct last modified by"
40
+ );
41
+ assert_eq!(
42
+ result.metadata.additional.get("created_at").and_then(|v| v.as_str()),
43
+ Some("2024-10-09T12:43:00Z"),
44
+ "Should have correct creation date"
45
+ );
46
+ assert_eq!(
47
+ result.metadata.additional.get("revision").and_then(|v| v.as_str()),
48
+ Some("7"),
49
+ "Should have revision number"
50
+ );
51
+
52
+ assert_eq!(
53
+ result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
54
+ Some(2),
55
+ "Should have 2 pages"
56
+ );
57
+ assert_eq!(
58
+ result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
59
+ Some(108),
60
+ "Should have 108 words"
61
+ );
62
+ assert_eq!(
63
+ result
64
+ .metadata
65
+ .additional
66
+ .get("character_count")
67
+ .and_then(|v| v.as_i64()),
68
+ Some(620),
69
+ "Should have 620 characters"
70
+ );
71
+ assert_eq!(
72
+ result.metadata.additional.get("line_count").and_then(|v| v.as_i64()),
73
+ Some(5),
74
+ "Should have 5 lines"
75
+ );
76
+ assert_eq!(
77
+ result
78
+ .metadata
79
+ .additional
80
+ .get("paragraph_count")
81
+ .and_then(|v| v.as_i64()),
82
+ Some(1),
83
+ "Should have 1 paragraph"
84
+ );
85
+
86
+ println!("✅ DOCX metadata extraction test passed!");
87
+ println!(" Found {} metadata fields", result.metadata.additional.len());
88
+ }
89
+
90
+ #[tokio::test]
91
+ async fn test_docx_minimal_metadata_extraction() {
92
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
93
+ .parent()
94
+ .unwrap()
95
+ .parent()
96
+ .unwrap();
97
+ let test_file = workspace_root.join("test_documents/documents/lorem_ipsum.docx");
98
+
99
+ if !test_file.exists() {
100
+ println!("Skipping test: Test file not found at {:?}", test_file);
101
+ return;
102
+ }
103
+
104
+ let result = extract_file(&test_file, None, &ExtractionConfig::default())
105
+ .await
106
+ .expect("Should extract DOCX successfully");
107
+
108
+ assert!(!result.content.is_empty(), "Content should not be empty");
109
+
110
+ assert_eq!(
111
+ result.metadata.additional.get("page_count").and_then(|v| v.as_i64()),
112
+ Some(1),
113
+ "Should have 1 page"
114
+ );
115
+ assert_eq!(
116
+ result.metadata.additional.get("word_count").and_then(|v| v.as_i64()),
117
+ Some(520),
118
+ "Should have 520 words"
119
+ );
120
+
121
+ println!("✅ DOCX minimal metadata extraction test passed!");
122
+ }