kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,854 @@
1
+ //! Email extraction functions.
2
+ //!
3
+ //! Parses .eml (RFC822) files using `mail-parser` and .msg (Outlook) files using `msg_parser`.
4
+ //! Extracts message content, headers, and attachment information.
5
+ //!
6
+ //! # Features
7
+ //!
8
+ //! - **EML support**: RFC822 format parsing
9
+ //! - **HTML to text**: Strips HTML tags from HTML email bodies
10
+ //! - **Metadata extraction**: Sender, recipients, subject, message ID
11
+ //! - **Attachments**: Name, MIME type, size, and raw content of each attachment
12
+ //!
13
+ //! # Example
14
+ //!
15
+ //! ```rust,no_run
16
+ //! use kreuzberg::extraction::email::parse_eml_content;
17
+ //!
18
+ //! # fn example() -> kreuzberg::Result<()> {
19
+ //! let eml_bytes = std::fs::read("message.eml")?;
20
+ //! let result = parse_eml_content(&eml_bytes)?;
21
+ //!
22
+ //! println!("From: {:?}", result.from_email);
23
+ //! println!("Subject: {:?}", result.subject);
24
+ //! # Ok(())
25
+ //! # }
26
+ //! ```
27
+ use crate::error::{KreuzbergError, Result};
28
+ use crate::types::{EmailAttachment, EmailExtractionResult};
29
+ use mail_parser::MimeHeaders;
30
+ use regex::Regex;
31
+ use std::collections::HashMap;
32
+ use std::sync::OnceLock;
33
+
34
+ static HTML_TAG_RE: OnceLock<Regex> = OnceLock::new();
35
+ static SCRIPT_RE: OnceLock<Regex> = OnceLock::new();
36
+ static STYLE_RE: OnceLock<Regex> = OnceLock::new();
37
+ static WHITESPACE_RE: OnceLock<Regex> = OnceLock::new();
38
+
39
+ fn html_tag_regex() -> &'static Regex {
40
+ HTML_TAG_RE.get_or_init(|| Regex::new(r"<[^>]+>").unwrap())
41
+ }
42
+
43
+ fn script_regex() -> &'static Regex {
44
+ SCRIPT_RE.get_or_init(|| Regex::new(r"(?i)<script[^>]*>.*?</script>").unwrap())
45
+ }
46
+
47
+ fn style_regex() -> &'static Regex {
48
+ STYLE_RE.get_or_init(|| Regex::new(r"(?i)<style[^>]*>.*?</style>").unwrap())
49
+ }
50
+
51
+ fn whitespace_regex() -> &'static Regex {
52
+ WHITESPACE_RE.get_or_init(|| Regex::new(r"\s+").unwrap())
53
+ }
54
+
55
+ /// Parse .eml file content (RFC822 format)
56
+ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
57
+ let message = mail_parser::MessageParser::default()
58
+ .parse(data)
59
+ .ok_or_else(|| KreuzbergError::parsing("Failed to parse EML file: invalid email format".to_string()))?;
60
+
61
+ let subject = message.subject().map(|s| s.to_string());
62
+
63
+ let from_email = message
64
+ .from()
65
+ .and_then(|from| from.first())
66
+ .and_then(|addr| addr.address())
67
+ .map(|s| s.to_string());
68
+
69
+ let to_emails: Vec<String> = message
70
+ .to()
71
+ .map(|to| {
72
+ to.iter()
73
+ .filter_map(|addr| addr.address().map(|s| s.to_string()))
74
+ .collect()
75
+ })
76
+ .unwrap_or_else(Vec::new);
77
+
78
+ let cc_emails: Vec<String> = message
79
+ .cc()
80
+ .map(|cc| {
81
+ cc.iter()
82
+ .filter_map(|addr| addr.address().map(|s| s.to_string()))
83
+ .collect()
84
+ })
85
+ .unwrap_or_else(Vec::new);
86
+
87
+ let bcc_emails: Vec<String> = message
88
+ .bcc()
89
+ .map(|bcc| {
90
+ bcc.iter()
91
+ .filter_map(|addr| addr.address().map(|s| s.to_string()))
92
+ .collect()
93
+ })
94
+ .unwrap_or_else(Vec::new);
95
+
96
+ let date = message.date().map(|d| d.to_rfc3339());
97
+
98
+ let message_id = message.message_id().map(|id| id.to_string());
99
+
100
+ let plain_text = message.body_text(0).map(|s| s.to_string());
101
+
102
+ let html_content = message.body_html(0).map(|s| s.to_string());
103
+
104
+ let cleaned_text = if let Some(plain) = &plain_text {
105
+ plain.clone()
106
+ } else if let Some(html) = &html_content {
107
+ clean_html_content(html)
108
+ } else {
109
+ String::new()
110
+ };
111
+
112
+ let mut attachments = Vec::new();
113
+ for attachment in message.attachments() {
114
+ let filename = attachment.attachment_name().map(|s| s.to_string());
115
+
116
+ let mime_type = attachment
117
+ .content_type()
118
+ .map(|ct| {
119
+ let content_type_str = format!("{}/{}", ct.ctype(), ct.subtype().unwrap_or("octet-stream"));
120
+ parse_content_type(&content_type_str)
121
+ })
122
+ .unwrap_or_else(|| "application/octet-stream".to_string());
123
+
124
+ let data = attachment.contents();
125
+ let size = data.len();
126
+
127
+ let is_image = is_image_mime_type(&mime_type);
128
+
129
+ attachments.push(EmailAttachment {
130
+ name: filename.clone(),
131
+ filename,
132
+ mime_type: Some(mime_type),
133
+ size: Some(size),
134
+ is_image,
135
+ data: Some(data.to_vec()),
136
+ });
137
+ }
138
+
139
+ let metadata = build_metadata(
140
+ &subject,
141
+ &from_email,
142
+ &to_emails,
143
+ &cc_emails,
144
+ &bcc_emails,
145
+ &date,
146
+ &message_id,
147
+ &attachments,
148
+ );
149
+
150
+ Ok(EmailExtractionResult {
151
+ subject,
152
+ from_email,
153
+ to_emails,
154
+ cc_emails,
155
+ bcc_emails,
156
+ date,
157
+ message_id,
158
+ plain_text,
159
+ html_content,
160
+ cleaned_text,
161
+ attachments,
162
+ metadata,
163
+ })
164
+ }
165
+
166
+ /// Parse .msg file content (Outlook format)
167
+ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
168
+ let outlook = msg_parser::Outlook::from_slice(data)
169
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to parse MSG file: {}", e)))?;
170
+
171
+ let subject = Some(outlook.subject.clone());
172
+ let from_email = Some(outlook.sender.email.clone());
173
+
174
+ let to_emails = outlook
175
+ .to
176
+ .iter()
177
+ .map(|p| p.email.clone())
178
+ .filter(|e| !e.is_empty())
179
+ .collect::<Vec<String>>();
180
+
181
+ let cc_emails = outlook
182
+ .cc
183
+ .iter()
184
+ .map(|p| p.email.clone())
185
+ .filter(|e| !e.is_empty())
186
+ .collect::<Vec<String>>();
187
+
188
+ let bcc_emails = if !outlook.bcc.is_empty() {
189
+ vec![outlook.bcc.clone()]
190
+ } else {
191
+ vec![]
192
+ };
193
+
194
+ let date = if !outlook.headers.date.is_empty() {
195
+ Some(outlook.headers.date.clone())
196
+ } else {
197
+ None
198
+ };
199
+
200
+ let message_id = if !outlook.headers.message_id.is_empty() {
201
+ Some(outlook.headers.message_id.clone())
202
+ } else {
203
+ None
204
+ };
205
+
206
+ let plain_text = if !outlook.body.is_empty() {
207
+ Some(outlook.body.clone())
208
+ } else {
209
+ None
210
+ };
211
+
212
+ let html_content = None;
213
+ let cleaned_text = plain_text.clone().unwrap_or_default();
214
+
215
+ let attachments: Vec<EmailAttachment> = outlook
216
+ .attachments
217
+ .iter()
218
+ .map(|att| {
219
+ let filename = if !att.file_name.is_empty() {
220
+ Some(att.file_name.clone())
221
+ } else if !att.display_name.is_empty() {
222
+ Some(att.display_name.clone())
223
+ } else {
224
+ Some(format!("attachment{}", att.extension))
225
+ };
226
+
227
+ let mime_type = if !att.mime_tag.is_empty() {
228
+ Some(att.mime_tag.clone())
229
+ } else {
230
+ Some("application/octet-stream".to_string())
231
+ };
232
+
233
+ let data = if !att.payload.is_empty() {
234
+ hex::decode(&att.payload).ok()
235
+ } else {
236
+ None
237
+ };
238
+
239
+ let size = data.as_ref().map(|d| d.len());
240
+ let is_image = mime_type.as_ref().map(|m| is_image_mime_type(m)).unwrap_or(false);
241
+
242
+ EmailAttachment {
243
+ name: filename.clone(),
244
+ filename,
245
+ mime_type,
246
+ size,
247
+ is_image,
248
+ data,
249
+ }
250
+ })
251
+ .collect();
252
+
253
+ let from_name = if !outlook.sender.name.is_empty() {
254
+ Some(outlook.sender.name.clone())
255
+ } else {
256
+ None
257
+ };
258
+
259
+ let mut metadata = HashMap::new();
260
+ if let Some(ref subj) = subject {
261
+ metadata.insert("subject".to_string(), subj.to_string());
262
+ }
263
+ if let Some(ref from) = from_email {
264
+ metadata.insert("email_from".to_string(), from.to_string());
265
+ }
266
+ if let Some(ref name) = from_name {
267
+ metadata.insert("from_name".to_string(), name.to_string());
268
+ }
269
+ if !to_emails.is_empty() {
270
+ metadata.insert("email_to".to_string(), to_emails.join(", "));
271
+ }
272
+ if !cc_emails.is_empty() {
273
+ metadata.insert("email_cc".to_string(), cc_emails.join(", "));
274
+ }
275
+ if !bcc_emails.is_empty() {
276
+ metadata.insert("email_bcc".to_string(), bcc_emails.join(", "));
277
+ }
278
+ if let Some(ref dt) = date {
279
+ metadata.insert("date".to_string(), dt.to_string());
280
+ }
281
+ if let Some(ref msg_id) = message_id {
282
+ metadata.insert("message_id".to_string(), msg_id.to_string());
283
+ }
284
+ if !attachments.is_empty() {
285
+ let attachment_names: Vec<String> = attachments
286
+ .iter()
287
+ .filter_map(|a| a.filename.as_ref())
288
+ .cloned()
289
+ .collect();
290
+ metadata.insert("attachments".to_string(), attachment_names.join(", "));
291
+ }
292
+
293
+ Ok(EmailExtractionResult {
294
+ subject,
295
+ from_email,
296
+ to_emails,
297
+ cc_emails,
298
+ bcc_emails,
299
+ date,
300
+ message_id,
301
+ plain_text,
302
+ html_content,
303
+ cleaned_text,
304
+ attachments,
305
+ metadata,
306
+ })
307
+ }
308
+
309
+ /// Extract email content from either .eml or .msg format
310
+ pub fn extract_email_content(data: &[u8], mime_type: &str) -> Result<EmailExtractionResult> {
311
+ if data.is_empty() {
312
+ return Err(KreuzbergError::validation("Email content is empty".to_string()));
313
+ }
314
+
315
+ match mime_type {
316
+ "message/rfc822" | "text/plain" => parse_eml_content(data),
317
+ "application/vnd.ms-outlook" => parse_msg_content(data),
318
+ _ => Err(KreuzbergError::validation(format!(
319
+ "Unsupported email MIME type: {}",
320
+ mime_type
321
+ ))),
322
+ }
323
+ }
324
+
325
+ /// Build text output from email extraction result
326
+ pub fn build_email_text_output(result: &EmailExtractionResult) -> String {
327
+ let mut text_parts = Vec::new();
328
+
329
+ if let Some(ref subject) = result.subject {
330
+ text_parts.push(format!("Subject: {}", subject));
331
+ }
332
+
333
+ if let Some(ref from) = result.from_email {
334
+ text_parts.push(format!("From: {}", from));
335
+ }
336
+
337
+ if !result.to_emails.is_empty() {
338
+ text_parts.push(format!("To: {}", result.to_emails.join(", ")));
339
+ }
340
+
341
+ if !result.cc_emails.is_empty() {
342
+ text_parts.push(format!("CC: {}", result.cc_emails.join(", ")));
343
+ }
344
+
345
+ if !result.bcc_emails.is_empty() {
346
+ text_parts.push(format!("BCC: {}", result.bcc_emails.join(", ")));
347
+ }
348
+
349
+ if let Some(ref date) = result.date {
350
+ text_parts.push(format!("Date: {}", date));
351
+ }
352
+
353
+ text_parts.push(result.cleaned_text.clone());
354
+
355
+ if !result.attachments.is_empty() {
356
+ let attachment_names: Vec<String> = result
357
+ .attachments
358
+ .iter()
359
+ .filter_map(|att| att.name.as_ref().or(att.filename.as_ref()))
360
+ .cloned()
361
+ .collect();
362
+ if !attachment_names.is_empty() {
363
+ text_parts.push(format!("Attachments: {}", attachment_names.join(", ")));
364
+ }
365
+ }
366
+
367
+ text_parts.join("\n")
368
+ }
369
+
370
+ fn clean_html_content(html: &str) -> String {
371
+ if html.is_empty() {
372
+ return String::new();
373
+ }
374
+
375
+ let cleaned = script_regex().replace_all(html, "");
376
+ let cleaned = style_regex().replace_all(&cleaned, "");
377
+
378
+ let cleaned = html_tag_regex().replace_all(&cleaned, "");
379
+
380
+ let cleaned = whitespace_regex().replace_all(&cleaned, " ");
381
+
382
+ cleaned.trim().to_string()
383
+ }
384
+
385
/// Returns `true` when `mime_type` belongs to the `image/*` family.
///
/// The comparison ignores ASCII case because MIME type names are
/// case-insensitive, so `IMAGE/PNG` and `image/png` are both recognised.
fn is_image_mime_type(mime_type: &str) -> bool {
    const PREFIX: &str = "image/";

    // `get` returns `None` when the input is shorter than the prefix or the
    // cut would fall inside a multi-byte character; neither can be an image type.
    mime_type
        .get(..PREFIX.len())
        .is_some_and(|head| head.eq_ignore_ascii_case(PREFIX))
}
388
+
389
/// Normalises a `Content-Type` value to a bare, lowercase media type.
///
/// Everything from the first `;` onwards (parameters such as `charset`) is
/// discarded, surrounding whitespace is trimmed and the result is lowercased.
/// When nothing remains — the input is empty, blank, or starts with `;` —
/// `application/octet-stream` is returned, so callers never receive an
/// empty media type.
fn parse_content_type(content_type: &str) -> String {
    const FALLBACK: &str = "application/octet-stream";

    // `split` always yields at least one segment, so the default is never used.
    let media_type = content_type.split(';').next().unwrap_or_default().trim();

    if media_type.is_empty() {
        FALLBACK.to_string()
    } else {
        media_type.to_lowercase()
    }
}
401
+
402
+ #[allow(clippy::too_many_arguments)]
403
+ fn build_metadata(
404
+ subject: &Option<String>,
405
+ from_email: &Option<String>,
406
+ to_emails: &[String],
407
+ cc_emails: &[String],
408
+ bcc_emails: &[String],
409
+ date: &Option<String>,
410
+ message_id: &Option<String>,
411
+ attachments: &[EmailAttachment],
412
+ ) -> HashMap<String, String> {
413
+ let mut metadata = HashMap::new();
414
+
415
+ if let Some(subj) = subject {
416
+ metadata.insert("subject".to_string(), subj.clone());
417
+ }
418
+ if let Some(from) = from_email {
419
+ metadata.insert("email_from".to_string(), from.clone());
420
+ }
421
+ if !to_emails.is_empty() {
422
+ metadata.insert("email_to".to_string(), to_emails.join(", "));
423
+ }
424
+ if !cc_emails.is_empty() {
425
+ metadata.insert("email_cc".to_string(), cc_emails.join(", "));
426
+ }
427
+ if !bcc_emails.is_empty() {
428
+ metadata.insert("email_bcc".to_string(), bcc_emails.join(", "));
429
+ }
430
+ if let Some(dt) = date {
431
+ metadata.insert("date".to_string(), dt.clone());
432
+ }
433
+ if let Some(msg_id) = message_id {
434
+ metadata.insert("message_id".to_string(), msg_id.clone());
435
+ }
436
+
437
+ if !attachments.is_empty() {
438
+ let attachment_names: Vec<String> = attachments
439
+ .iter()
440
+ .filter_map(|att| att.name.as_ref().or(att.filename.as_ref()))
441
+ .cloned()
442
+ .collect();
443
+ if !attachment_names.is_empty() {
444
+ metadata.insert("attachments".to_string(), attachment_names.join(", "));
445
+ }
446
+ }
447
+
448
+ metadata
449
+ }
450
+
451
// Unit tests for the email extraction helpers: HTML cleaning, MIME/content-type parsing,
// EML/MSG parsing entry points, text rendering and metadata assembly.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the smallest `EmailExtractionResult` the rendering tests need: fixed
    /// subject/sender/recipient, no optional headers, and the given body and attachments.
    fn minimal_result(cleaned_text: &str, attachments: Vec<EmailAttachment>) -> EmailExtractionResult {
        EmailExtractionResult {
            subject: Some("Test".to_string()),
            from_email: Some("sender@example.com".to_string()),
            to_emails: vec!["recipient@example.com".to_string()],
            cc_emails: vec![],
            bcc_emails: vec![],
            date: None,
            message_id: None,
            plain_text: None,
            html_content: None,
            cleaned_text: cleaned_text.to_string(),
            attachments,
            metadata: HashMap::new(),
        }
    }

    #[test]
    fn test_clean_html_content() {
        let html = "<p>Hello <b>World</b></p>";
        let cleaned = clean_html_content(html);
        assert_eq!(cleaned, "Hello World");
    }

    #[test]
    fn test_clean_html_with_whitespace() {
        let html = "<div> Multiple \n spaces </div>";
        let cleaned = clean_html_content(html);
        assert_eq!(cleaned, "Multiple spaces");
    }

    #[test]
    fn test_clean_html_with_script_and_style() {
        let html = r#"
            <html>
                <head><style>body { color: red; }</style></head>
                <body>
                    <script>alert('test');</script>
                    <p>Hello World</p>
                </body>
            </html>
        "#;
        let cleaned = clean_html_content(html);
        assert!(!cleaned.contains("<script>"));
        assert!(!cleaned.contains("<style>"));
        assert!(cleaned.contains("Hello World"));
    }

    #[test]
    fn test_is_image_mime_type() {
        assert!(is_image_mime_type("image/png"));
        assert!(is_image_mime_type("image/jpeg"));
        assert!(!is_image_mime_type("text/plain"));
        assert!(!is_image_mime_type("application/pdf"));
    }

    #[test]
    fn test_parse_content_type() {
        assert_eq!(parse_content_type("text/plain"), "text/plain");
        assert_eq!(parse_content_type("text/plain; charset=utf-8"), "text/plain");
        assert_eq!(parse_content_type("image/jpeg; name=test.jpg"), "image/jpeg");
        assert_eq!(parse_content_type(""), "application/octet-stream");
    }

    #[test]
    fn test_extract_email_content_empty_data() {
        let result = extract_email_content(b"", "message/rfc822");
        assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
    }

    #[test]
    fn test_extract_email_content_invalid_mime_type() {
        let result = extract_email_content(b"test", "application/pdf");
        assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
    }

    // Despite the name, this pins that header-less input is accepted rather than rejected.
    #[test]
    fn test_parse_eml_content_invalid() {
        let result = parse_eml_content(b"not an email");
        assert!(result.is_ok());
    }

    #[test]
    fn test_parse_msg_content_invalid() {
        let result = parse_msg_content(b"not a msg file");
        assert!(matches!(result, Err(KreuzbergError::Parsing { .. })));
    }

    #[test]
    fn test_simple_eml_parsing() {
        let eml_content =
            b"From: test@example.com\r\nTo: recipient@example.com\r\nSubject: Test Email\r\n\r\nThis is a test email body.";

        let result = parse_eml_content(eml_content).unwrap();
        assert_eq!(result.subject, Some("Test Email".to_string()));
        assert_eq!(result.from_email, Some("test@example.com".to_string()));
        assert_eq!(result.to_emails, vec!["recipient@example.com".to_string()]);
        assert_eq!(result.cleaned_text, "This is a test email body.");
    }

    #[test]
    fn test_build_email_text_output_minimal() {
        let result = minimal_result("Hello World", vec![]);

        let output = build_email_text_output(&result);
        assert!(output.contains("Subject: Test"));
        assert!(output.contains("From: sender@example.com"));
        assert!(output.contains("To: recipient@example.com"));
        assert!(output.contains("Hello World"));
    }

    #[test]
    fn test_build_email_text_output_with_attachments() {
        let result = minimal_result(
            "Hello World",
            vec![EmailAttachment {
                name: Some("file.txt".to_string()),
                filename: Some("file.txt".to_string()),
                mime_type: Some("text/plain".to_string()),
                size: Some(1024),
                is_image: false,
                data: None,
            }],
        );

        let output = build_email_text_output(&result);
        assert!(output.contains("Attachments: file.txt"));
    }

    #[test]
    fn test_build_metadata() {
        let subject = Some("Test Subject".to_string());
        let from_email = Some("sender@example.com".to_string());
        let to_emails = vec!["recipient@example.com".to_string()];
        let cc_emails = vec!["cc@example.com".to_string()];
        let bcc_emails = vec!["bcc@example.com".to_string()];
        let date = Some("2024-01-01T12:00:00Z".to_string());
        let message_id = Some("<abc123@example.com>".to_string());
        let attachments = vec![];

        let metadata = build_metadata(
            &subject,
            &from_email,
            &to_emails,
            &cc_emails,
            &bcc_emails,
            &date,
            &message_id,
            &attachments,
        );

        assert_eq!(metadata.get("subject"), Some(&"Test Subject".to_string()));
        assert_eq!(metadata.get("email_from"), Some(&"sender@example.com".to_string()));
        assert_eq!(metadata.get("email_to"), Some(&"recipient@example.com".to_string()));
        assert_eq!(metadata.get("email_cc"), Some(&"cc@example.com".to_string()));
        assert_eq!(metadata.get("email_bcc"), Some(&"bcc@example.com".to_string()));
        assert_eq!(metadata.get("date"), Some(&"2024-01-01T12:00:00Z".to_string()));
        assert_eq!(metadata.get("message_id"), Some(&"<abc123@example.com>".to_string()));
        // No attachments were supplied, so the key must be absent rather than empty.
        assert!(!metadata.contains_key("attachments"));
    }

    #[test]
    fn test_build_metadata_with_attachments() {
        let attachments = vec![
            EmailAttachment {
                name: Some("file1.pdf".to_string()),
                filename: Some("file1.pdf".to_string()),
                mime_type: Some("application/pdf".to_string()),
                size: Some(1024),
                is_image: false,
                data: None,
            },
            EmailAttachment {
                name: Some("image.png".to_string()),
                filename: Some("image.png".to_string()),
                mime_type: Some("image/png".to_string()),
                size: Some(2048),
                is_image: true,
                data: None,
            },
        ];

        let metadata = build_metadata(&None, &None, &[], &[], &[], &None, &None, &attachments);

        assert_eq!(metadata.get("attachments"), Some(&"file1.pdf, image.png".to_string()));
        assert_eq!(metadata.len(), 1);
    }

    // `name` wins when present; `filename` is only the fallback; unnamed entries are skipped.
    #[test]
    fn test_build_metadata_attachment_filename_fallback() {
        let attachments = vec![
            EmailAttachment {
                name: Some("display.pdf".to_string()),
                filename: Some("on-disk.pdf".to_string()),
                mime_type: Some("application/pdf".to_string()),
                size: Some(1024),
                is_image: false,
                data: None,
            },
            EmailAttachment {
                name: None,
                filename: Some("fallback.png".to_string()),
                mime_type: Some("image/png".to_string()),
                size: Some(2048),
                is_image: true,
                data: None,
            },
            EmailAttachment {
                name: None,
                filename: None,
                mime_type: None,
                size: None,
                is_image: false,
                data: None,
            },
        ];

        let metadata = build_metadata(&None, &None, &[], &[], &[], &None, &None, &attachments);

        assert_eq!(
            metadata.get("attachments"),
            Some(&"display.pdf, fallback.png".to_string())
        );
    }

    #[test]
    fn test_clean_html_content_empty() {
        let cleaned = clean_html_content("");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn test_clean_html_content_only_tags() {
        let html = "<div><span><p></p></span></div>";
        let cleaned = clean_html_content(html);
        assert_eq!(cleaned, "");
    }

    #[test]
    fn test_clean_html_content_nested_tags() {
        let html = "<div><p>Outer <span>Inner <b>Bold</b></span> Text</p></div>";
        let cleaned = clean_html_content(html);
        assert_eq!(cleaned, "Outer Inner Bold Text");
    }

    #[test]
    fn test_clean_html_content_multiple_scripts() {
        let html = r#"
            <script>function a() {}</script>
            <p>Content</p>
            <script>function b() {}</script>
        "#;
        let cleaned = clean_html_content(html);
        assert!(!cleaned.contains("function"));
        assert!(cleaned.contains("Content"));
    }

    #[test]
    fn test_is_image_mime_type_variants() {
        assert!(is_image_mime_type("image/gif"));
        assert!(is_image_mime_type("image/webp"));
        assert!(is_image_mime_type("image/svg+xml"));
        assert!(!is_image_mime_type("video/mp4"));
        assert!(!is_image_mime_type("audio/mp3"));
    }

    #[test]
    fn test_parse_content_type_with_parameters() {
        assert_eq!(parse_content_type("multipart/mixed; boundary=xyz"), "multipart/mixed");
        assert_eq!(parse_content_type("text/html; charset=UTF-8"), "text/html");
    }

    #[test]
    fn test_parse_content_type_whitespace() {
        assert_eq!(parse_content_type(" text/plain "), "text/plain");
        assert_eq!(parse_content_type(" text/plain ; charset=utf-8 "), "text/plain");
    }

    #[test]
    fn test_parse_content_type_case_insensitive() {
        assert_eq!(parse_content_type("TEXT/PLAIN"), "text/plain");
        assert_eq!(parse_content_type("Image/JPEG"), "image/jpeg");
    }

    #[test]
    fn test_extract_email_content_mime_variants() {
        let eml_content = b"From: test@example.com\r\n\r\nBody";

        assert!(extract_email_content(eml_content, "message/rfc822").is_ok());
        assert!(extract_email_content(eml_content, "text/plain").is_ok());
    }

    #[test]
    fn test_simple_eml_with_multiple_recipients() {
        let eml_content = b"From: sender@example.com\r\nTo: r1@example.com, r2@example.com\r\nCc: cc@example.com\r\nBcc: bcc@example.com\r\nSubject: Multi-recipient\r\n\r\nBody";

        let result = parse_eml_content(eml_content).unwrap();
        assert_eq!(result.to_emails.len(), 2);
        assert!(result.to_emails.contains(&"r1@example.com".to_string()));
        assert!(result.to_emails.contains(&"r2@example.com".to_string()));
    }

    #[test]
    fn test_simple_eml_with_html_body() {
        let eml_content = b"From: sender@example.com\r\nTo: recipient@example.com\r\nSubject: HTML Email\r\nContent-Type: text/html\r\n\r\n<html><body><p>HTML Body</p></body></html>";

        let result = parse_eml_content(eml_content).unwrap();
        assert!(!result.cleaned_text.is_empty());
    }

    #[test]
    fn test_build_email_text_output_with_all_fields() {
        let result = EmailExtractionResult {
            subject: Some("Complete Email".to_string()),
            from_email: Some("sender@example.com".to_string()),
            to_emails: vec!["recipient@example.com".to_string()],
            cc_emails: vec!["cc@example.com".to_string()],
            bcc_emails: vec!["bcc@example.com".to_string()],
            date: Some("2024-01-01T12:00:00Z".to_string()),
            message_id: Some("<msg123@example.com>".to_string()),
            plain_text: Some("Plain text body".to_string()),
            html_content: Some("<html><body>HTML body</body></html>".to_string()),
            cleaned_text: "Cleaned body text".to_string(),
            attachments: vec![],
            metadata: HashMap::new(),
        };

        let output = build_email_text_output(&result);
        assert!(output.contains("Subject: Complete Email"));
        assert!(output.contains("From: sender@example.com"));
        assert!(output.contains("To: recipient@example.com"));
        assert!(output.contains("CC: cc@example.com"));
        assert!(output.contains("BCC: bcc@example.com"));
        assert!(output.contains("Date: 2024-01-01T12:00:00Z"));
        assert!(output.contains("Cleaned body text"));
    }

    // An attachment with neither `name` nor `filename` contributes no label, so the
    // rendered text must not contain an attachment line at all.
    #[test]
    fn test_build_email_text_output_empty_attachments() {
        let result = minimal_result(
            "Body",
            vec![EmailAttachment {
                name: None,
                filename: None,
                mime_type: Some("application/octet-stream".to_string()),
                size: Some(100),
                is_image: false,
                data: None,
            }],
        );

        let output = build_email_text_output(&result);
        assert!(output.contains("Body"));
        assert!(!output.contains("Attachments:"));
    }

    #[test]
    fn test_build_metadata_empty_fields() {
        let metadata = build_metadata(&None, &None, &[], &[], &[], &None, &None, &[]);
        assert!(metadata.is_empty());
    }

    #[test]
    fn test_build_metadata_partial_fields() {
        let subject = Some("Test".to_string());
        let date = Some("2024-01-01".to_string());

        let metadata = build_metadata(&subject, &None, &[], &[], &[], &date, &None, &[]);

        assert_eq!(metadata.get("subject"), Some(&"Test".to_string()));
        assert_eq!(metadata.get("date"), Some(&"2024-01-01".to_string()));
        assert_eq!(metadata.len(), 2);
    }

    #[test]
    fn test_clean_html_content_case_insensitive_tags() {
        let html = "<SCRIPT>code</SCRIPT><STYLE>css</STYLE><P>Text</P>";
        let cleaned = clean_html_content(html);
        assert!(!cleaned.contains("code"));
        assert!(!cleaned.contains("css"));
        assert!(cleaned.contains("Text"));
    }

    #[test]
    fn test_simple_eml_with_date() {
        let eml_content = b"From: sender@example.com\r\nTo: recipient@example.com\r\nDate: Mon, 1 Jan 2024 12:00:00 +0000\r\nSubject: Test\r\n\r\nBody";

        let result = parse_eml_content(eml_content).unwrap();
        assert!(result.date.is_some());
    }

    #[test]
    fn test_simple_eml_with_message_id() {
        let eml_content = b"From: sender@example.com\r\nTo: recipient@example.com\r\nMessage-ID: <unique@example.com>\r\nSubject: Test\r\n\r\nBody";

        let result = parse_eml_content(eml_content).unwrap();
        assert!(result.message_id.is_some());
    }

    #[test]
    fn test_simple_eml_minimal() {
        let eml_content = b"From: sender@example.com\r\n\r\nMinimal body";

        let result = parse_eml_content(eml_content).unwrap();
        assert_eq!(result.from_email, Some("sender@example.com".to_string()));
        assert_eq!(result.cleaned_text, "Minimal body");
    }

    // Each accessor is called twice so both the first call and a repeat call are exercised.
    #[test]
    fn test_regex_initialization() {
        let _ = html_tag_regex();
        let _ = script_regex();
        let _ = style_regex();
        let _ = whitespace_regex();

        let _ = html_tag_regex();
        let _ = script_regex();
        let _ = style_regex();
        let _ = whitespace_regex();
    }
}