kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -41,6 +41,7 @@ impl PptxExtractor {
41
41
 
42
42
  let ocr_config = config.ocr.as_ref().unwrap();
43
43
  let tess_config = ocr_config.tesseract_config.as_ref().cloned().unwrap_or_default();
44
+ let output_format = config.output_format;
44
45
 
45
46
  for image in &mut images {
46
47
  let image_data = image.data.clone();
@@ -53,7 +54,7 @@ impl PptxExtractor {
53
54
 
54
55
  let proc = OcrProcessor::new(cache_dir)?;
55
56
  let ocr_tess_config: crate::ocr::types::TesseractConfig = (&tess_config_clone).into();
56
- proc.process_image(&image_data, &ocr_tess_config)
57
+ proc.process_image_with_format(&image_data, &ocr_tess_config, output_format)
57
58
  })
58
59
  .await
59
60
  .map_err(|e| crate::KreuzbergError::Ocr {
@@ -65,13 +66,15 @@ impl PptxExtractor {
65
66
  Ok(ocr_extraction) => {
66
67
  let extraction_result = ExtractionResult {
67
68
  content: ocr_extraction.content,
68
- mime_type: image.format.clone(),
69
+ mime_type: ocr_extraction.mime_type,
69
70
  metadata: Metadata::default(),
70
71
  tables: vec![],
71
72
  detected_languages: None,
72
73
  chunks: None,
73
74
  images: None,
75
+ djot_content: None,
74
76
  pages: None,
77
+ elements: None,
75
78
  };
76
79
  image.ocr_result = Some(Box::new(extraction_result));
77
80
  }
@@ -178,6 +181,8 @@ impl DocumentExtractor for PptxExtractor {
178
181
  detected_languages: None,
179
182
  chunks: None,
180
183
  images,
184
+ djot_content: None,
185
+ elements: None,
181
186
  })
182
187
  }
183
188
 
@@ -241,6 +246,8 @@ impl DocumentExtractor for PptxExtractor {
241
246
  detected_languages: None,
242
247
  chunks: None,
243
248
  images,
249
+ djot_content: None,
250
+ elements: None,
244
251
  })
245
252
  }
246
253
 
@@ -453,7 +453,9 @@ impl DocumentExtractor for RstExtractor {
453
453
  detected_languages: None,
454
454
  chunks: None,
455
455
  images: None,
456
+ djot_content: None,
456
457
  pages: None,
458
+ elements: None,
457
459
  })
458
460
  }
459
461
 
@@ -0,0 +1,116 @@
1
+ //! Character encoding utilities for RTF parsing.
2
+ //!
3
+ //! Provides hex byte parsing and Windows-1252 character mapping for the 0x80-0x9F range.
4
+
/// Convert a hex digit character to its numeric value.
///
/// Accepts the ASCII digits `0-9` and the letters `a-f` / `A-F`.
///
/// Returns `None` if the character is not a valid hex digit (this includes
/// non-ASCII digits and letters).
#[inline]
pub fn hex_digit_to_u8(c: char) -> Option<u8> {
    // `to_digit(16)` only ever yields 0..=15, so narrowing to `u8` is lossless.
    c.to_digit(16).map(|digit| digit as u8)
}
17
+
18
+ /// Parse a hex-encoded byte from two characters.
19
+ ///
20
+ /// Returns the decoded byte if both characters are valid hex digits.
21
+ #[inline]
22
+ pub fn parse_hex_byte(h1: char, h2: char) -> Option<u8> {
23
+ let high = hex_digit_to_u8(h1)?;
24
+ let low = hex_digit_to_u8(h2)?;
25
+ Some((high << 4) | low)
26
+ }
27
+
/// Decode a single byte, applying Windows-1252 rules to the 0x80-0x9F range.
///
/// Bytes in 0x80..=0x9F are looked up in a table of their Windows-1252
/// Unicode equivalents; the five slots mapped to `'?'` (0x81, 0x8D, 0x8F,
/// 0x90, 0x9D) are the ones this decoder has no character for. Every other
/// byte is converted to the `char` with the same code point.
#[inline]
pub fn decode_windows_1252(byte: u8) -> char {
    // Index 0 corresponds to byte 0x80, index 31 to byte 0x9F.
    const HIGH_RANGE: [char; 32] = [
        '\u{20AC}', // 0x80 Euro sign
        '?',        // 0x81
        '\u{201A}', // 0x82 Single low-9 quotation mark
        '\u{0192}', // 0x83 Latin small letter f with hook
        '\u{201E}', // 0x84 Double low-9 quotation mark
        '\u{2026}', // 0x85 Horizontal ellipsis
        '\u{2020}', // 0x86 Dagger
        '\u{2021}', // 0x87 Double dagger
        '\u{02C6}', // 0x88 Modifier letter circumflex accent
        '\u{2030}', // 0x89 Per mille sign
        '\u{0160}', // 0x8A Latin capital letter S with caron
        '\u{2039}', // 0x8B Single left-pointing angle quotation mark
        '\u{0152}', // 0x8C Latin capital ligature OE
        '?',        // 0x8D
        '\u{017D}', // 0x8E Latin capital letter Z with caron
        '?',        // 0x8F
        '?',        // 0x90
        '\u{2018}', // 0x91 Left single quotation mark
        '\u{2019}', // 0x92 Right single quotation mark
        '\u{201C}', // 0x93 Left double quotation mark
        '\u{201D}', // 0x94 Right double quotation mark
        '\u{2022}', // 0x95 Bullet
        '\u{2013}', // 0x96 En dash
        '\u{2014}', // 0x97 Em dash
        '\u{02DC}', // 0x98 Small tilde
        '\u{2122}', // 0x99 Trade mark sign
        '\u{0161}', // 0x9A Latin small letter s with caron
        '\u{203A}', // 0x9B Single right-pointing angle quotation mark
        '\u{0153}', // 0x9C Latin small ligature oe
        '?',        // 0x9D
        '\u{017E}', // 0x9E Latin small letter z with caron
        '\u{0178}', // 0x9F Latin capital letter Y with diaeresis
    ];

    // Bytes below 0x80 fail the subtraction; bytes above 0x9F fall off the
    // end of the table. Both cases use the direct byte-to-char conversion.
    let table_entry = byte
        .checked_sub(0x80)
        .and_then(|offset| HIGH_RANGE.get(usize::from(offset)));

    match table_entry {
        Some(&mapped) => mapped,
        None => char::from(byte),
    }
}
71
+
/// Parse an RTF control word and extract its value.
///
/// The iterator must be positioned just after the introducing backslash.
/// Consumes, in order: a run of ASCII letters (the control word), an optional
/// `-` sign, and a run of ASCII digits (the numeric parameter). Parsing stops
/// at the first character that fits none of these; that character is left in
/// the iterator for the caller.
///
/// Returns a tuple of `(control_word, optional_numeric_value)`:
/// - `control_word` is empty when the next character is not an ASCII letter.
/// - The numeric value is `None` when no digits follow the word. A lone `-`
///   is still consumed in that case.
/// - A parameter that does not fit in an `i32` yields `Some(0)`.
pub fn parse_rtf_control_word(chars: &mut std::iter::Peekable<std::str::Chars>) -> (String, Option<i32>) {
    // Control words consist of ASCII letters only. Using the Unicode-aware
    // `is_alphabetic` would glue a following non-ASCII letter onto the
    // keyword, making it unrecognisable and dropping that letter from the text.
    let mut word = String::new();
    while let Some(&c) = chars.peek() {
        if !c.is_ascii_alphabetic() {
            break;
        }
        word.push(c);
        chars.next();
    }

    // Optional sign of the numeric parameter.
    let is_negative = chars.peek() == Some(&'-');
    let mut number = String::new();
    if is_negative {
        chars.next();
        number.push('-');
    }

    // Digits of the numeric parameter.
    while let Some(&c) = chars.peek() {
        if !c.is_ascii_digit() {
            break;
        }
        number.push(c);
        chars.next();
    }

    // Parse sign and magnitude together so that `i32::MIN` round-trips;
    // negating a separately parsed magnitude would overflow for that value.
    let has_digits = number.len() > usize::from(is_negative);
    let num_value = if has_digits {
        Some(number.parse::<i32>().unwrap_or(0))
    } else {
        None
    };

    (word, num_value)
}
@@ -0,0 +1,24 @@
1
+ //! Text formatting utilities for RTF content.
2
+
/// Normalize whitespace in a string using a single-pass algorithm.
///
/// Every run of consecutive whitespace characters (as defined by
/// `char::is_whitespace`) becomes a single ASCII space, and leading and
/// trailing whitespace is removed. An empty or all-whitespace input yields
/// an empty string.
pub fn normalize_whitespace(s: &str) -> String {
    // The output can never be longer than the input, so one allocation suffices.
    let mut result = String::with_capacity(s.len());

    for word in s.split_whitespace() {
        // Separator goes before every word except the first, so nothing
        // needs trimming afterwards.
        if !result.is_empty() {
            result.push(' ');
        }
        result.push_str(word);
    }

    result
}
@@ -0,0 +1,72 @@
1
+ //! Image metadata extraction from RTF documents.
2
+
3
+ use crate::extractors::rtf::encoding::parse_rtf_control_word;
4
+
5
+ /// Extract image metadata from within a \pict group.
6
+ ///
7
+ /// Looks for image type (jpegblip, pngblip, etc.) and dimensions.
8
+ pub fn extract_image_metadata(chars: &mut std::iter::Peekable<std::str::Chars>) -> String {
9
+ let mut metadata = String::new();
10
+ let mut image_type: Option<&str> = None;
11
+ let mut width_goal: Option<i32> = None;
12
+ let mut height_goal: Option<i32> = None;
13
+ let mut depth = 0;
14
+
15
+ while let Some(&ch) = chars.peek() {
16
+ match ch {
17
+ '{' => {
18
+ depth += 1;
19
+ chars.next();
20
+ }
21
+ '}' => {
22
+ if depth == 0 {
23
+ break;
24
+ }
25
+ depth -= 1;
26
+ chars.next();
27
+ }
28
+ '\\' => {
29
+ chars.next();
30
+ let (control_word, value) = parse_rtf_control_word(chars);
31
+
32
+ match control_word.as_str() {
33
+ "jpegblip" => image_type = Some("jpg"),
34
+ "pngblip" => image_type = Some("png"),
35
+ "wmetafile" => image_type = Some("wmf"),
36
+ "dibitmap" => image_type = Some("bmp"),
37
+ "picwgoal" => width_goal = value,
38
+ "pichgoal" => height_goal = value,
39
+ "bin" => break,
40
+ _ => {}
41
+ }
42
+ }
43
+ ' ' => {
44
+ chars.next();
45
+ }
46
+ _ => {
47
+ chars.next();
48
+ }
49
+ }
50
+ }
51
+
52
+ if let Some(itype) = image_type {
53
+ metadata.push_str("image.");
54
+ metadata.push_str(itype);
55
+ }
56
+
57
+ if let Some(width) = width_goal {
58
+ let width_inches = f64::from(width) / 1440.0;
59
+ metadata.push_str(&format!(" width=\"{:.1}in\"", width_inches));
60
+ }
61
+
62
+ if let Some(height) = height_goal {
63
+ let height_inches = f64::from(height) / 1440.0;
64
+ metadata.push_str(&format!(" height=\"{:.1}in\"", height_inches));
65
+ }
66
+
67
+ if metadata.is_empty() {
68
+ metadata.push_str("image.jpg");
69
+ }
70
+
71
+ metadata
72
+ }
@@ -0,0 +1,216 @@
1
+ //! Metadata extraction from RTF documents.
2
+
3
+ use crate::extractors::rtf::encoding::parse_rtf_control_word;
4
+ use serde_json::Value;
5
+ use std::collections::HashMap;
6
+
7
+ /// Parse a `{\\creatim ...}` or `{\\revtim ...}` RTF info block into ISO 8601 format.
8
+ pub fn parse_rtf_datetime(segment: &str) -> Option<String> {
9
+ let mut year: Option<i32> = None;
10
+ let mut month: Option<i32> = None;
11
+ let mut day: Option<i32> = None;
12
+ let mut hour: Option<i32> = None;
13
+ let mut minute: Option<i32> = None;
14
+
15
+ let mut chars = segment.chars().peekable();
16
+ while let Some(&ch) = chars.peek() {
17
+ if ch != '\\' {
18
+ chars.next();
19
+ continue;
20
+ }
21
+ chars.next();
22
+ let (word, value) = parse_rtf_control_word(&mut chars);
23
+ if let Some(v) = value {
24
+ match word.as_str() {
25
+ "yr" => year = Some(v),
26
+ "mo" => month = Some(v),
27
+ "dy" => day = Some(v),
28
+ "hr" => hour = Some(v),
29
+ "min" => minute = Some(v),
30
+ _ => {}
31
+ }
32
+ }
33
+ }
34
+
35
+ let year = year?;
36
+ let month = month.unwrap_or(1).max(1) as u32;
37
+ let day = day.unwrap_or(1).max(1) as u32;
38
+ let hour = hour.unwrap_or(0).max(0) as u32;
39
+ let minute = minute.unwrap_or(0).max(0) as u32;
40
+
41
+ Some(format!(
42
+ "{:04}-{:02}-{:02}T{:02}:{:02}:00Z",
43
+ year, month, day, hour, minute
44
+ ))
45
+ }
46
+
47
+ /// Extract metadata from the RTF `\\info` block and augment with computed statistics.
48
+ pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<String, Value> {
49
+ let mut metadata: HashMap<String, Value> = HashMap::new();
50
+
51
+ if let Some(start) = rtf_content.find("{\\info") {
52
+ let slice = &rtf_content[start..];
53
+ let mut depth = 0usize;
54
+ let mut end_offset: Option<usize> = None;
55
+
56
+ for (idx, ch) in slice.char_indices() {
57
+ match ch {
58
+ '{' => depth += 1,
59
+ '}' => {
60
+ if depth == 0 {
61
+ break;
62
+ }
63
+ depth -= 1;
64
+ if depth == 0 {
65
+ end_offset = Some(idx + 1);
66
+ break;
67
+ }
68
+ }
69
+ _ => {}
70
+ }
71
+ }
72
+
73
+ let info_block = end_offset.map(|end| &slice[..end]).unwrap_or(slice);
74
+
75
+ let mut segments: Vec<String> = Vec::new();
76
+ let mut seg_depth = 0usize;
77
+ let mut current = String::new();
78
+ let mut in_segment = false;
79
+
80
+ for ch in info_block.chars() {
81
+ if ch == '{' {
82
+ seg_depth += 1;
83
+ if seg_depth == 2 {
84
+ in_segment = true;
85
+ current.clear();
86
+ continue;
87
+ }
88
+ } else if ch == '}' {
89
+ if seg_depth == 2 && in_segment {
90
+ segments.push(current.clone());
91
+ in_segment = false;
92
+ }
93
+ seg_depth = seg_depth.saturating_sub(1);
94
+ continue;
95
+ }
96
+
97
+ if in_segment {
98
+ current.push(ch);
99
+ }
100
+ }
101
+
102
+ for segment in segments {
103
+ if !segment.starts_with('\\') {
104
+ continue;
105
+ }
106
+
107
+ let cleaned_segment = if segment.starts_with("\\*\\") {
108
+ segment.replacen("\\*\\", "\\", 1)
109
+ } else {
110
+ segment.clone()
111
+ };
112
+
113
+ let mut chars = cleaned_segment.chars().peekable();
114
+ chars.next();
115
+ let (keyword, numeric) = parse_rtf_control_word(&mut chars);
116
+ let remaining: String = chars.collect();
117
+ let trimmed = remaining.trim();
118
+
119
+ match keyword.as_str() {
120
+ "author" => {
121
+ if !trimmed.is_empty() {
122
+ let author = trimmed.to_string();
123
+ metadata.insert("created_by".to_string(), Value::String(author.clone()));
124
+ metadata.insert("authors".to_string(), Value::Array(vec![Value::String(author)]));
125
+ }
126
+ }
127
+ "operator" => {
128
+ if !trimmed.is_empty() {
129
+ metadata.insert("modified_by".to_string(), Value::String(trimmed.to_string()));
130
+ }
131
+ }
132
+ "title" => {
133
+ if !trimmed.is_empty() {
134
+ metadata.insert("title".to_string(), Value::String(trimmed.to_string()));
135
+ }
136
+ }
137
+ "subject" => {
138
+ if !trimmed.is_empty() {
139
+ metadata.insert("subject".to_string(), Value::String(trimmed.to_string()));
140
+ }
141
+ }
142
+ "generator" => {
143
+ if !trimmed.is_empty() {
144
+ metadata.insert("generator".to_string(), Value::String(trimmed.to_string()));
145
+ }
146
+ }
147
+ "creatim" => {
148
+ if let Some(dt) = parse_rtf_datetime(trimmed) {
149
+ metadata.insert("created_at".to_string(), Value::String(dt));
150
+ }
151
+ }
152
+ "revtim" => {
153
+ if let Some(dt) = parse_rtf_datetime(trimmed) {
154
+ metadata.insert("modified_at".to_string(), Value::String(dt));
155
+ }
156
+ }
157
+ "version" => {
158
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
159
+ metadata.insert("revision".to_string(), Value::String(val.to_string()));
160
+ }
161
+ }
162
+ "nofpages" => {
163
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
164
+ metadata.insert("page_count".to_string(), Value::Number(val.into()));
165
+ }
166
+ }
167
+ "nofwords" => {
168
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
169
+ metadata.insert("word_count".to_string(), Value::Number(val.into()));
170
+ }
171
+ }
172
+ "nofchars" => {
173
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
174
+ metadata.insert("character_count".to_string(), Value::Number(val.into()));
175
+ }
176
+ }
177
+ "lines" => {
178
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
179
+ metadata.insert("line_count".to_string(), Value::Number(val.into()));
180
+ }
181
+ }
182
+ "paragraphs" => {
183
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
184
+ metadata.insert("paragraph_count".to_string(), Value::Number(val.into()));
185
+ }
186
+ }
187
+ _ => {}
188
+ }
189
+ }
190
+ }
191
+
192
+ let cleaned_text = extracted_text.trim();
193
+ if !cleaned_text.is_empty() {
194
+ let word_count = cleaned_text.split_whitespace().count() as i64;
195
+ metadata
196
+ .entry("word_count".to_string())
197
+ .or_insert(Value::Number(word_count.into()));
198
+
199
+ let character_count = cleaned_text.chars().count() as i64;
200
+ metadata
201
+ .entry("character_count".to_string())
202
+ .or_insert(Value::Number(character_count.into()));
203
+
204
+ let line_count = cleaned_text.lines().count() as i64;
205
+ metadata
206
+ .entry("line_count".to_string())
207
+ .or_insert(Value::Number(line_count.into()));
208
+
209
+ let paragraph_count = cleaned_text.split("\n\n").filter(|p| !p.trim().is_empty()).count() as i64;
210
+ metadata
211
+ .entry("paragraph_count".to_string())
212
+ .or_insert(Value::Number(paragraph_count.into()));
213
+ }
214
+
215
+ metadata
216
+ }
@@ -0,0 +1,142 @@
1
+ //! RTF (Rich Text Format) extractor.
2
+ //!
3
+ //! Supports: Rich Text Format (.rtf)
4
+ //!
5
+ //! This native Rust extractor provides text extraction from RTF documents with:
6
+ //! - Character encoding support (Windows-1252 for 0x80-0x9F range)
7
+ //! - Common RTF control words (paragraph breaks, tabs, bullets, quotes, dashes)
8
+ //! - Unicode escape sequences
9
+ //! - Image metadata extraction
10
+ //! - Whitespace normalization
11
+
12
+ mod encoding;
13
+ mod formatting;
14
+ mod images;
15
+ mod metadata;
16
+ mod parser;
17
+ mod tables;
18
+
19
+ // Re-export public functions for backward compatibility
20
+ pub use encoding::{hex_digit_to_u8, parse_hex_byte, parse_rtf_control_word};
21
+ pub use formatting::normalize_whitespace;
22
+ pub use images::extract_image_metadata;
23
+ pub use metadata::{extract_rtf_metadata, parse_rtf_datetime};
24
+ pub use parser::extract_text_from_rtf;
25
+
26
+ use crate::Result;
27
+ use crate::core::config::ExtractionConfig;
28
+ use crate::plugins::{DocumentExtractor, Plugin};
29
+ use crate::types::{ExtractionResult, Metadata};
30
+ use async_trait::async_trait;
31
+
32
/// Native Rust RTF extractor.
///
/// Extracts text content, metadata, and structure from RTF documents.
///
/// The type is a stateless unit struct, so it is free to copy, compare, and
/// construct in `const` contexts. `Default` is derived and yields the same
/// value as [`RtfExtractor::new`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct RtfExtractor;

impl RtfExtractor {
    /// Create a new RTF extractor.
    ///
    /// Equivalent to `RtfExtractor::default()`; provided as an explicit,
    /// `const`-capable constructor.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
49
+
50
+ impl Plugin for RtfExtractor {
51
+ fn name(&self) -> &str {
52
+ "rtf-extractor"
53
+ }
54
+
55
+ fn version(&self) -> String {
56
+ env!("CARGO_PKG_VERSION").to_string()
57
+ }
58
+
59
+ fn initialize(&self) -> Result<()> {
60
+ Ok(())
61
+ }
62
+
63
+ fn shutdown(&self) -> Result<()> {
64
+ Ok(())
65
+ }
66
+
67
+ fn description(&self) -> &str {
68
+ "Extracts content from RTF (Rich Text Format) files with native Rust parsing"
69
+ }
70
+
71
+ fn author(&self) -> &str {
72
+ "Kreuzberg Team"
73
+ }
74
+ }
75
+
76
+ #[async_trait]
77
+ impl DocumentExtractor for RtfExtractor {
78
+ #[cfg_attr(feature = "otel", tracing::instrument(
79
+ skip(self, content, _config),
80
+ fields(
81
+ extractor.name = self.name(),
82
+ content.size_bytes = content.len(),
83
+ )
84
+ ))]
85
+ async fn extract_bytes(
86
+ &self,
87
+ content: &[u8],
88
+ mime_type: &str,
89
+ _config: &ExtractionConfig,
90
+ ) -> Result<ExtractionResult> {
91
+ let rtf_content = String::from_utf8_lossy(content);
92
+
93
+ let (extracted_text, tables) = extract_text_from_rtf(&rtf_content);
94
+ let metadata_map = extract_rtf_metadata(&rtf_content, &extracted_text);
95
+
96
+ Ok(ExtractionResult {
97
+ content: extracted_text,
98
+ mime_type: mime_type.to_string(),
99
+ metadata: Metadata {
100
+ additional: metadata_map,
101
+ ..Default::default()
102
+ },
103
+ pages: None,
104
+ tables,
105
+ detected_languages: None,
106
+ chunks: None,
107
+ images: None,
108
+ djot_content: None,
109
+ elements: None,
110
+ })
111
+ }
112
+
113
+ fn supported_mime_types(&self) -> &[&str] {
114
+ &["application/rtf", "text/rtf"]
115
+ }
116
+
117
+ fn priority(&self) -> i32 {
118
+ 50
119
+ }
120
+ }
121
+
122
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the static plugin metadata exposed by the extractor.
    ///
    /// Nothing here awaits, so a plain synchronous test is sufficient and no
    /// async runtime needs to be started.
    #[test]
    fn test_rtf_extractor_plugin_interface() {
        let extractor = RtfExtractor::new();
        assert_eq!(extractor.name(), "rtf-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert!(extractor.supported_mime_types().contains(&"application/rtf"));
        assert_eq!(extractor.priority(), 50);
    }

    /// Smoke test for the text extraction entry point on a minimal document.
    ///
    /// Passes if either word of the body text survives extraction.
    #[test]
    fn test_simple_rtf_extraction() {
        let rtf_content = r#"{\rtf1 Hello World}"#;
        let (extracted, _) = extract_text_from_rtf(rtf_content);
        assert!(extracted.contains("Hello") || extracted.contains("World"));
    }
}