kreuzberg 4.0.8 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (312) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +99 -2
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/spec/fixtures/config.toml +1 -1
  28. data/spec/fixtures/config.yaml +1 -1
  29. data/vendor/Cargo.toml +3 -3
  30. data/vendor/kreuzberg/Cargo.toml +5 -4
  31. data/vendor/kreuzberg/README.md +1 -1
  32. data/vendor/kreuzberg/src/api/config.rs +69 -0
  33. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  34. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  35. data/vendor/kreuzberg/src/api/router.rs +214 -0
  36. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  37. data/vendor/kreuzberg/src/api/types.rs +78 -0
  38. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  39. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  40. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  41. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  42. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  43. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  44. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  45. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  46. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  47. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  48. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  52. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  53. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  54. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  55. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  56. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  57. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  58. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  59. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  60. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  61. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  62. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  63. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  64. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  65. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  66. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  67. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  68. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  69. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  70. data/vendor/kreuzberg/src/core/mime.rs +15 -0
  71. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  72. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  73. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  74. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  75. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  76. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  77. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  78. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  79. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  80. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  81. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  83. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  84. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  85. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  86. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  87. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  88. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  89. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  90. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  91. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  92. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  93. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  94. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  95. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  96. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  97. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  98. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  99. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  103. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  104. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  105. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
  106. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  107. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  108. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  109. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  110. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  111. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  122. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  123. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  124. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  125. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  126. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  127. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  128. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  129. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  130. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  131. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  132. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  133. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  134. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  135. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  136. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  137. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  138. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  139. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  140. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  141. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  142. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  143. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  144. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  145. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  146. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  147. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  149. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  150. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  151. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  152. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  153. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  154. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  156. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  157. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  158. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  159. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  160. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  165. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  166. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  167. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  168. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  169. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  170. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  171. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  172. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  173. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  174. data/vendor/kreuzberg/src/lib.rs +2 -2
  175. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  176. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  177. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  178. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  179. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  180. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  181. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  182. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  183. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  184. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  185. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  186. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  187. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  188. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  189. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  190. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  191. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  192. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  193. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  194. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  195. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  196. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  197. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  198. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  199. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  200. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  201. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  202. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  203. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  204. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  205. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  206. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  207. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  208. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  209. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  210. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  211. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  212. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  213. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  214. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  215. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  216. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  220. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  221. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  222. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  223. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  224. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  225. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  233. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  234. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  235. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  236. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  237. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  238. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  239. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  240. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  241. data/vendor/kreuzberg/src/types/page.rs +182 -0
  242. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  243. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  244. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  245. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  246. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  247. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  250. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  251. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  252. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  253. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  254. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  255. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  256. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  257. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  258. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  260. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  261. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  262. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  263. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  264. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  265. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  266. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  267. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  268. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  269. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  270. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
  271. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  272. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  273. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  274. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  275. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  276. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  277. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  278. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  279. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  280. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  281. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  282. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  283. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  284. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  285. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  286. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  288. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  289. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  290. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  291. metadata +201 -28
  292. data/vendor/kreuzberg/src/api/server.rs +0 -518
  293. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  294. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  295. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  296. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  297. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  298. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  299. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  300. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  301. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  302. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  303. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  304. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  305. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  306. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  307. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  308. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  309. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  310. data/vendor/kreuzberg/src/types.rs +0 -1713
  311. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  312. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -41,6 +41,7 @@ impl PptxExtractor {
41
41
 
42
42
  let ocr_config = config.ocr.as_ref().unwrap();
43
43
  let tess_config = ocr_config.tesseract_config.as_ref().cloned().unwrap_or_default();
44
+ let output_format = config.output_format;
44
45
 
45
46
  for image in &mut images {
46
47
  let image_data = image.data.clone();
@@ -53,7 +54,7 @@ impl PptxExtractor {
53
54
 
54
55
  let proc = OcrProcessor::new(cache_dir)?;
55
56
  let ocr_tess_config: crate::ocr::types::TesseractConfig = (&tess_config_clone).into();
56
- proc.process_image(&image_data, &ocr_tess_config)
57
+ proc.process_image_with_format(&image_data, &ocr_tess_config, output_format)
57
58
  })
58
59
  .await
59
60
  .map_err(|e| crate::KreuzbergError::Ocr {
@@ -65,13 +66,15 @@ impl PptxExtractor {
65
66
  Ok(ocr_extraction) => {
66
67
  let extraction_result = ExtractionResult {
67
68
  content: ocr_extraction.content,
68
- mime_type: image.format.clone(),
69
+ mime_type: ocr_extraction.mime_type,
69
70
  metadata: Metadata::default(),
70
71
  tables: vec![],
71
72
  detected_languages: None,
72
73
  chunks: None,
73
74
  images: None,
75
+ djot_content: None,
74
76
  pages: None,
77
+ elements: None,
75
78
  };
76
79
  image.ocr_result = Some(Box::new(extraction_result));
77
80
  }
@@ -178,6 +181,8 @@ impl DocumentExtractor for PptxExtractor {
178
181
  detected_languages: None,
179
182
  chunks: None,
180
183
  images,
184
+ djot_content: None,
185
+ elements: None,
181
186
  })
182
187
  }
183
188
 
@@ -241,6 +246,8 @@ impl DocumentExtractor for PptxExtractor {
241
246
  detected_languages: None,
242
247
  chunks: None,
243
248
  images,
249
+ djot_content: None,
250
+ elements: None,
244
251
  })
245
252
  }
246
253
 
@@ -453,7 +453,9 @@ impl DocumentExtractor for RstExtractor {
453
453
  detected_languages: None,
454
454
  chunks: None,
455
455
  images: None,
456
+ djot_content: None,
456
457
  pages: None,
458
+ elements: None,
457
459
  })
458
460
  }
459
461
 
@@ -0,0 +1,116 @@
1
+ //! Character encoding utilities for RTF parsing.
2
+ //!
3
+ //! Provides hex byte parsing and Windows-1252 character mapping for the 0x80-0x9F range.
4
+
5
/// Translate a single hexadecimal digit into the number it denotes.
///
/// Accepts `0-9`, `a-f` and `A-F`; every other character yields `None`.
#[inline]
pub fn hex_digit_to_u8(c: char) -> Option<u8> {
    // `to_digit(16)` recognises exactly the ASCII hex digits and never returns
    // more than 15, so narrowing the result to `u8` is lossless.
    c.to_digit(16).map(|digit| digit as u8)
}
17
+
18
+ /// Parse a hex-encoded byte from two characters.
19
+ ///
20
+ /// Returns the decoded byte if both characters are valid hex digits.
21
+ #[inline]
22
+ pub fn parse_hex_byte(h1: char, h2: char) -> Option<u8> {
23
+ let high = hex_digit_to_u8(h1)?;
24
+ let low = hex_digit_to_u8(h2)?;
25
+ Some((high << 4) | low)
26
+ }
27
+
28
/// Map a single byte to a character, applying Windows-1252 for 0x80-0x9F.
///
/// Bytes in the 0x80-0x9F window are looked up in a table of their
/// Windows-1252 code points; the five slots that the table leaves unassigned
/// (0x81, 0x8D, 0x8F, 0x90, 0x9D) come back as `'?'`. Every byte outside that
/// window is converted to the character with the same numeric value.
#[inline]
pub fn decode_windows_1252(byte: u8) -> char {
    /// Replacement characters for bytes 0x80 through 0x9F, indexed by `byte - 0x80`.
    const HIGH_RANGE: [char; 32] = [
        '\u{20AC}', // 0x80 Euro sign
        '?',        // 0x81
        '\u{201A}', // 0x82 Single low-9 quotation mark
        '\u{0192}', // 0x83 Latin small letter f with hook
        '\u{201E}', // 0x84 Double low-9 quotation mark
        '\u{2026}', // 0x85 Horizontal ellipsis
        '\u{2020}', // 0x86 Dagger
        '\u{2021}', // 0x87 Double dagger
        '\u{02C6}', // 0x88 Modifier letter circumflex accent
        '\u{2030}', // 0x89 Per mille sign
        '\u{0160}', // 0x8A Latin capital letter S with caron
        '\u{2039}', // 0x8B Single left-pointing angle quotation mark
        '\u{0152}', // 0x8C Latin capital ligature OE
        '?',        // 0x8D
        '\u{017D}', // 0x8E Latin capital letter Z with caron
        '?',        // 0x8F
        '?',        // 0x90
        '\u{2018}', // 0x91 Left single quotation mark
        '\u{2019}', // 0x92 Right single quotation mark
        '\u{201C}', // 0x93 Left double quotation mark
        '\u{201D}', // 0x94 Right double quotation mark
        '\u{2022}', // 0x95 Bullet
        '\u{2013}', // 0x96 En dash
        '\u{2014}', // 0x97 Em dash
        '\u{02DC}', // 0x98 Small tilde
        '\u{2122}', // 0x99 Trade mark sign
        '\u{0161}', // 0x9A Latin small letter s with caron
        '\u{203A}', // 0x9B Single right-pointing angle quotation mark
        '\u{0153}', // 0x9C Latin small ligature oe
        '?',        // 0x9D
        '\u{017E}', // 0x9E Latin small letter z with caron
        '\u{0178}', // 0x9F Latin capital letter Y with diaeresis
    ];

    // Bytes below 0x80 fail the subtraction and bytes above 0x9F fall off the
    // end of the table; both cases take the direct byte-to-char conversion.
    byte.checked_sub(0x80)
        .and_then(|offset| HIGH_RANGE.get(usize::from(offset)))
        .copied()
        .unwrap_or(char::from(byte))
}
71
+
72
/// Parse an RTF control word and its optional numeric parameter.
///
/// The iterator must be positioned just after the introducing backslash. The
/// function consumes, in order:
///
/// 1. a run of ASCII letters (the control word itself),
/// 2. an optional `-` sign,
/// 3. a run of ASCII digits (the parameter).
///
/// Only ASCII letters are accepted for the keyword, so a non-ASCII letter that
/// directly follows a control word is left in the stream as document text
/// instead of being absorbed into the keyword. Anything after the
/// digits — including a delimiting space — is left unconsumed for the caller.
///
/// Returns `(control_word, parameter)`. The parameter is `None` when no digits
/// were present (a lone `-` is still consumed in that case). A digit run that
/// does not fit in an `i32` is reported as `0`.
pub fn parse_rtf_control_word(chars: &mut std::iter::Peekable<std::str::Chars>) -> (String, Option<i32>) {
    // Keyword: ASCII letters only.
    let mut word = String::new();
    while let Some(letter) = chars.next_if(|c| c.is_ascii_alphabetic()) {
        word.push(letter);
    }

    // Optional sign in front of the numeric parameter.
    let is_negative = chars.next_if_eq(&'-').is_some();

    // Numeric parameter.
    let mut digits = String::new();
    while let Some(digit) = chars.next_if(|c| c.is_ascii_digit()) {
        digits.push(digit);
    }

    let value = if digits.is_empty() {
        None
    } else {
        // The parse can only fail on overflow; keep the lenient fallback of 0.
        let magnitude = digits.parse::<i32>().unwrap_or(0);
        Some(if is_negative { -magnitude } else { magnitude })
    };

    (word, value)
}
@@ -0,0 +1,24 @@
1
+ //! Text formatting utilities for RTF content.
2
+
3
/// Squeeze every run of whitespace down to one space and trim both ends.
///
/// The input is cut into its whitespace-separated tokens, which are then
/// re-joined with exactly one ASCII space between neighbours. Input that is
/// empty or consists only of whitespace produces an empty string.
pub fn normalize_whitespace(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());

    for token in s.split_whitespace() {
        // Separator goes before every token except the first.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(token);
    }

    normalized
}
@@ -0,0 +1,72 @@
1
+ //! Image metadata extraction from RTF documents.
2
+
3
+ use crate::extractors::rtf::encoding::parse_rtf_control_word;
4
+
5
+ /// Extract image metadata from within a \pict group.
6
+ ///
7
+ /// Looks for image type (jpegblip, pngblip, etc.) and dimensions.
8
+ pub fn extract_image_metadata(chars: &mut std::iter::Peekable<std::str::Chars>) -> String {
9
+ let mut metadata = String::new();
10
+ let mut image_type: Option<&str> = None;
11
+ let mut width_goal: Option<i32> = None;
12
+ let mut height_goal: Option<i32> = None;
13
+ let mut depth = 0;
14
+
15
+ while let Some(&ch) = chars.peek() {
16
+ match ch {
17
+ '{' => {
18
+ depth += 1;
19
+ chars.next();
20
+ }
21
+ '}' => {
22
+ if depth == 0 {
23
+ break;
24
+ }
25
+ depth -= 1;
26
+ chars.next();
27
+ }
28
+ '\\' => {
29
+ chars.next();
30
+ let (control_word, value) = parse_rtf_control_word(chars);
31
+
32
+ match control_word.as_str() {
33
+ "jpegblip" => image_type = Some("jpg"),
34
+ "pngblip" => image_type = Some("png"),
35
+ "wmetafile" => image_type = Some("wmf"),
36
+ "dibitmap" => image_type = Some("bmp"),
37
+ "picwgoal" => width_goal = value,
38
+ "pichgoal" => height_goal = value,
39
+ "bin" => break,
40
+ _ => {}
41
+ }
42
+ }
43
+ ' ' => {
44
+ chars.next();
45
+ }
46
+ _ => {
47
+ chars.next();
48
+ }
49
+ }
50
+ }
51
+
52
+ if let Some(itype) = image_type {
53
+ metadata.push_str("image.");
54
+ metadata.push_str(itype);
55
+ }
56
+
57
+ if let Some(width) = width_goal {
58
+ let width_inches = f64::from(width) / 1440.0;
59
+ metadata.push_str(&format!(" width=\"{:.1}in\"", width_inches));
60
+ }
61
+
62
+ if let Some(height) = height_goal {
63
+ let height_inches = f64::from(height) / 1440.0;
64
+ metadata.push_str(&format!(" height=\"{:.1}in\"", height_inches));
65
+ }
66
+
67
+ if metadata.is_empty() {
68
+ metadata.push_str("image.jpg");
69
+ }
70
+
71
+ metadata
72
+ }
@@ -0,0 +1,216 @@
1
+ //! Metadata extraction from RTF documents.
2
+
3
+ use crate::extractors::rtf::encoding::parse_rtf_control_word;
4
+ use serde_json::Value;
5
+ use std::collections::HashMap;
6
+
7
+ /// Parse a `{\\creatim ...}` or `{\\revtim ...}` RTF info block into ISO 8601 format.
8
+ pub fn parse_rtf_datetime(segment: &str) -> Option<String> {
9
+ let mut year: Option<i32> = None;
10
+ let mut month: Option<i32> = None;
11
+ let mut day: Option<i32> = None;
12
+ let mut hour: Option<i32> = None;
13
+ let mut minute: Option<i32> = None;
14
+
15
+ let mut chars = segment.chars().peekable();
16
+ while let Some(&ch) = chars.peek() {
17
+ if ch != '\\' {
18
+ chars.next();
19
+ continue;
20
+ }
21
+ chars.next();
22
+ let (word, value) = parse_rtf_control_word(&mut chars);
23
+ if let Some(v) = value {
24
+ match word.as_str() {
25
+ "yr" => year = Some(v),
26
+ "mo" => month = Some(v),
27
+ "dy" => day = Some(v),
28
+ "hr" => hour = Some(v),
29
+ "min" => minute = Some(v),
30
+ _ => {}
31
+ }
32
+ }
33
+ }
34
+
35
+ let year = year?;
36
+ let month = month.unwrap_or(1).max(1) as u32;
37
+ let day = day.unwrap_or(1).max(1) as u32;
38
+ let hour = hour.unwrap_or(0).max(0) as u32;
39
+ let minute = minute.unwrap_or(0).max(0) as u32;
40
+
41
+ Some(format!(
42
+ "{:04}-{:02}-{:02}T{:02}:{:02}:00Z",
43
+ year, month, day, hour, minute
44
+ ))
45
+ }
46
+
47
+ /// Extract metadata from the RTF `\\info` block and augment with computed statistics.
48
+ pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<String, Value> {
49
+ let mut metadata: HashMap<String, Value> = HashMap::new();
50
+
51
+ if let Some(start) = rtf_content.find("{\\info") {
52
+ let slice = &rtf_content[start..];
53
+ let mut depth = 0usize;
54
+ let mut end_offset: Option<usize> = None;
55
+
56
+ for (idx, ch) in slice.char_indices() {
57
+ match ch {
58
+ '{' => depth += 1,
59
+ '}' => {
60
+ if depth == 0 {
61
+ break;
62
+ }
63
+ depth -= 1;
64
+ if depth == 0 {
65
+ end_offset = Some(idx + 1);
66
+ break;
67
+ }
68
+ }
69
+ _ => {}
70
+ }
71
+ }
72
+
73
+ let info_block = end_offset.map(|end| &slice[..end]).unwrap_or(slice);
74
+
75
+ let mut segments: Vec<String> = Vec::new();
76
+ let mut seg_depth = 0usize;
77
+ let mut current = String::new();
78
+ let mut in_segment = false;
79
+
80
+ for ch in info_block.chars() {
81
+ if ch == '{' {
82
+ seg_depth += 1;
83
+ if seg_depth == 2 {
84
+ in_segment = true;
85
+ current.clear();
86
+ continue;
87
+ }
88
+ } else if ch == '}' {
89
+ if seg_depth == 2 && in_segment {
90
+ segments.push(current.clone());
91
+ in_segment = false;
92
+ }
93
+ seg_depth = seg_depth.saturating_sub(1);
94
+ continue;
95
+ }
96
+
97
+ if in_segment {
98
+ current.push(ch);
99
+ }
100
+ }
101
+
102
+ for segment in segments {
103
+ if !segment.starts_with('\\') {
104
+ continue;
105
+ }
106
+
107
+ let cleaned_segment = if segment.starts_with("\\*\\") {
108
+ segment.replacen("\\*\\", "\\", 1)
109
+ } else {
110
+ segment.clone()
111
+ };
112
+
113
+ let mut chars = cleaned_segment.chars().peekable();
114
+ chars.next();
115
+ let (keyword, numeric) = parse_rtf_control_word(&mut chars);
116
+ let remaining: String = chars.collect();
117
+ let trimmed = remaining.trim();
118
+
119
+ match keyword.as_str() {
120
+ "author" => {
121
+ if !trimmed.is_empty() {
122
+ let author = trimmed.to_string();
123
+ metadata.insert("created_by".to_string(), Value::String(author.clone()));
124
+ metadata.insert("authors".to_string(), Value::Array(vec![Value::String(author)]));
125
+ }
126
+ }
127
+ "operator" => {
128
+ if !trimmed.is_empty() {
129
+ metadata.insert("modified_by".to_string(), Value::String(trimmed.to_string()));
130
+ }
131
+ }
132
+ "title" => {
133
+ if !trimmed.is_empty() {
134
+ metadata.insert("title".to_string(), Value::String(trimmed.to_string()));
135
+ }
136
+ }
137
+ "subject" => {
138
+ if !trimmed.is_empty() {
139
+ metadata.insert("subject".to_string(), Value::String(trimmed.to_string()));
140
+ }
141
+ }
142
+ "generator" => {
143
+ if !trimmed.is_empty() {
144
+ metadata.insert("generator".to_string(), Value::String(trimmed.to_string()));
145
+ }
146
+ }
147
+ "creatim" => {
148
+ if let Some(dt) = parse_rtf_datetime(trimmed) {
149
+ metadata.insert("created_at".to_string(), Value::String(dt));
150
+ }
151
+ }
152
+ "revtim" => {
153
+ if let Some(dt) = parse_rtf_datetime(trimmed) {
154
+ metadata.insert("modified_at".to_string(), Value::String(dt));
155
+ }
156
+ }
157
+ "version" => {
158
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
159
+ metadata.insert("revision".to_string(), Value::String(val.to_string()));
160
+ }
161
+ }
162
+ "nofpages" => {
163
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
164
+ metadata.insert("page_count".to_string(), Value::Number(val.into()));
165
+ }
166
+ }
167
+ "nofwords" => {
168
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
169
+ metadata.insert("word_count".to_string(), Value::Number(val.into()));
170
+ }
171
+ }
172
+ "nofchars" => {
173
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
174
+ metadata.insert("character_count".to_string(), Value::Number(val.into()));
175
+ }
176
+ }
177
+ "lines" => {
178
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
179
+ metadata.insert("line_count".to_string(), Value::Number(val.into()));
180
+ }
181
+ }
182
+ "paragraphs" => {
183
+ if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
184
+ metadata.insert("paragraph_count".to_string(), Value::Number(val.into()));
185
+ }
186
+ }
187
+ _ => {}
188
+ }
189
+ }
190
+ }
191
+
192
+ let cleaned_text = extracted_text.trim();
193
+ if !cleaned_text.is_empty() {
194
+ let word_count = cleaned_text.split_whitespace().count() as i64;
195
+ metadata
196
+ .entry("word_count".to_string())
197
+ .or_insert(Value::Number(word_count.into()));
198
+
199
+ let character_count = cleaned_text.chars().count() as i64;
200
+ metadata
201
+ .entry("character_count".to_string())
202
+ .or_insert(Value::Number(character_count.into()));
203
+
204
+ let line_count = cleaned_text.lines().count() as i64;
205
+ metadata
206
+ .entry("line_count".to_string())
207
+ .or_insert(Value::Number(line_count.into()));
208
+
209
+ let paragraph_count = cleaned_text.split("\n\n").filter(|p| !p.trim().is_empty()).count() as i64;
210
+ metadata
211
+ .entry("paragraph_count".to_string())
212
+ .or_insert(Value::Number(paragraph_count.into()));
213
+ }
214
+
215
+ metadata
216
+ }
@@ -0,0 +1,142 @@
1
+ //! RTF (Rich Text Format) extractor.
2
+ //!
3
+ //! Supports: Rich Text Format (.rtf)
4
+ //!
5
+ //! This native Rust extractor provides text extraction from RTF documents with:
6
+ //! - Character encoding support (Windows-1252 for 0x80-0x9F range)
7
+ //! - Common RTF control words (paragraph breaks, tabs, bullets, quotes, dashes)
8
+ //! - Unicode escape sequences
9
+ //! - Image metadata extraction
10
+ //! - Whitespace normalization
11
+
12
+ mod encoding;
13
+ mod formatting;
14
+ mod images;
15
+ mod metadata;
16
+ mod parser;
17
+ mod tables;
18
+
19
+ // Re-export public functions for backward compatibility
20
+ pub use encoding::{hex_digit_to_u8, parse_hex_byte, parse_rtf_control_word};
21
+ pub use formatting::normalize_whitespace;
22
+ pub use images::extract_image_metadata;
23
+ pub use metadata::{extract_rtf_metadata, parse_rtf_datetime};
24
+ pub use parser::extract_text_from_rtf;
25
+
26
+ use crate::Result;
27
+ use crate::core::config::ExtractionConfig;
28
+ use crate::plugins::{DocumentExtractor, Plugin};
29
+ use crate::types::{ExtractionResult, Metadata};
30
+ use async_trait::async_trait;
31
+
32
/// Native Rust RTF extractor.
///
/// Extracts text content, metadata, and structure from RTF documents.
///
/// This is a stateless unit struct: it carries no data, so copies are free and
/// every instance is interchangeable with every other. `Default` is derived and
/// yields the same value as [`RtfExtractor::new`].
#[derive(Debug, Clone, Copy, Default)]
pub struct RtfExtractor;

impl RtfExtractor {
    /// Create a new RTF extractor.
    ///
    /// Declared `const` so the extractor can also be built in constant and
    /// static initializers.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
49
+
50
+ impl Plugin for RtfExtractor {
51
+ fn name(&self) -> &str {
52
+ "rtf-extractor"
53
+ }
54
+
55
+ fn version(&self) -> String {
56
+ env!("CARGO_PKG_VERSION").to_string()
57
+ }
58
+
59
+ fn initialize(&self) -> Result<()> {
60
+ Ok(())
61
+ }
62
+
63
+ fn shutdown(&self) -> Result<()> {
64
+ Ok(())
65
+ }
66
+
67
+ fn description(&self) -> &str {
68
+ "Extracts content from RTF (Rich Text Format) files with native Rust parsing"
69
+ }
70
+
71
+ fn author(&self) -> &str {
72
+ "Kreuzberg Team"
73
+ }
74
+ }
75
+
76
+ #[async_trait]
77
+ impl DocumentExtractor for RtfExtractor {
78
+ #[cfg_attr(feature = "otel", tracing::instrument(
79
+ skip(self, content, _config),
80
+ fields(
81
+ extractor.name = self.name(),
82
+ content.size_bytes = content.len(),
83
+ )
84
+ ))]
85
+ async fn extract_bytes(
86
+ &self,
87
+ content: &[u8],
88
+ mime_type: &str,
89
+ _config: &ExtractionConfig,
90
+ ) -> Result<ExtractionResult> {
91
+ let rtf_content = String::from_utf8_lossy(content);
92
+
93
+ let (extracted_text, tables) = extract_text_from_rtf(&rtf_content);
94
+ let metadata_map = extract_rtf_metadata(&rtf_content, &extracted_text);
95
+
96
+ Ok(ExtractionResult {
97
+ content: extracted_text,
98
+ mime_type: mime_type.to_string(),
99
+ metadata: Metadata {
100
+ additional: metadata_map,
101
+ ..Default::default()
102
+ },
103
+ pages: None,
104
+ tables,
105
+ detected_languages: None,
106
+ chunks: None,
107
+ images: None,
108
+ djot_content: None,
109
+ elements: None,
110
+ })
111
+ }
112
+
113
+ fn supported_mime_types(&self) -> &[&str] {
114
+ &["application/rtf", "text/rtf"]
115
+ }
116
+
117
+ fn priority(&self) -> i32 {
118
+ 50
119
+ }
120
+ }
121
+
122
#[cfg(test)]
mod tests {
    use super::*;

    /// The plugin-facing accessors report the expected constant values.
    #[tokio::test]
    async fn test_rtf_extractor_plugin_interface() {
        let plugin = RtfExtractor::new();
        let mime_types = plugin.supported_mime_types();

        assert_eq!(plugin.name(), "rtf-extractor");
        assert_eq!(plugin.version(), env!("CARGO_PKG_VERSION"));
        assert!(mime_types.contains(&"application/rtf"));
        assert_eq!(plugin.priority(), 50);
    }

    /// A minimal RTF document yields at least one of its plain-text words.
    #[test]
    fn test_simple_rtf_extraction() {
        let _extractor = RtfExtractor;
        let input = r#"{\rtf1 Hello World}"#;

        let (text, _tables) = extract_text_from_rtf(input);

        let has_hello = text.contains("Hello");
        let has_world = text.contains("World");
        assert!(has_hello || has_world);
    }
}