kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -12,6 +12,10 @@
12
12
  //!
13
13
  //! Requires the `office` feature (which includes `pulldown-cmark`).
14
14
 
15
+ #[cfg(feature = "office")]
16
+ use super::frontmatter_utils::{
17
+ cells_to_markdown, extract_frontmatter, extract_metadata_from_yaml, extract_title_from_content,
18
+ };
15
19
  #[cfg(feature = "office")]
16
20
  use crate::Result;
17
21
  #[cfg(feature = "office")]
@@ -24,8 +28,6 @@ use crate::types::{ExtractionResult, Metadata, Table};
24
28
  use async_trait::async_trait;
25
29
  #[cfg(feature = "office")]
26
30
  use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
27
- #[cfg(feature = "office")]
28
- use serde_yaml_ng::Value as YamlValue;
29
31
 
30
32
  /// Enhanced Markdown extractor with metadata and table support.
31
33
  ///
@@ -44,102 +46,7 @@ impl MarkdownExtractor {
44
46
  Self
45
47
  }
46
48
 
47
- /// Extract YAML frontmatter from markdown content.
48
- ///
49
- /// Frontmatter is expected to be delimited by `---` at the start of the document.
50
- /// Returns the remaining content after frontmatter.
51
- fn extract_frontmatter(content: &str) -> (Option<YamlValue>, String) {
52
- if !content.starts_with("---") {
53
- return (None, content.to_string());
54
- }
55
-
56
- let rest = &content[3..];
57
- if let Some(end_pos) = rest.find("\n---") {
58
- let frontmatter_str = &rest[..end_pos];
59
- let remaining = &rest[end_pos + 4..];
60
-
61
- match serde_yaml_ng::from_str::<YamlValue>(frontmatter_str) {
62
- Ok(value) => (Some(value), remaining.to_string()),
63
- Err(_) => (None, content.to_string()),
64
- }
65
- } else {
66
- (None, content.to_string())
67
- }
68
- }
69
-
70
- /// Extract metadata from YAML frontmatter.
71
- ///
72
- /// Extracts the following YAML fields:
73
- /// - Standard fields: title, author, date, description (as subject)
74
- /// - Extended fields: abstract, subject, category, tags, language, version
75
- /// - Array fields (keywords, tags): converted to comma-separated strings
76
- fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
77
- let mut metadata = Metadata::default();
78
-
79
- if let Some(title) = yaml.get("title").and_then(|v| v.as_str()) {
80
- metadata.additional.insert("title".to_string(), title.into());
81
- }
82
-
83
- if let Some(author) = yaml.get("author").and_then(|v| v.as_str()) {
84
- metadata.additional.insert("author".to_string(), author.into());
85
- }
86
-
87
- if let Some(date) = yaml.get("date").and_then(|v| v.as_str()) {
88
- metadata.created_at = Some(date.to_string());
89
- }
90
-
91
- if let Some(keywords) = yaml.get("keywords") {
92
- match keywords {
93
- YamlValue::String(s) => {
94
- metadata.additional.insert("keywords".to_string(), s.clone().into());
95
- }
96
- YamlValue::Sequence(seq) => {
97
- let keywords_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
98
- metadata.additional.insert("keywords".to_string(), keywords_str.into());
99
- }
100
- _ => {}
101
- }
102
- }
103
-
104
- if let Some(description) = yaml.get("description").and_then(|v| v.as_str()) {
105
- metadata.subject = Some(description.to_string());
106
- }
107
-
108
- if let Some(abstract_text) = yaml.get("abstract").and_then(|v| v.as_str()) {
109
- metadata.additional.insert("abstract".to_string(), abstract_text.into());
110
- }
111
-
112
- if let Some(subject) = yaml.get("subject").and_then(|v| v.as_str()) {
113
- metadata.subject = Some(subject.to_string());
114
- }
115
-
116
- if let Some(category) = yaml.get("category").and_then(|v| v.as_str()) {
117
- metadata.additional.insert("category".to_string(), category.into());
118
- }
119
-
120
- if let Some(tags) = yaml.get("tags") {
121
- match tags {
122
- YamlValue::String(s) => {
123
- metadata.additional.insert("tags".to_string(), s.clone().into());
124
- }
125
- YamlValue::Sequence(seq) => {
126
- let tags_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
127
- metadata.additional.insert("tags".to_string(), tags_str.into());
128
- }
129
- _ => {}
130
- }
131
- }
132
-
133
- if let Some(language) = yaml.get("language").and_then(|v| v.as_str()) {
134
- metadata.additional.insert("language".to_string(), language.into());
135
- }
136
-
137
- if let Some(version) = yaml.get("version").and_then(|v| v.as_str()) {
138
- metadata.additional.insert("version".to_string(), version.into());
139
- }
140
-
141
- metadata
142
- }
49
+ // Frontmatter utilities moved to shared frontmatter_utils module
143
50
 
144
51
  /// Extract plain text from markdown AST.
145
52
  fn extract_text_from_events(events: &[Event]) -> String {
@@ -222,7 +129,7 @@ impl MarkdownExtractor {
222
129
  if let Some((cells, idx)) = current_table.take()
223
130
  && !cells.is_empty()
224
131
  {
225
- let markdown = Self::cells_to_markdown(&cells);
132
+ let markdown = cells_to_markdown(&cells);
226
133
  tables.push(Table {
227
134
  cells,
228
135
  markdown,
@@ -238,50 +145,7 @@ impl MarkdownExtractor {
238
145
  tables
239
146
  }
240
147
 
241
- /// Convert table cells to markdown format.
242
- fn cells_to_markdown(cells: &[Vec<String>]) -> String {
243
- if cells.is_empty() {
244
- return String::new();
245
- }
246
-
247
- let mut md = String::new();
248
-
249
- md.push('|');
250
- for cell in &cells[0] {
251
- md.push(' ');
252
- md.push_str(cell);
253
- md.push_str(" |");
254
- }
255
- md.push('\n');
256
-
257
- md.push('|');
258
- for _ in &cells[0] {
259
- md.push_str(" --- |");
260
- }
261
- md.push('\n');
262
-
263
- for row in &cells[1..] {
264
- md.push('|');
265
- for cell in row {
266
- md.push(' ');
267
- md.push_str(cell);
268
- md.push_str(" |");
269
- }
270
- md.push('\n');
271
- }
272
-
273
- md
274
- }
275
-
276
- /// Extract first heading as title if not in frontmatter.
277
- fn extract_title_from_content(content: &str) -> Option<String> {
278
- for line in content.lines() {
279
- if let Some(heading) = line.strip_prefix("# ") {
280
- return Some(heading.trim().to_string());
281
- }
282
- }
283
- None
284
- }
148
+ // cells_to_markdown and extract_title_from_content moved to shared frontmatter_utils module
285
149
  }
286
150
 
287
151
  #[cfg(feature = "office")]
@@ -336,16 +200,16 @@ impl DocumentExtractor for MarkdownExtractor {
336
200
  ) -> Result<ExtractionResult> {
337
201
  let text = String::from_utf8_lossy(content).into_owned();
338
202
 
339
- let (yaml, remaining_content) = Self::extract_frontmatter(&text);
203
+ let (yaml, remaining_content) = extract_frontmatter(&text);
340
204
 
341
205
  let mut metadata = if let Some(ref yaml_value) = yaml {
342
- Self::extract_metadata_from_yaml(yaml_value)
206
+ extract_metadata_from_yaml(yaml_value)
343
207
  } else {
344
208
  Metadata::default()
345
209
  };
346
210
 
347
211
  if !metadata.additional.contains_key("title")
348
- && let Some(title) = Self::extract_title_from_content(&remaining_content)
212
+ && let Some(title) = extract_title_from_content(&remaining_content)
349
213
  {
350
214
  metadata.additional.insert("title".to_string(), title.into());
351
215
  }
@@ -365,7 +229,9 @@ impl DocumentExtractor for MarkdownExtractor {
365
229
  detected_languages: None,
366
230
  chunks: None,
367
231
  images: None,
232
+ djot_content: None,
368
233
  pages: None,
234
+ elements: None,
369
235
  })
370
236
  }
371
237
 
@@ -380,7 +246,9 @@ impl DocumentExtractor for MarkdownExtractor {
380
246
 
381
247
  #[cfg(all(test, feature = "office"))]
382
248
  mod tests {
249
+ use super::super::frontmatter_utils::{cells_to_markdown, extract_frontmatter, extract_metadata_from_yaml};
383
250
  use super::*;
251
+ use serde_yaml_ng::Value as YamlValue;
384
252
 
385
253
  #[test]
386
254
  fn test_can_extract_markdown_mime_types() {
@@ -399,7 +267,7 @@ mod tests {
399
267
  b"# Header\n\nThis is a paragraph with **bold** and *italic* text.\n\n## Subheading\n\nMore content here.";
400
268
  let text = String::from_utf8_lossy(content).into_owned();
401
269
 
402
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
270
+ let (yaml, remaining) = extract_frontmatter(&text);
403
271
  assert!(yaml.is_none());
404
272
  assert!(!remaining.is_empty());
405
273
 
@@ -419,19 +287,25 @@ mod tests {
419
287
 
420
288
  let text = String::from_utf8_lossy(content).into_owned();
421
289
 
422
- let (yaml_opt, remaining) = MarkdownExtractor::extract_frontmatter(&text);
290
+ let (yaml_opt, remaining) = extract_frontmatter(&text);
423
291
  assert!(yaml_opt.is_some());
424
292
  assert!(remaining.contains("# Content"));
425
293
 
426
294
  let yaml = yaml_opt.expect("Should extract YAML frontmatter");
427
- let metadata = MarkdownExtractor::extract_metadata_from_yaml(&yaml);
295
+ let metadata = extract_metadata_from_yaml(&yaml);
428
296
 
429
297
  assert_eq!(
430
- metadata.additional.get("title").and_then(|v| v.as_str()),
298
+ metadata
299
+ .additional
300
+ .get("title")
301
+ .and_then(|v: &serde_json::Value| v.as_str()),
431
302
  Some("My Document")
432
303
  );
433
304
  assert_eq!(
434
- metadata.additional.get("author").and_then(|v| v.as_str()),
305
+ metadata
306
+ .additional
307
+ .get("author")
308
+ .and_then(|v: &serde_json::Value| v.as_str()),
435
309
  Some("John Doe")
436
310
  );
437
311
  assert_eq!(metadata.created_at, Some("2024-01-15".to_string()));
@@ -450,13 +324,16 @@ mod tests {
450
324
  let content = b"---\ntitle: Document\nkeywords:\n - rust\n - markdown\n - parsing\n---\n\nContent";
451
325
 
452
326
  let text = String::from_utf8_lossy(content).into_owned();
453
- let (yaml_opt, _remaining) = MarkdownExtractor::extract_frontmatter(&text);
327
+ let (yaml_opt, _remaining) = extract_frontmatter(&text);
454
328
 
455
329
  assert!(yaml_opt.is_some());
456
330
  let yaml = yaml_opt.expect("Should extract YAML frontmatter");
457
- let metadata = MarkdownExtractor::extract_metadata_from_yaml(&yaml);
331
+ let metadata = extract_metadata_from_yaml(&yaml);
458
332
 
459
- let keywords = metadata.additional.get("keywords").and_then(|v| v.as_str());
333
+ let keywords = metadata
334
+ .additional
335
+ .get("keywords")
336
+ .and_then(|v: &serde_json::Value| v.as_str());
460
337
  assert!(keywords.is_some());
461
338
  let keywords_str = keywords.expect("Should extract keywords from metadata");
462
339
  assert!(keywords_str.contains("rust"));
@@ -485,11 +362,11 @@ mod tests {
485
362
  let content = b"# Main Title\n\nSome content\n\nMore text";
486
363
  let text = String::from_utf8_lossy(content).into_owned();
487
364
 
488
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
365
+ let (yaml, remaining) = extract_frontmatter(&text);
489
366
  assert!(yaml.is_none());
490
367
  assert_eq!(remaining, text);
491
368
 
492
- let title = MarkdownExtractor::extract_title_from_content(&remaining);
369
+ let title = extract_title_from_content(&remaining);
493
370
  assert_eq!(title, Some("Main Title".to_string()));
494
371
  }
495
372
 
@@ -498,7 +375,7 @@ mod tests {
498
375
  let content = b"";
499
376
  let text = String::from_utf8_lossy(content).into_owned();
500
377
 
501
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
378
+ let (yaml, remaining) = extract_frontmatter(&text);
502
379
  assert!(yaml.is_none());
503
380
  assert!(remaining.is_empty());
504
381
 
@@ -513,7 +390,7 @@ mod tests {
513
390
  let content = b" \n\n \n";
514
391
  let text = String::from_utf8_lossy(content).into_owned();
515
392
 
516
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
393
+ let (yaml, remaining) = extract_frontmatter(&text);
517
394
  assert!(yaml.is_none());
518
395
 
519
396
  let parser = Parser::new_ext(&remaining, Options::ENABLE_TABLES);
@@ -528,7 +405,7 @@ mod tests {
528
405
 
529
406
  let text = String::from_utf8_lossy(content).into_owned();
530
407
 
531
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
408
+ let (yaml, remaining) = extract_frontmatter(&text);
532
409
  assert!(yaml.is_none());
533
410
 
534
411
  let parser = Parser::new_ext(&remaining, Options::ENABLE_TABLES);
@@ -580,7 +457,7 @@ mod tests {
580
457
  vec!["Data 3".to_string(), "Data 4".to_string()],
581
458
  ];
582
459
 
583
- let markdown = MarkdownExtractor::cells_to_markdown(&cells);
460
+ let markdown = cells_to_markdown(&cells);
584
461
  assert!(markdown.contains("Header 1"));
585
462
  assert!(markdown.contains("Data 1"));
586
463
  assert!(markdown.contains("---"));
@@ -619,7 +496,7 @@ mod tests {
619
496
  let content = b"---\nthis: is: invalid: yaml:\n---\n\nContent here";
620
497
  let text = String::from_utf8_lossy(content).into_owned();
621
498
 
622
- let (yaml, _remaining) = MarkdownExtractor::extract_frontmatter(&text);
499
+ let (yaml, _remaining) = extract_frontmatter(&text);
623
500
  let _ = yaml;
624
501
  }
625
502
 
@@ -650,7 +527,7 @@ nested:
650
527
  "#;
651
528
 
652
529
  let yaml: YamlValue = serde_yaml_ng::from_str(yaml_str).expect("Valid YAML");
653
- let metadata = MarkdownExtractor::extract_metadata_from_yaml(&yaml);
530
+ let metadata = extract_metadata_from_yaml(&yaml);
654
531
 
655
532
  assert_eq!(metadata.created_at, Some("2024-01-15".to_string()));
656
533
  assert_eq!(
@@ -64,6 +64,9 @@ pub trait SyncExtractor {
64
64
  pub mod structured;
65
65
  pub mod text;
66
66
 
67
+ pub mod djot_format;
68
+ pub mod frontmatter_utils;
69
+
67
70
  #[cfg(feature = "archives")]
68
71
  pub mod security;
69
72
 
@@ -166,6 +169,8 @@ pub use epub::EpubExtractor;
166
169
  #[cfg(feature = "office")]
167
170
  pub use fictionbook::FictionBookExtractor;
168
171
 
172
+ pub use djot_format::DjotExtractor;
173
+
169
174
  #[cfg(feature = "office")]
170
175
  pub use markdown::MarkdownExtractor as EnhancedMarkdownExtractor;
171
176
 
@@ -281,6 +286,8 @@ pub fn register_default_extractors() -> Result<()> {
281
286
  #[cfg(feature = "excel")]
282
287
  registry.register(Arc::new(ExcelExtractor::new()))?;
283
288
 
289
+ registry.register(Arc::new(DjotExtractor::new()))?;
290
+
284
291
  #[cfg(feature = "office")]
285
292
  {
286
293
  registry.register(Arc::new(EnhancedMarkdownExtractor::new()))?;
@@ -341,10 +348,11 @@ mod tests {
341
348
  let extractor_names = reg.list();
342
349
 
343
350
  #[allow(unused_mut)]
344
- let mut expected_count = 3;
351
+ let mut expected_count = 4; // plain-text, markdown, structured, djot
345
352
  assert!(extractor_names.contains(&"plain-text-extractor".to_string()));
346
353
  assert!(extractor_names.contains(&"markdown-extractor".to_string()));
347
354
  assert!(extractor_names.contains(&"structured-extractor".to_string()));
355
+ assert!(extractor_names.contains(&"djot-extractor".to_string()));
348
356
 
349
357
  #[cfg(feature = "ocr")]
350
358
  {
@@ -567,6 +567,8 @@ impl DocumentExtractor for OdtExtractor {
567
567
  detected_languages: None,
568
568
  chunks: None,
569
569
  images: None,
570
+ djot_content: None,
571
+ elements: None,
570
572
  })
571
573
  }
572
574
 
@@ -0,0 +1,165 @@
1
+ //! Core OPML extractor implementation.
2
+ //!
3
+ //! This module provides the main `OpmlExtractor` struct and implements the
4
+ //! `Plugin` and `DocumentExtractor` traits for OPML document processing.
5
+
6
+ use crate::Result;
7
+ use crate::core::config::ExtractionConfig;
8
+ use crate::plugins::{DocumentExtractor, Plugin};
9
+ use crate::types::{ExtractionResult, Metadata};
10
+ use async_trait::async_trait;
11
+
12
+ #[cfg(feature = "office")]
13
+ use super::parser;
14
+
15
+ /// OPML format extractor.
16
+ ///
17
+ /// Extracts outline structure and metadata from OPML documents using native Rust parsing.
18
+ pub struct OpmlExtractor;
19
+
20
+ impl OpmlExtractor {
21
+ /// Create a new OPML extractor.
22
+ pub fn new() -> Self {
23
+ Self
24
+ }
25
+ }
26
+
27
+ impl Default for OpmlExtractor {
28
+ fn default() -> Self {
29
+ Self::new()
30
+ }
31
+ }
32
+
33
+ impl Plugin for OpmlExtractor {
34
+ fn name(&self) -> &str {
35
+ "opml-extractor"
36
+ }
37
+
38
+ fn version(&self) -> String {
39
+ env!("CARGO_PKG_VERSION").to_string()
40
+ }
41
+
42
+ fn initialize(&self) -> Result<()> {
43
+ Ok(())
44
+ }
45
+
46
+ fn shutdown(&self) -> Result<()> {
47
+ Ok(())
48
+ }
49
+
50
+ fn description(&self) -> &str {
51
+ "Extracts content and metadata from OPML (Outline Processor Markup Language) documents"
52
+ }
53
+
54
+ fn author(&self) -> &str {
55
+ "Kreuzberg Team"
56
+ }
57
+ }
58
+
59
+ #[cfg(feature = "office")]
60
+ #[async_trait]
61
+ impl DocumentExtractor for OpmlExtractor {
62
+ #[cfg_attr(
63
+ feature = "otel",
64
+ tracing::instrument(
65
+ skip(self, content, _config),
66
+ fields(
67
+ extractor.name = self.name(),
68
+ content.size_bytes = content.len(),
69
+ )
70
+ )
71
+ )]
72
+ async fn extract_bytes(
73
+ &self,
74
+ content: &[u8],
75
+ mime_type: &str,
76
+ _config: &ExtractionConfig,
77
+ ) -> Result<ExtractionResult> {
78
+ let (extracted_content, metadata_map) = parser::extract_content_and_metadata(content)?;
79
+
80
+ Ok(ExtractionResult {
81
+ content: extracted_content,
82
+ mime_type: mime_type.to_string(),
83
+ metadata: Metadata {
84
+ additional: metadata_map,
85
+ ..Default::default()
86
+ },
87
+ pages: None,
88
+ tables: vec![],
89
+ detected_languages: None,
90
+ chunks: None,
91
+ images: None,
92
+ djot_content: None,
93
+ elements: None,
94
+ })
95
+ }
96
+
97
+ fn supported_mime_types(&self) -> &[&str] {
98
+ &["text/x-opml", "application/xml+opml"]
99
+ }
100
+
101
+ fn priority(&self) -> i32 {
102
+ 55
103
+ }
104
+ }
105
+
106
+ #[cfg(all(test, feature = "office"))]
107
+ mod tests {
108
+ use super::*;
109
+
110
+ #[test]
111
+ fn test_opml_extractor_plugin_interface() {
112
+ let extractor = OpmlExtractor::new();
113
+ assert_eq!(extractor.name(), "opml-extractor");
114
+ assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
115
+ assert_eq!(extractor.priority(), 55);
116
+ assert!(!extractor.supported_mime_types().is_empty());
117
+ }
118
+
119
+ #[test]
120
+ fn test_opml_extractor_default() {
121
+ let extractor = OpmlExtractor;
122
+ assert_eq!(extractor.name(), "opml-extractor");
123
+ }
124
+
125
+ #[tokio::test]
126
+ async fn test_opml_extractor_initialize_shutdown() {
127
+ let extractor = OpmlExtractor::new();
128
+ assert!(extractor.initialize().is_ok());
129
+ assert!(extractor.shutdown().is_ok());
130
+ }
131
+
132
+ #[test]
133
+ fn test_opml_supported_mime_types() {
134
+ let extractor = OpmlExtractor::new();
135
+ let supported = extractor.supported_mime_types();
136
+ assert!(supported.contains(&"text/x-opml"));
137
+ assert!(supported.contains(&"application/xml+opml"));
138
+ }
139
+
140
+ #[tokio::test]
141
+ async fn test_opml_extractor_async_extraction() {
142
+ let extractor = OpmlExtractor::new();
143
+ let opml = br#"<?xml version="1.0"?>
144
+ <opml version="2.0">
145
+ <head>
146
+ <title>Async Test</title>
147
+ </head>
148
+ <body>
149
+ <outline text="Item" />
150
+ </body>
151
+ </opml>"#;
152
+
153
+ let result = extractor
154
+ .extract_bytes(opml, "text/x-opml", &ExtractionConfig::default())
155
+ .await
156
+ .expect("Should extract OPML asynchronously");
157
+
158
+ assert_eq!(result.mime_type, "text/x-opml");
159
+ assert!(result.content.contains("Item"));
160
+ assert_eq!(
161
+ result.metadata.additional.get("title").and_then(|v| v.as_str()),
162
+ Some("Async Test")
163
+ );
164
+ }
165
+ }
@@ -0,0 +1,31 @@
1
+ //! Native OPML (Outline Processor Markup Language) extractor using the `roxmltree` library.
2
+ //!
3
+ //! This extractor provides native Rust-based OPML extraction, parsing outline structures
4
+ //! commonly used for RSS feed lists, podcast directories, and general outlines.
5
+ //!
6
+ //! Extracts:
7
+ //! - Metadata from `<head>`: title, dateCreated, dateModified, ownerName, ownerEmail
8
+ //! - Content from `<body><outline>` hierarchy using text attributes
9
+ //! - Outline hierarchy structure preserved in plain text format with indentation
10
+ //! - Note: URLs (xmlUrl, htmlUrl) are extracted from attributes but not included in main content
11
+ //!
12
+ //! Example OPML structure:
13
+ //! ```xml
14
+ //! <opml version="2.0">
15
+ //! <head>
16
+ //! <title>My Feeds</title>
17
+ //! <ownerName>John</ownerName>
18
+ //! </head>
19
+ //! <body>
20
+ //! <outline text="Tech" type="folder">
21
+ //! <outline text="Hacker News" type="rss" xmlUrl="https://..." />
22
+ //! </outline>
23
+ //! </body>
24
+ //! </opml>
25
+ //! ```
26
+
27
+ mod core;
28
+ mod parser;
29
+
30
+ // Re-export public API
31
+ pub use core::OpmlExtractor;