kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +28 -116
  6. data/README.md +269 -629
  7. data/Rakefile +0 -9
  8. data/Steepfile +4 -8
  9. data/examples/async_patterns.rb +58 -1
  10. data/ext/kreuzberg_rb/extconf.rb +5 -35
  11. data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
  12. data/ext/kreuzberg_rb/native/build.rs +14 -12
  13. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  14. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  15. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  16. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  17. data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
  18. data/extconf.rb +6 -38
  19. data/kreuzberg.gemspec +20 -114
  20. data/lib/kreuzberg/api_proxy.rb +18 -2
  21. data/lib/kreuzberg/cache_api.rb +0 -22
  22. data/lib/kreuzberg/cli.rb +10 -2
  23. data/lib/kreuzberg/cli_proxy.rb +10 -0
  24. data/lib/kreuzberg/config.rb +22 -274
  25. data/lib/kreuzberg/errors.rb +7 -73
  26. data/lib/kreuzberg/extraction_api.rb +8 -237
  27. data/lib/kreuzberg/mcp_proxy.rb +11 -2
  28. data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
  29. data/lib/kreuzberg/post_processor_protocol.rb +71 -0
  30. data/lib/kreuzberg/result.rb +33 -151
  31. data/lib/kreuzberg/setup_lib_path.rb +2 -22
  32. data/lib/kreuzberg/validator_protocol.rb +73 -0
  33. data/lib/kreuzberg/version.rb +1 -1
  34. data/lib/kreuzberg.rb +13 -27
  35. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  36. data/sig/kreuzberg.rbs +12 -105
  37. data/spec/binding/cache_spec.rb +22 -22
  38. data/spec/binding/cli_proxy_spec.rb +4 -2
  39. data/spec/binding/cli_spec.rb +11 -12
  40. data/spec/binding/config_spec.rb +0 -74
  41. data/spec/binding/config_validation_spec.rb +6 -100
  42. data/spec/binding/error_handling_spec.rb +97 -283
  43. data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
  44. data/spec/binding/plugins/postprocessor_spec.rb +11 -11
  45. data/spec/binding/plugins/validator_spec.rb +13 -12
  46. data/spec/examples.txt +104 -0
  47. data/spec/fixtures/config.toml +1 -0
  48. data/spec/fixtures/config.yaml +1 -0
  49. data/spec/fixtures/invalid_config.toml +1 -0
  50. data/spec/smoke/package_spec.rb +3 -2
  51. data/spec/spec_helper.rb +3 -1
  52. data/vendor/kreuzberg/Cargo.toml +67 -192
  53. data/vendor/kreuzberg/README.md +9 -97
  54. data/vendor/kreuzberg/build.rs +194 -516
  55. data/vendor/kreuzberg/src/api/handlers.rs +9 -130
  56. data/vendor/kreuzberg/src/api/mod.rs +3 -18
  57. data/vendor/kreuzberg/src/api/server.rs +71 -236
  58. data/vendor/kreuzberg/src/api/types.rs +7 -43
  59. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  60. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  61. data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
  62. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  63. data/vendor/kreuzberg/src/core/config.rs +23 -905
  64. data/vendor/kreuzberg/src/core/extractor.rs +106 -403
  65. data/vendor/kreuzberg/src/core/io.rs +2 -4
  66. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  67. data/vendor/kreuzberg/src/core/mod.rs +3 -22
  68. data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
  69. data/vendor/kreuzberg/src/embeddings.rs +21 -169
  70. data/vendor/kreuzberg/src/error.rs +2 -2
  71. data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
  72. data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
  73. data/vendor/kreuzberg/src/extraction/email.rs +11 -12
  74. data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
  75. data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
  76. data/vendor/kreuzberg/src/extraction/image.rs +14 -138
  77. data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
  78. data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
  79. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  80. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  81. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  82. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  83. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  84. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  85. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  86. data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
  87. data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
  88. data/vendor/kreuzberg/src/extraction/table.rs +1 -2
  89. data/vendor/kreuzberg/src/extraction/text.rs +10 -18
  90. data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
  91. data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
  92. data/vendor/kreuzberg/src/extractors/email.rs +9 -37
  93. data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
  94. data/vendor/kreuzberg/src/extractors/html.rs +173 -182
  95. data/vendor/kreuzberg/src/extractors/image.rs +8 -32
  96. data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
  97. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  98. data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
  99. data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
  100. data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
  101. data/vendor/kreuzberg/src/extractors/text.rs +7 -30
  102. data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
  103. data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
  104. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  105. data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
  106. data/vendor/kreuzberg/src/lib.rs +5 -17
  107. data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
  108. data/vendor/kreuzberg/src/mcp/server.rs +21 -145
  109. data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
  110. data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
  111. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
  112. data/vendor/kreuzberg/src/pdf/error.rs +1 -93
  113. data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
  114. data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
  115. data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
  116. data/vendor/kreuzberg/src/pdf/table.rs +64 -61
  117. data/vendor/kreuzberg/src/pdf/text.rs +24 -416
  118. data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
  119. data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
  120. data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
  121. data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
  122. data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
  123. data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
  124. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  125. data/vendor/kreuzberg/src/text/mod.rs +0 -8
  126. data/vendor/kreuzberg/src/text/quality.rs +15 -28
  127. data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
  128. data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
  129. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
  130. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
  131. data/vendor/kreuzberg/src/types.rs +67 -907
  132. data/vendor/kreuzberg/src/utils/mod.rs +0 -14
  133. data/vendor/kreuzberg/src/utils/quality.rs +3 -12
  134. data/vendor/kreuzberg/tests/api_tests.rs +0 -506
  135. data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
  136. data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
  137. data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
  138. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  139. data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
  140. data/vendor/kreuzberg/tests/config_features.rs +1 -33
  141. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
  142. data/vendor/kreuzberg/tests/core_integration.rs +9 -35
  143. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  144. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  145. data/vendor/kreuzberg/tests/email_integration.rs +1 -3
  146. data/vendor/kreuzberg/tests/error_handling.rs +34 -43
  147. data/vendor/kreuzberg/tests/format_integration.rs +1 -7
  148. data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
  149. data/vendor/kreuzberg/tests/image_integration.rs +0 -2
  150. data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
  151. data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
  152. data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
  153. data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
  154. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  155. data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
  156. data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
  157. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
  158. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
  159. data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
  160. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
  161. data/vendor/kreuzberg/tests/security_validation.rs +1 -13
  162. data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
  163. metadata +25 -171
  164. data/.rubocop.yml +0 -543
  165. data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
  166. data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
  167. data/lib/kreuzberg/error_context.rb +0 -136
  168. data/lib/kreuzberg/types.rb +0 -170
  169. data/lib/libpdfium.so +0 -0
  170. data/spec/binding/async_operations_spec.rb +0 -473
  171. data/spec/binding/batch_operations_spec.rb +0 -595
  172. data/spec/binding/batch_spec.rb +0 -359
  173. data/spec/binding/config_result_spec.rb +0 -377
  174. data/spec/binding/embeddings_spec.rb +0 -816
  175. data/spec/binding/error_recovery_spec.rb +0 -488
  176. data/spec/binding/font_config_spec.rb +0 -220
  177. data/spec/binding/images_spec.rb +0 -738
  178. data/spec/binding/keywords_extraction_spec.rb +0 -600
  179. data/spec/binding/metadata_types_spec.rb +0 -1228
  180. data/spec/binding/pages_extraction_spec.rb +0 -471
  181. data/spec/binding/tables_spec.rb +0 -641
  182. data/spec/unit/config/chunking_config_spec.rb +0 -213
  183. data/spec/unit/config/embedding_config_spec.rb +0 -343
  184. data/spec/unit/config/extraction_config_spec.rb +0 -438
  185. data/spec/unit/config/font_config_spec.rb +0 -285
  186. data/spec/unit/config/hierarchy_config_spec.rb +0 -314
  187. data/spec/unit/config/image_extraction_config_spec.rb +0 -209
  188. data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
  189. data/spec/unit/config/keyword_config_spec.rb +0 -229
  190. data/spec/unit/config/language_detection_config_spec.rb +0 -258
  191. data/spec/unit/config/ocr_config_spec.rb +0 -171
  192. data/spec/unit/config/page_config_spec.rb +0 -221
  193. data/spec/unit/config/pdf_config_spec.rb +0 -267
  194. data/spec/unit/config/postprocessor_config_spec.rb +0 -290
  195. data/spec/unit/config/tesseract_config_spec.rb +0 -181
  196. data/spec/unit/config/token_reduction_config_spec.rb +0 -251
  197. data/test/metadata_types_test.rb +0 -959
  198. data/vendor/Cargo.toml +0 -61
  199. data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
  200. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
  201. data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
  202. data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
  203. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  204. data/vendor/kreuzberg/src/core/formats.rs +0 -235
  205. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  206. data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
  207. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
  208. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
  209. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
  210. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
  211. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  212. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
  213. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
  214. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
  215. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  216. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
  217. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  218. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  219. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
  220. data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
  221. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  222. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  223. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  224. data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
  225. data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
  226. data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
  227. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  228. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
  229. data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
  230. data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
  231. data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
  232. data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
  233. data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
  234. data/vendor/kreuzberg/src/utils/pool.rs +0 -503
  235. data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
  236. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
  237. data/vendor/kreuzberg/tests/api_embed.rs +0 -360
  238. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  239. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
  240. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
  241. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
  242. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  243. data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
  244. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
  245. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
  246. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  247. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  248. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  249. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  250. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  251. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  252. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  253. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  254. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  255. data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
  256. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
  257. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  258. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  259. data/vendor/kreuzberg/tests/page_markers.rs +0 -297
  260. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
  261. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
  262. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
  263. data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
  264. data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
  265. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
  266. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
  267. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
  268. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
  269. data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
  270. data/vendor/kreuzberg-ffi/README.md +0 -851
  271. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
  272. data/vendor/kreuzberg-ffi/build.rs +0 -168
  273. data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
  274. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
  275. data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
  276. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
  277. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
  278. data/vendor/kreuzberg-ffi/src/error.rs +0 -901
  279. data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
  280. data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
  281. data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
  282. data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
  283. data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
  284. data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
  285. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
  286. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
  287. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
  288. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
  289. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
  290. data/vendor/kreuzberg-ffi/src/result.rs +0 -510
  291. data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
  292. data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
  293. data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
  294. data/vendor/kreuzberg-ffi/src/types.rs +0 -363
  295. data/vendor/kreuzberg-ffi/src/util.rs +0 -210
  296. data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
  297. data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
  298. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
  299. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
  300. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
  301. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
  302. data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
  303. data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
  304. data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
  305. data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
  306. data/vendor/kreuzberg-tesseract/LICENSE +0 -22
  307. data/vendor/kreuzberg-tesseract/README.md +0 -399
  308. data/vendor/kreuzberg-tesseract/build.rs +0 -1127
  309. data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
  310. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
  311. data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
  312. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
  313. data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
  314. data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
  315. data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
  316. data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
  317. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
  318. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
  319. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
  320. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
  321. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
@@ -1,701 +0,0 @@
1
- //! Enhanced Markdown extractor with YAML frontmatter support.
2
- //!
3
- //! This extractor provides:
4
- //! - Comprehensive markdown parsing using pulldown-cmark
5
- //! - Complete YAML frontmatter metadata extraction:
6
- //! - Standard fields: title, author, date, description, keywords
7
- //! - Extended fields: abstract, subject, category, tags, language, version
8
- //! - Automatic conversion of array fields (keywords, tags) to comma-separated strings
9
- //! - Table extraction as structured data
10
- //! - Heading structure preservation
11
- //! - Code block and link extraction
12
- //!
13
- //! Requires the `office` feature (which includes `pulldown-cmark`).
14
-
15
- #[cfg(feature = "office")]
16
- use crate::Result;
17
- #[cfg(feature = "office")]
18
- use crate::core::config::ExtractionConfig;
19
- #[cfg(feature = "office")]
20
- use crate::plugins::{DocumentExtractor, Plugin};
21
- #[cfg(feature = "office")]
22
- use crate::types::{ExtractionResult, Metadata, Table};
23
- #[cfg(feature = "office")]
24
- use async_trait::async_trait;
25
- #[cfg(feature = "office")]
26
- use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
27
- #[cfg(feature = "office")]
28
- use serde_yaml_ng::Value as YamlValue;
29
-
30
- /// Enhanced Markdown extractor with metadata and table support.
31
- ///
32
- /// Parses markdown documents with YAML frontmatter, extracting:
33
- /// - Metadata from YAML frontmatter
34
- /// - Plain text content
35
- /// - Tables as structured data
36
- /// - Document structure (headings, links, code blocks)
37
- #[cfg(feature = "office")]
38
- pub struct MarkdownExtractor;
39
-
40
- #[cfg(feature = "office")]
41
- impl MarkdownExtractor {
42
- /// Create a new Markdown extractor.
43
- pub fn new() -> Self {
44
- Self
45
- }
46
-
47
- /// Extract YAML frontmatter from markdown content.
48
- ///
49
- /// Frontmatter is expected to be delimited by `---` at the start of the document.
50
- /// Returns the remaining content after frontmatter.
51
- fn extract_frontmatter(content: &str) -> (Option<YamlValue>, String) {
52
- if !content.starts_with("---") {
53
- return (None, content.to_string());
54
- }
55
-
56
- let rest = &content[3..];
57
- if let Some(end_pos) = rest.find("\n---") {
58
- let frontmatter_str = &rest[..end_pos];
59
- let remaining = &rest[end_pos + 4..];
60
-
61
- match serde_yaml_ng::from_str::<YamlValue>(frontmatter_str) {
62
- Ok(value) => (Some(value), remaining.to_string()),
63
- Err(_) => (None, content.to_string()),
64
- }
65
- } else {
66
- (None, content.to_string())
67
- }
68
- }
69
-
70
- /// Extract metadata from YAML frontmatter.
71
- ///
72
- /// Extracts the following YAML fields:
73
- /// - Standard fields: title, author, date, description (as subject)
74
- /// - Extended fields: abstract, subject, category, tags, language, version
75
- /// - Array fields (keywords, tags): converted to comma-separated strings
76
- fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
77
- let mut metadata = Metadata::default();
78
-
79
- if let Some(title) = yaml.get("title").and_then(|v| v.as_str()) {
80
- metadata.additional.insert("title".to_string(), title.into());
81
- }
82
-
83
- if let Some(author) = yaml.get("author").and_then(|v| v.as_str()) {
84
- metadata.additional.insert("author".to_string(), author.into());
85
- }
86
-
87
- if let Some(date) = yaml.get("date").and_then(|v| v.as_str()) {
88
- metadata.created_at = Some(date.to_string());
89
- }
90
-
91
- if let Some(keywords) = yaml.get("keywords") {
92
- match keywords {
93
- YamlValue::String(s) => {
94
- metadata.additional.insert("keywords".to_string(), s.clone().into());
95
- }
96
- YamlValue::Sequence(seq) => {
97
- let keywords_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
98
- metadata.additional.insert("keywords".to_string(), keywords_str.into());
99
- }
100
- _ => {}
101
- }
102
- }
103
-
104
- if let Some(description) = yaml.get("description").and_then(|v| v.as_str()) {
105
- metadata.subject = Some(description.to_string());
106
- }
107
-
108
- if let Some(abstract_text) = yaml.get("abstract").and_then(|v| v.as_str()) {
109
- metadata.additional.insert("abstract".to_string(), abstract_text.into());
110
- }
111
-
112
- if let Some(subject) = yaml.get("subject").and_then(|v| v.as_str()) {
113
- metadata.subject = Some(subject.to_string());
114
- }
115
-
116
- if let Some(category) = yaml.get("category").and_then(|v| v.as_str()) {
117
- metadata.additional.insert("category".to_string(), category.into());
118
- }
119
-
120
- if let Some(tags) = yaml.get("tags") {
121
- match tags {
122
- YamlValue::String(s) => {
123
- metadata.additional.insert("tags".to_string(), s.clone().into());
124
- }
125
- YamlValue::Sequence(seq) => {
126
- let tags_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
127
- metadata.additional.insert("tags".to_string(), tags_str.into());
128
- }
129
- _ => {}
130
- }
131
- }
132
-
133
- if let Some(language) = yaml.get("language").and_then(|v| v.as_str()) {
134
- metadata.additional.insert("language".to_string(), language.into());
135
- }
136
-
137
- if let Some(version) = yaml.get("version").and_then(|v| v.as_str()) {
138
- metadata.additional.insert("version".to_string(), version.into());
139
- }
140
-
141
- metadata
142
- }
143
-
144
- /// Extract plain text from markdown AST.
145
- fn extract_text_from_events(events: &[Event]) -> String {
146
- let mut text = String::new();
147
- for event in events {
148
- match event {
149
- Event::Text(s) | Event::Code(s) | Event::Html(s) => {
150
- text.push_str(s);
151
- }
152
- Event::SoftBreak | Event::HardBreak => {
153
- text.push('\n');
154
- }
155
- Event::Start(_) | Event::End(_) | Event::TaskListMarker(_) => {}
156
- Event::FootnoteReference(s) => {
157
- text.push('[');
158
- text.push_str(s);
159
- text.push(']');
160
- }
161
- Event::Rule => {
162
- text.push_str("\n---\n");
163
- }
164
- _ => {}
165
- }
166
- }
167
- text
168
- }
169
-
170
- /// Extract tables from markdown AST.
171
- fn extract_tables_from_events(events: &[Event]) -> Vec<Table> {
172
- let mut tables = Vec::new();
173
- let mut current_table: Option<(Vec<Vec<String>>, usize)> = None;
174
- let mut current_row: Vec<String> = Vec::new();
175
- let mut current_cell = String::new();
176
- let mut in_table_cell = false;
177
- let mut table_index = 0;
178
-
179
- for event in events {
180
- match event {
181
- Event::Start(Tag::Table(_)) => {
182
- current_table = Some((Vec::new(), table_index));
183
- }
184
- Event::Start(Tag::TableHead) => {}
185
- Event::Start(Tag::TableRow) => {
186
- current_row = Vec::new();
187
- }
188
- Event::Start(Tag::TableCell) => {
189
- current_cell = String::new();
190
- in_table_cell = true;
191
- }
192
- Event::Text(s) if in_table_cell => {
193
- current_cell.push_str(s);
194
- }
195
- Event::Code(s) if in_table_cell => {
196
- current_cell.push_str(s);
197
- }
198
- Event::End(TagEnd::TableCell) => {
199
- if in_table_cell {
200
- current_row.push(current_cell.trim().to_string());
201
- current_cell = String::new();
202
- in_table_cell = false;
203
- }
204
- }
205
- Event::End(TagEnd::TableHead) => {
206
- if !current_row.is_empty()
207
- && let Some((ref mut rows, _)) = current_table
208
- {
209
- rows.push(current_row.clone());
210
- }
211
- current_row = Vec::new();
212
- }
213
- Event::End(TagEnd::TableRow) => {
214
- if !current_row.is_empty()
215
- && let Some((ref mut rows, _)) = current_table
216
- {
217
- rows.push(current_row.clone());
218
- }
219
- current_row = Vec::new();
220
- }
221
- Event::End(TagEnd::Table) => {
222
- if let Some((cells, idx)) = current_table.take()
223
- && !cells.is_empty()
224
- {
225
- let markdown = Self::cells_to_markdown(&cells);
226
- tables.push(Table {
227
- cells,
228
- markdown,
229
- page_number: idx + 1,
230
- });
231
- table_index += 1;
232
- }
233
- }
234
- _ => {}
235
- }
236
- }
237
-
238
- tables
239
- }
240
-
241
- /// Convert table cells to markdown format.
242
- fn cells_to_markdown(cells: &[Vec<String>]) -> String {
243
- if cells.is_empty() {
244
- return String::new();
245
- }
246
-
247
- let mut md = String::new();
248
-
249
- md.push('|');
250
- for cell in &cells[0] {
251
- md.push(' ');
252
- md.push_str(cell);
253
- md.push_str(" |");
254
- }
255
- md.push('\n');
256
-
257
- md.push('|');
258
- for _ in &cells[0] {
259
- md.push_str(" --- |");
260
- }
261
- md.push('\n');
262
-
263
- for row in &cells[1..] {
264
- md.push('|');
265
- for cell in row {
266
- md.push(' ');
267
- md.push_str(cell);
268
- md.push_str(" |");
269
- }
270
- md.push('\n');
271
- }
272
-
273
- md
274
- }
275
-
276
- /// Extract first heading as title if not in frontmatter.
277
- fn extract_title_from_content(content: &str) -> Option<String> {
278
- for line in content.lines() {
279
- if let Some(heading) = line.strip_prefix("# ") {
280
- return Some(heading.trim().to_string());
281
- }
282
- }
283
- None
284
- }
285
- }
286
-
287
- #[cfg(feature = "office")]
288
- impl Default for MarkdownExtractor {
289
- fn default() -> Self {
290
- Self::new()
291
- }
292
- }
293
-
294
- #[cfg(feature = "office")]
295
- impl Plugin for MarkdownExtractor {
296
- fn name(&self) -> &str {
297
- "markdown-extractor"
298
- }
299
-
300
- fn version(&self) -> String {
301
- env!("CARGO_PKG_VERSION").to_string()
302
- }
303
-
304
- fn initialize(&self) -> Result<()> {
305
- Ok(())
306
- }
307
-
308
- fn shutdown(&self) -> Result<()> {
309
- Ok(())
310
- }
311
-
312
- fn description(&self) -> &str {
313
- "Extracts content from Markdown files with YAML frontmatter and table support"
314
- }
315
-
316
- fn author(&self) -> &str {
317
- "Kreuzberg Team"
318
- }
319
- }
320
-
321
- #[cfg(feature = "office")]
322
- #[async_trait]
323
- impl DocumentExtractor for MarkdownExtractor {
324
- #[cfg_attr(feature = "otel", tracing::instrument(
325
- skip(self, content, _config),
326
- fields(
327
- extractor.name = self.name(),
328
- content.size_bytes = content.len(),
329
- )
330
- ))]
331
- async fn extract_bytes(
332
- &self,
333
- content: &[u8],
334
- mime_type: &str,
335
- _config: &ExtractionConfig,
336
- ) -> Result<ExtractionResult> {
337
- let text = String::from_utf8_lossy(content).into_owned();
338
-
339
- let (yaml, remaining_content) = Self::extract_frontmatter(&text);
340
-
341
- let mut metadata = if let Some(ref yaml_value) = yaml {
342
- Self::extract_metadata_from_yaml(yaml_value)
343
- } else {
344
- Metadata::default()
345
- };
346
-
347
- if !metadata.additional.contains_key("title")
348
- && let Some(title) = Self::extract_title_from_content(&remaining_content)
349
- {
350
- metadata.additional.insert("title".to_string(), title.into());
351
- }
352
-
353
- let parser = Parser::new_ext(&remaining_content, Options::ENABLE_TABLES);
354
- let events: Vec<Event> = parser.collect();
355
-
356
- let extracted_text = Self::extract_text_from_events(&events);
357
-
358
- let tables = Self::extract_tables_from_events(&events);
359
-
360
- Ok(ExtractionResult {
361
- content: extracted_text,
362
- mime_type: mime_type.to_string(),
363
- metadata,
364
- tables,
365
- detected_languages: None,
366
- chunks: None,
367
- images: None,
368
- pages: None,
369
- })
370
- }
371
-
372
- fn supported_mime_types(&self) -> &[&str] {
373
- &["text/markdown", "text/x-markdown", "text/x-gfm", "text/x-commonmark"]
374
- }
375
-
376
- fn priority(&self) -> i32 {
377
- 50
378
- }
379
- }
380
-
381
- #[cfg(all(test, feature = "office"))]
382
- mod tests {
383
- use super::*;
384
-
385
- #[test]
386
- fn test_can_extract_markdown_mime_types() {
387
- let extractor = MarkdownExtractor::new();
388
- let mime_types = extractor.supported_mime_types();
389
-
390
- assert!(mime_types.contains(&"text/markdown"));
391
- assert!(mime_types.contains(&"text/x-markdown"));
392
- assert!(mime_types.contains(&"text/x-gfm"));
393
- assert!(mime_types.contains(&"text/x-commonmark"));
394
- }
395
-
396
- #[test]
397
- fn test_extract_simple_markdown() {
398
- let content =
399
- b"# Header\n\nThis is a paragraph with **bold** and *italic* text.\n\n## Subheading\n\nMore content here.";
400
- let text = String::from_utf8_lossy(content).into_owned();
401
-
402
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
403
- assert!(yaml.is_none());
404
- assert!(!remaining.is_empty());
405
-
406
- let parser = Parser::new_ext(&remaining, Options::ENABLE_TABLES);
407
- let events: Vec<Event> = parser.collect();
408
- let extracted = MarkdownExtractor::extract_text_from_events(&events);
409
-
410
- assert!(extracted.contains("Header"));
411
- assert!(extracted.contains("This is a paragraph"));
412
- assert!(extracted.contains("bold"));
413
- assert!(extracted.contains("italic"));
414
- }
415
-
416
- #[test]
417
- fn test_extract_frontmatter_metadata() {
418
- let content = b"---\ntitle: My Document\nauthor: John Doe\ndate: 2024-01-15\nkeywords: rust, markdown, extraction\ndescription: A test document\n---\n\n# Content\n\nBody text.";
419
-
420
- let text = String::from_utf8_lossy(content).into_owned();
421
-
422
- let (yaml_opt, remaining) = MarkdownExtractor::extract_frontmatter(&text);
423
- assert!(yaml_opt.is_some());
424
- assert!(remaining.contains("# Content"));
425
-
426
- let yaml = yaml_opt.expect("Should extract YAML frontmatter");
427
- let metadata = MarkdownExtractor::extract_metadata_from_yaml(&yaml);
428
-
429
- assert_eq!(
430
- metadata.additional.get("title").and_then(|v| v.as_str()),
431
- Some("My Document")
432
- );
433
- assert_eq!(
434
- metadata.additional.get("author").and_then(|v| v.as_str()),
435
- Some("John Doe")
436
- );
437
- assert_eq!(metadata.created_at, Some("2024-01-15".to_string()));
438
- assert!(metadata.subject.is_some());
439
- assert!(
440
- metadata
441
- .subject
442
- .as_ref()
443
- .expect("Should have subject description")
444
- .contains("test document")
445
- );
446
- }
447
-
448
- #[test]
449
- fn test_extract_frontmatter_metadata_array_keywords() {
450
- let content = b"---\ntitle: Document\nkeywords:\n - rust\n - markdown\n - parsing\n---\n\nContent";
451
-
452
- let text = String::from_utf8_lossy(content).into_owned();
453
- let (yaml_opt, _remaining) = MarkdownExtractor::extract_frontmatter(&text);
454
-
455
- assert!(yaml_opt.is_some());
456
- let yaml = yaml_opt.expect("Should extract YAML frontmatter");
457
- let metadata = MarkdownExtractor::extract_metadata_from_yaml(&yaml);
458
-
459
- let keywords = metadata.additional.get("keywords").and_then(|v| v.as_str());
460
- assert!(keywords.is_some());
461
- let keywords_str = keywords.expect("Should extract keywords from metadata");
462
- assert!(keywords_str.contains("rust"));
463
- assert!(keywords_str.contains("markdown"));
464
- }
465
-
466
- #[tokio::test]
467
- async fn test_extract_tables() {
468
- let content = b"# Tables Example\n\n| Header 1 | Header 2 |\n|----------|----------|\n| Cell 1 | Cell 2 |\n| Cell 3 | Cell 4 |";
469
-
470
- let extractor = MarkdownExtractor::new();
471
- let result = extractor
472
- .extract_bytes(content, "text/markdown", &ExtractionConfig::default())
473
- .await
474
- .expect("Should extract markdown with tables");
475
-
476
- assert!(!result.tables.is_empty());
477
- let table = &result.tables[0];
478
- assert!(!table.cells.is_empty());
479
- assert_eq!(table.cells[0].len(), 2);
480
- assert!(!table.markdown.is_empty());
481
- }
482
-
483
- #[test]
484
- fn test_extract_without_frontmatter() {
485
- let content = b"# Main Title\n\nSome content\n\nMore text";
486
- let text = String::from_utf8_lossy(content).into_owned();
487
-
488
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
489
- assert!(yaml.is_none());
490
- assert_eq!(remaining, text);
491
-
492
- let title = MarkdownExtractor::extract_title_from_content(&remaining);
493
- assert_eq!(title, Some("Main Title".to_string()));
494
- }
495
-
496
- #[test]
497
- fn test_empty_document() {
498
- let content = b"";
499
- let text = String::from_utf8_lossy(content).into_owned();
500
-
501
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
502
- assert!(yaml.is_none());
503
- assert!(remaining.is_empty());
504
-
505
- let parser = Parser::new_ext(&remaining, Options::ENABLE_TABLES);
506
- let events: Vec<Event> = parser.collect();
507
- let extracted = MarkdownExtractor::extract_text_from_events(&events);
508
- assert!(extracted.is_empty());
509
- }
510
-
511
- #[test]
512
- fn test_whitespace_only_document() {
513
- let content = b" \n\n \n";
514
- let text = String::from_utf8_lossy(content).into_owned();
515
-
516
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
517
- assert!(yaml.is_none());
518
-
519
- let parser = Parser::new_ext(&remaining, Options::ENABLE_TABLES);
520
- let events: Vec<Event> = parser.collect();
521
- let extracted = MarkdownExtractor::extract_text_from_events(&events);
522
- assert!(extracted.trim().is_empty());
523
- }
524
-
525
- #[test]
526
- fn test_unicode_content() {
527
- let content = "# 日本語のタイトル\n\nこれは日本語の内容です。\n\n## Español\n\nEste es un documento en español.\n\n## Русский\n\nЭто русский текст.".as_bytes();
528
-
529
- let text = String::from_utf8_lossy(content).into_owned();
530
-
531
- let (yaml, remaining) = MarkdownExtractor::extract_frontmatter(&text);
532
- assert!(yaml.is_none());
533
-
534
- let parser = Parser::new_ext(&remaining, Options::ENABLE_TABLES);
535
- let events: Vec<Event> = parser.collect();
536
- let extracted = MarkdownExtractor::extract_text_from_events(&events);
537
-
538
- assert!(extracted.contains("日本語"));
539
- assert!(extracted.contains("Español"));
540
- assert!(extracted.contains("Русский"));
541
- }
542
-
543
- #[tokio::test]
544
- async fn test_full_extraction_with_frontmatter_and_tables() {
545
- let content = b"---\ntitle: Complete Document\nauthor: Test Author\ndate: 2024-01-20\n---\n\n# Document\n\nIntroduction text.\n\n| Name | Value |\n|------|-------|\n| A | 1 |\n| B | 2 |";
546
-
547
- let extractor = MarkdownExtractor::new();
548
- let result = extractor
549
- .extract_bytes(content, "text/x-markdown", &ExtractionConfig::default())
550
- .await
551
- .expect("Should extract markdown with frontmatter and tables");
552
-
553
- assert_eq!(result.mime_type, "text/x-markdown");
554
- assert!(result.content.contains("Introduction text"));
555
- assert_eq!(
556
- result.metadata.additional.get("title").and_then(|v| v.as_str()),
557
- Some("Complete Document")
558
- );
559
- assert_eq!(
560
- result.metadata.additional.get("author").and_then(|v| v.as_str()),
561
- Some("Test Author")
562
- );
563
- assert!(!result.tables.is_empty());
564
- }
565
-
566
- #[test]
567
- fn test_plugin_interface() {
568
- let extractor = MarkdownExtractor::new();
569
- assert_eq!(extractor.name(), "markdown-extractor");
570
- assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
571
- assert_eq!(extractor.priority(), 50);
572
- assert!(extractor.supported_mime_types().contains(&"text/markdown"));
573
- }
574
-
575
- #[test]
576
- fn test_cells_to_markdown() {
577
- let cells = vec![
578
- vec!["Header 1".to_string(), "Header 2".to_string()],
579
- vec!["Data 1".to_string(), "Data 2".to_string()],
580
- vec!["Data 3".to_string(), "Data 4".to_string()],
581
- ];
582
-
583
- let markdown = MarkdownExtractor::cells_to_markdown(&cells);
584
- assert!(markdown.contains("Header 1"));
585
- assert!(markdown.contains("Data 1"));
586
- assert!(markdown.contains("---"));
587
- let lines: Vec<&str> = markdown.lines().collect();
588
- assert!(lines.len() >= 4);
589
- }
590
-
591
- #[test]
592
- fn test_extract_markdown_with_links() {
593
- let content = b"# Page\n\nCheck [Google](https://google.com) and [Rust](https://rust-lang.org).";
594
- let text = String::from_utf8_lossy(content).into_owned();
595
-
596
- let parser = Parser::new_ext(&text, Options::ENABLE_TABLES);
597
- let events: Vec<Event> = parser.collect();
598
- let extracted = MarkdownExtractor::extract_text_from_events(&events);
599
-
600
- assert!(extracted.contains("Google"));
601
- assert!(extracted.contains("Rust"));
602
- }
603
-
604
- #[test]
605
- fn test_extract_markdown_with_code_blocks() {
606
- let content = b"# Code Example\n\n```rust\nfn main() {\n println!(\"Hello\");\n}\n```";
607
- let text = String::from_utf8_lossy(content).into_owned();
608
-
609
- let parser = Parser::new_ext(&text, Options::ENABLE_TABLES);
610
- let events: Vec<Event> = parser.collect();
611
- let extracted = MarkdownExtractor::extract_text_from_events(&events);
612
-
613
- assert!(extracted.contains("main"));
614
- assert!(extracted.contains("println"));
615
- }
616
-
617
- #[test]
618
- fn test_malformed_frontmatter_fallback() {
619
- let content = b"---\nthis: is: invalid: yaml:\n---\n\nContent here";
620
- let text = String::from_utf8_lossy(content).into_owned();
621
-
622
- let (yaml, _remaining) = MarkdownExtractor::extract_frontmatter(&text);
623
- let _ = yaml;
624
- }
625
-
626
- #[test]
627
- fn test_metadata_extraction_completeness() {
628
- let yaml_str = r#"
629
- title: "Test Document"
630
- author: "Test Author"
631
- date: "2024-01-15"
632
- keywords:
633
- - rust
634
- - markdown
635
- - testing
636
- description: "A test description"
637
- abstract: "Test abstract"
638
- subject: "Test subject"
639
- category: "Documentation"
640
- version: "1.2.3"
641
- language: "en"
642
- tags:
643
- - tag1
644
- - tag2
645
- custom_field: "custom_value"
646
- nested:
647
- organization: "Test Corp"
648
- contact:
649
- email: "test@example.com"
650
- "#;
651
-
652
- let yaml: YamlValue = serde_yaml_ng::from_str(yaml_str).expect("Valid YAML");
653
- let metadata = MarkdownExtractor::extract_metadata_from_yaml(&yaml);
654
-
655
- assert_eq!(metadata.created_at, Some("2024-01-15".to_string()));
656
- assert_eq!(
657
- metadata.additional.get("title").and_then(|v| v.as_str()),
658
- Some("Test Document")
659
- );
660
- assert_eq!(
661
- metadata.additional.get("author").and_then(|v| v.as_str()),
662
- Some("Test Author")
663
- );
664
-
665
- assert!(metadata.additional.contains_key("keywords"));
666
- let keywords = metadata
667
- .additional
668
- .get("keywords")
669
- .and_then(|v| v.as_str())
670
- .unwrap_or("");
671
- assert!(keywords.contains("rust"));
672
- assert!(keywords.contains("markdown"));
673
-
674
- assert_eq!(metadata.subject, Some("Test subject".to_string()));
675
-
676
- assert_eq!(
677
- metadata.additional.get("abstract").and_then(|v| v.as_str()),
678
- Some("Test abstract")
679
- );
680
-
681
- assert_eq!(
682
- metadata.additional.get("category").and_then(|v| v.as_str()),
683
- Some("Documentation")
684
- );
685
-
686
- assert!(metadata.additional.contains_key("tags"));
687
- let tags = metadata.additional.get("tags").and_then(|v| v.as_str()).unwrap_or("");
688
- assert!(tags.contains("tag1"));
689
- assert!(tags.contains("tag2"));
690
-
691
- assert_eq!(metadata.additional.get("language").and_then(|v| v.as_str()), Some("en"));
692
-
693
- assert_eq!(
694
- metadata.additional.get("version").and_then(|v| v.as_str()),
695
- Some("1.2.3")
696
- );
697
-
698
- assert_eq!(metadata.additional.len(), 8, "Should extract all standard fields");
699
- println!("\nSuccessfully extracted all 8 additional metadata fields");
700
- }
701
- }