kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,285 @@
1
+ use ahash::AHashMap;
2
+ use once_cell::sync::Lazy;
3
+ use regex::Regex;
4
+
5
+ /// Regular expression for matching Markdown code blocks.
6
+ /// Matches triple-backtick code blocks: ```...```
7
+ static MARKDOWN_CODE_BLOCK_REGEX: Lazy<Regex> =
8
+ Lazy::new(|| Regex::new(r"```[\s\S]*?```").expect("Markdown code block regex pattern is valid and should compile"));
9
+
10
+ /// Regular expression for matching Markdown inline code.
11
+ /// Matches single-backtick inline code: `code`
12
+ static MARKDOWN_INLINE_CODE_REGEX: Lazy<Regex> =
13
+ Lazy::new(|| Regex::new(r"`[^`\n]+`").expect("Markdown inline code regex pattern is valid and should compile"));
14
+
15
+ /// Regular expression for matching Markdown headers.
16
+ /// Matches headers like `# Header`, `## Header`, etc.
17
+ static MARKDOWN_HEADERS_REGEX: Lazy<Regex> =
18
+ Lazy::new(|| Regex::new(r"^#{1,6}\s+").expect("Markdown headers regex pattern is valid and should compile"));
19
+
20
+ /// Regular expression for matching Markdown list items.
21
+ /// Matches list markers: `- `, `* `, `+ ` at the start of lines
22
+ static MARKDOWN_LISTS_REGEX: Lazy<Regex> =
23
+ Lazy::new(|| Regex::new(r"^[ \t]*[-*+]\s+").expect("Markdown lists regex pattern is valid and should compile"));
24
+
25
+ /// Extracts and preserves Markdown code blocks and inline code by replacing them with placeholders.
26
+ ///
27
+ /// This function scans the input text for Markdown code blocks (``` ... ```) and inline code (` ... `),
28
+ /// replaces them with unique placeholders, and stores the original content in a hashmap.
29
+ ///
30
+ /// # Arguments
31
+ /// * `text` - The input text containing Markdown code
32
+ /// * `preserved` - A mutable hashmap to store the preserved code blocks
33
+ ///
34
+ /// # Returns
35
+ /// A new `String` with code blocks replaced by placeholders
36
+ pub fn extract_and_preserve_code(text: &str, preserved: &mut AHashMap<String, String>) -> String {
37
+ let mut result = text.to_string();
38
+ let mut code_block_id = 0;
39
+ let mut inline_code_id = 0;
40
+
41
+ // Extract code blocks first
42
+ result = MARKDOWN_CODE_BLOCK_REGEX
43
+ .replace_all(&result, |caps: &regex::Captures| {
44
+ let code_block = caps[0].to_string();
45
+ let placeholder = format!("__CODEBLOCK_{}__", code_block_id);
46
+ code_block_id += 1;
47
+ preserved.insert(placeholder.clone(), code_block);
48
+ placeholder
49
+ })
50
+ .to_string();
51
+
52
+ // Extract inline code
53
+ result = MARKDOWN_INLINE_CODE_REGEX
54
+ .replace_all(&result, |caps: &regex::Captures| {
55
+ let inline_code = caps[0].to_string();
56
+ let placeholder = format!("__INLINECODE_{}__", inline_code_id);
57
+ inline_code_id += 1;
58
+ preserved.insert(placeholder.clone(), inline_code);
59
+ placeholder
60
+ })
61
+ .to_string();
62
+
63
+ result
64
+ }
65
+
66
+ /// Restores preserved code blocks by replacing placeholders with their original content.
67
+ ///
68
+ /// # Arguments
69
+ /// * `text` - The text containing placeholders
70
+ /// * `preserved` - The hashmap containing the original code blocks
71
+ ///
72
+ /// # Returns
73
+ /// A new `String` with placeholders replaced by their original content
74
+ pub fn restore_preserved_blocks(text: &str, preserved: &AHashMap<String, String>) -> String {
75
+ if preserved.is_empty() {
76
+ return text.to_string();
77
+ }
78
+
79
+ let mut result = text.to_string();
80
+
81
+ for (placeholder, original_content) in preserved {
82
+ result = result.replace(placeholder, original_content);
83
+ }
84
+
85
+ result
86
+ }
87
+
88
/// Passes Markdown text through line by line, keeping every line unchanged.
///
/// Headers, list items, table rows and ordinary text are all kept verbatim;
/// nothing is filtered or rewritten. The only observable effects come from
/// splitting with [`str::lines`] and re-joining with `\n`: `\r\n` line endings
/// become `\n`, and a trailing line break is dropped.
///
/// # Arguments
/// * `text` - The input text with Markdown structure
///
/// # Returns
/// A new `String` containing the same lines joined with `\n`
pub fn preserve_markdown_structure(text: &str) -> String {
    // No per-line classification is needed: every kind of line is kept as-is,
    // so matching lines against the header/list patterns would not change the result.
    text.lines().collect::<Vec<_>>().join("\n")
}
120
+
121
+ /// Checks if a line is a Markdown header.
122
+ ///
123
+ /// # Arguments
124
+ /// * `line` - The line to check
125
+ ///
126
+ /// # Returns
127
+ /// `true` if the line is a Markdown header, `false` otherwise
128
+ #[inline]
129
+ pub fn is_markdown_header(line: &str) -> bool {
130
+ MARKDOWN_HEADERS_REGEX.is_match(line)
131
+ }
132
+
133
+ /// Checks if a line is a Markdown list item.
134
+ ///
135
+ /// # Arguments
136
+ /// * `line` - The line to check
137
+ ///
138
+ /// # Returns
139
+ /// `true` if the line is a Markdown list item, `false` otherwise
140
+ #[inline]
141
+ pub fn is_markdown_list(line: &str) -> bool {
142
+ MARKDOWN_LISTS_REGEX.is_match(line)
143
+ }
144
+
145
/// Reports whether a line looks like a Markdown table row.
///
/// Surrounding whitespace is ignored; the line qualifies when the remaining
/// text both begins and ends with `|`. A line consisting of a single `|`
/// therefore also qualifies.
///
/// # Arguments
/// * `line` - The line to check
///
/// # Returns
/// `true` if the line appears to be a Markdown table row, `false` otherwise
#[inline]
pub fn is_markdown_table(line: &str) -> bool {
    let row = line.trim();
    let first = row.chars().next();
    let last = row.chars().next_back();
    first == Some('|') && last == Some('|')
}
157
+
158
#[cfg(test)]
mod tests {
    use super::*;

    /// A fenced block is stored in the map and replaced by the first block placeholder.
    #[test]
    fn test_extract_code_block() {
        let source = "Text before\n```rust\nfn main() {}\n```\nText after";
        let mut store = AHashMap::new();
        let replaced = extract_and_preserve_code(source, &mut store);

        assert_eq!(store.len(), 1);
        assert!(store.values().any(|saved| saved.contains("fn main()")));
        assert!(replaced.contains("__CODEBLOCK_0__"));
    }

    /// An inline span is stored with its backticks and replaced by the first inline placeholder.
    #[test]
    fn test_extract_inline_code() {
        let source = "Use the `println!` macro";
        let mut store = AHashMap::new();
        let replaced = extract_and_preserve_code(source, &mut store);

        assert_eq!(store.len(), 1);
        assert!(store.values().any(|saved| saved == "`println!`"));
        assert!(replaced.contains("__INLINECODE_0__"));
    }

    /// Blocks and inline spans are numbered independently and survive a round trip.
    #[test]
    fn test_multiple_code_blocks() {
        let source =
            "Start ```rust\nlet x = 1;\n``` middle `inline1` text ```python\nprint('hi')\n``` and `inline2` end";
        let expected = [
            ("__CODEBLOCK_0__", "```rust\nlet x = 1;\n```"),
            ("__CODEBLOCK_1__", "```python\nprint('hi')\n```"),
            ("__INLINECODE_0__", "`inline1`"),
            ("__INLINECODE_1__", "`inline2`"),
        ];

        let mut store = AHashMap::new();
        let replaced = extract_and_preserve_code(source, &mut store);

        assert_eq!(store.len(), 4);
        for &(key, _) in &expected {
            assert!(store.contains_key(key));
        }
        for &(key, snippet) in &expected {
            assert_eq!(store.get(key).unwrap(), snippet);
        }

        let restored = restore_preserved_blocks(&replaced, &store);
        for &(_, snippet) in &expected {
            assert!(restored.contains(snippet));
        }
        assert!(!restored.contains("__CODEBLOCK_"));
        assert!(!restored.contains("__INLINECODE_"));
    }

    /// Placeholders present in the text are replaced by the mapped content.
    #[test]
    fn test_restore_preserved_blocks() {
        let mut store = AHashMap::new();
        store.insert("__CODEBLOCK_0__".to_string(), "```code```".to_string());
        store.insert("__INLINECODE_0__".to_string(), "`inline`".to_string());

        let restored = restore_preserved_blocks("Text __CODEBLOCK_0__ and __INLINECODE_0__ here", &store);

        assert!(restored.contains("```code```"));
        assert!(restored.contains("`inline`"));
        assert!(!restored.contains("__CODEBLOCK_0__"));
        assert!(!restored.contains("__INLINECODE_0__"));
    }

    /// The round trip reproduces the input regardless of the map's iteration order.
    #[test]
    fn test_hashmap_order_independence() {
        let source = "Text `a` and `b` and `c` here";
        let mut store = AHashMap::new();
        let replaced = extract_and_preserve_code(source, &mut store);
        assert_eq!(store.len(), 3);

        let restored = restore_preserved_blocks(&replaced, &store);
        for snippet in ["`a`", "`b`", "`c`"] {
            assert!(restored.contains(snippet));
        }
        assert_eq!(restored, "Text `a` and `b` and `c` here");
    }

    /// Header lines come out of the structure pass unchanged.
    #[test]
    fn test_preserve_markdown_structure() {
        let output = preserve_markdown_structure("# Header 1\n## Header 2\n### Header 3\nRegular text");

        for header in ["# Header 1", "## Header 2", "### Header 3"] {
            assert!(output.contains(header));
        }
    }

    /// Only lines starting with a `#` marker count as headers.
    #[test]
    fn test_is_markdown_header() {
        for header in ["# Header 1", "## Header 2", "### Header 3"] {
            assert!(is_markdown_header(header));
        }
        for other in ["Regular text", "- List item"] {
            assert!(!is_markdown_header(other));
        }
    }

    /// Bullet markers, optionally indented, count as list items.
    #[test]
    fn test_is_markdown_list() {
        for item in ["- Item 1", "* Item 2", "+ Item 3", " - Indented item"] {
            assert!(is_markdown_list(item));
        }
        for other in ["Regular text", "# Header"] {
            assert!(!is_markdown_list(other));
        }
    }

    /// Lines wrapped in pipes count as table rows.
    #[test]
    fn test_is_markdown_table() {
        for row in ["| Header 1 | Header 2 |", "|----------|----------|", "| Cell 1 | Cell 2 |"] {
            assert!(is_markdown_table(row));
        }
        for other in ["Regular text", "- List item"] {
            assert!(!is_markdown_table(other));
        }
    }

    /// Forcing each lazy static proves that all patterns compile.
    #[test]
    fn test_lazy_regex_initialization() {
        let _ = Lazy::force(&MARKDOWN_CODE_BLOCK_REGEX);
        let _ = Lazy::force(&MARKDOWN_INLINE_CODE_REGEX);
        let _ = Lazy::force(&MARKDOWN_HEADERS_REGEX);
        let _ = Lazy::force(&MARKDOWN_LISTS_REGEX);
    }
}