kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -0,0 +1,157 @@
1
+ //! LaTeX environment processing.
2
+ //!
3
+ //! This module handles LaTeX environments like itemize, enumerate, description,
4
+ //! tabular, and table environments.
5
+
6
+ use super::commands::process_line;
7
+ use super::utilities::{clean_text, collect_environment, extract_braced, extract_env_name};
8
+ use crate::types::Table;
9
+
10
+ /// Processes a list environment (itemize, enumerate, or description).
11
+ ///
12
+ /// Converts LaTeX lists into markdown-style lists with proper nesting.
13
+ pub fn process_list(content: &str, list_type: &str, output: &mut String) {
14
+ let lines: Vec<&str> = content.lines().collect();
15
+ let mut item_num = 1;
16
+ let mut i = 0;
17
+
18
+ while i < lines.len() {
19
+ let line = lines[i];
20
+ let trimmed = line.trim();
21
+
22
+ // Handle nested lists
23
+ if trimmed.contains("\\begin{")
24
+ && let Some(env_name) = extract_env_name(trimmed)
25
+ && (env_name == "itemize" || env_name == "enumerate" || env_name == "description")
26
+ {
27
+ let (nested_content, new_i) = collect_environment(&lines, i, &env_name);
28
+ let current_output_len = output.len();
29
+ process_list(&nested_content, &env_name, output);
30
+ let nested_output = output[current_output_len..].to_string();
31
+ output.truncate(current_output_len);
32
+ // Indent nested list
33
+ for nested_line in nested_output.lines() {
34
+ output.push_str(" ");
35
+ output.push_str(nested_line);
36
+ output.push('\n');
37
+ }
38
+ i = new_i;
39
+ continue;
40
+ }
41
+
42
+ // Handle \item
43
+ if trimmed.starts_with("\\item")
44
+ && let Some(pos) = trimmed.find("\\item")
45
+ {
46
+ let after = trimmed[pos + 5..].trim();
47
+
48
+ // Handle \item[label] for description lists
49
+ if after.starts_with('[')
50
+ && let Some(bracket_end) = after.find(']')
51
+ {
52
+ let label = after[1..bracket_end].to_string();
53
+ let text = after[bracket_end + 1..].trim().to_string();
54
+ if list_type == "description" {
55
+ let processed_text = process_line(&text);
56
+ output.push_str(&format!("{}: {}\n", label, processed_text));
57
+ item_num += 1;
58
+ i += 1;
59
+ continue;
60
+ }
61
+ }
62
+
63
+ // Regular list item
64
+ let prefix = if list_type == "enumerate" {
65
+ format!("{}. ", item_num)
66
+ } else {
67
+ "- ".to_string()
68
+ };
69
+ output.push_str(&prefix);
70
+
71
+ let item_text = process_line(after);
72
+ output.push_str(item_text.trim());
73
+ output.push('\n');
74
+ item_num += 1;
75
+ }
76
+
77
+ i += 1;
78
+ }
79
+ output.push('\n');
80
+ }
81
+
82
+ /// Processes a tabular environment.
83
+ ///
84
+ /// Converts LaTeX tables into markdown tables and creates Table structures.
85
+ pub fn process_table(content: &str, output: &mut String, tables: &mut Vec<Table>) {
86
+ let lines: Vec<&str> = content.lines().collect();
87
+ let mut rows: Vec<Vec<String>> = Vec::new();
88
+
89
+ for line in lines {
90
+ let trimmed = line.trim();
91
+ if trimmed.starts_with("\\hline") || trimmed.is_empty() || trimmed.contains("\\begin{tabular}") {
92
+ continue;
93
+ }
94
+
95
+ let row_str = trimmed.replace("\\\\", "");
96
+ let cells: Vec<String> = row_str
97
+ .split('&')
98
+ .map(|s| clean_text(s.trim()))
99
+ .filter(|s| !s.is_empty())
100
+ .collect();
101
+
102
+ if !cells.is_empty() {
103
+ rows.push(cells);
104
+ }
105
+ }
106
+
107
+ if !rows.is_empty() {
108
+ let mut markdown = String::new();
109
+ for (i, row) in rows.iter().enumerate() {
110
+ markdown.push('|');
111
+ for cell in row {
112
+ markdown.push_str(&format!(" {} |", cell));
113
+ }
114
+ markdown.push('\n');
115
+
116
+ // Add header separator after first row
117
+ if i == 0 && rows.len() > 1 {
118
+ markdown.push('|');
119
+ for _ in row {
120
+ markdown.push_str(" --- |");
121
+ }
122
+ markdown.push('\n');
123
+ }
124
+ }
125
+
126
+ output.push_str(&markdown);
127
+
128
+ let table = Table {
129
+ cells: rows,
130
+ markdown: markdown.clone(),
131
+ page_number: 1,
132
+ };
133
+ tables.push(table);
134
+ }
135
+ }
136
+
137
+ /// Processes a table environment with caption.
138
+ ///
139
+ /// Extracts the caption and processes the embedded tabular environment.
140
+ pub fn process_table_with_caption(content: &str, output: &mut String, tables: &mut Vec<Table>) {
141
+ // Extract and add caption if present
142
+ if content.contains("\\caption{")
143
+ && let Some(caption) = extract_braced(content, "caption")
144
+ {
145
+ output.push_str(&caption);
146
+ output.push('\n');
147
+ }
148
+
149
+ // Process the tabular environment inside
150
+ if content.contains("\\begin{tabular}")
151
+ && let Some(start) = content.find("\\begin{tabular}")
152
+ && let Some(end) = content.find("\\end{tabular}")
153
+ {
154
+ let tabular_content = &content[start..end + 13];
155
+ process_table(tabular_content, output, tables);
156
+ }
157
+ }
@@ -0,0 +1,27 @@
1
+ //! Metadata extraction for LaTeX documents.
2
+ //!
3
+ //! This module handles extraction of document metadata like title, author, and date
4
+ //! from LaTeX preamble commands.
5
+
6
+ use super::utilities::extract_braced;
7
+ use crate::types::Metadata;
8
+
9
+ /// Extracts metadata from a LaTeX line.
10
+ ///
11
+ /// Looks for \title{}, \author{}, and \date{} commands and populates
12
+ /// the provided Metadata structure.
13
+ pub fn extract_metadata_from_line(line: &str, metadata: &mut Metadata) {
14
+ if line.starts_with("\\title{") {
15
+ if let Some(title) = extract_braced(line, "title") {
16
+ metadata.additional.insert("title".to_string(), title.into());
17
+ }
18
+ } else if line.starts_with("\\author{") {
19
+ if let Some(author) = extract_braced(line, "author") {
20
+ metadata.additional.insert("author".to_string(), author.into());
21
+ }
22
+ } else if line.starts_with("\\date{")
23
+ && let Some(date) = extract_braced(line, "date")
24
+ {
25
+ metadata.additional.insert("date".to_string(), date.into());
26
+ }
27
+ }
@@ -0,0 +1,146 @@
1
+ //! Native Rust LaTeX text extractor.
2
+ //!
3
+ //! This extractor provides comprehensive LaTeX document parsing and text extraction.
4
+ //!
5
+ //! Features:
6
+ //! - Metadata extraction: title, author, date from \title{}, \author{}, \date{}
7
+ //! - Section hierarchy: \section{}, \subsection{}, \subsubsection{}, etc.
8
+ //! - Inline formatting: \emph{}, \textbf{}, \textit{}, \texttt{}, \underline{}
9
+ //! - Lists: itemize, enumerate, description environments
10
+ //! - Tables: tabular environment parsing
11
+ //! - Math: inline ($...$) and display (\[...\]) math preservation
12
+ //! - Unicode support
13
+ //!
14
+ //! Requires the `office` feature.
15
+
16
+ mod commands;
17
+ mod environments;
18
+ mod metadata;
19
+ mod parser;
20
+ mod utilities;
21
+
22
+ use crate::Result;
23
+ use crate::core::config::ExtractionConfig;
24
+ use crate::plugins::{DocumentExtractor, Plugin};
25
+ use crate::types::{ExtractionResult, Metadata, Table};
26
+ use async_trait::async_trait;
27
+
28
+ use parser::LatexParser;
29
+
30
+ /// LaTeX document extractor
31
+ pub struct LatexExtractor;
32
+
33
+ impl LatexExtractor {
34
+ /// Create a new LaTeX extractor.
35
+ pub fn new() -> Self {
36
+ Self
37
+ }
38
+
39
+ /// Parse LaTeX content and extract text.
40
+ fn extract_from_latex(content: &str) -> (String, Metadata, Vec<Table>) {
41
+ let mut parser = LatexParser::new(content);
42
+ parser.parse()
43
+ }
44
+ }
45
+
46
+ impl Default for LatexExtractor {
47
+ fn default() -> Self {
48
+ Self::new()
49
+ }
50
+ }
51
+
52
+ impl Plugin for LatexExtractor {
53
+ fn name(&self) -> &str {
54
+ "latex-extractor"
55
+ }
56
+
57
+ fn version(&self) -> String {
58
+ env!("CARGO_PKG_VERSION").to_string()
59
+ }
60
+
61
+ fn initialize(&self) -> Result<()> {
62
+ Ok(())
63
+ }
64
+
65
+ fn shutdown(&self) -> Result<()> {
66
+ Ok(())
67
+ }
68
+
69
+ fn description(&self) -> &str {
70
+ "Native Rust LaTeX document extractor with metadata and table support"
71
+ }
72
+
73
+ fn author(&self) -> &str {
74
+ "Kreuzberg Team"
75
+ }
76
+ }
77
+
78
+ #[async_trait]
79
+ impl DocumentExtractor for LatexExtractor {
80
+ #[cfg_attr(feature = "otel", tracing::instrument(
81
+ skip(self, content, _config),
82
+ fields(
83
+ extractor.name = self.name(),
84
+ content.size_bytes = content.len(),
85
+ )
86
+ ))]
87
+ async fn extract_bytes(
88
+ &self,
89
+ content: &[u8],
90
+ mime_type: &str,
91
+ _config: &ExtractionConfig,
92
+ ) -> Result<ExtractionResult> {
93
+ let latex_str = String::from_utf8_lossy(content).to_string();
94
+ let (text, metadata, tables) = Self::extract_from_latex(&latex_str);
95
+
96
+ Ok(ExtractionResult {
97
+ content: text,
98
+ mime_type: mime_type.to_string(),
99
+ metadata,
100
+ tables,
101
+ detected_languages: None,
102
+ chunks: None,
103
+ images: None,
104
+ djot_content: None,
105
+ pages: None,
106
+ elements: None,
107
+ })
108
+ }
109
+
110
+ fn supported_mime_types(&self) -> &[&str] {
111
+ &["application/x-latex", "text/x-tex"]
112
+ }
113
+
114
+ fn priority(&self) -> i32 {
115
+ 50
116
+ }
117
+ }
118
+
119
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone `\title{...}` command yields its text under the `title` key.
    #[test]
    fn test_basic_title_extraction() {
        let (_, meta, _) = LatexExtractor::extract_from_latex("\\title{Hello World}");
        let title = meta.additional.get("title").and_then(|value| value.as_str());
        assert_eq!(title, Some("Hello World"));
    }

    /// A lone `\author{...}` command registers an `author` key.
    #[test]
    fn test_author_extraction() {
        let (_, meta, _) = LatexExtractor::extract_from_latex("\\author{John Doe}");
        assert!(meta.additional.contains_key("author"));
    }

    /// A section heading inside the document body appears in the extracted text.
    #[test]
    fn test_section_extraction() {
        let source = "\\begin{document}\\section{Introduction}\\end{document}";
        let (text, _, _) = LatexExtractor::extract_from_latex(source);
        assert!(text.contains("Introduction"));
    }
}
@@ -0,0 +1,231 @@
1
+ //! Core LaTeX parser implementation.
2
+ //!
3
+ //! This module contains the main LatexParser struct and the core parsing logic
4
+ //! that orchestrates document structure extraction.
5
+
6
+ use super::commands::process_line;
7
+ use super::environments::{process_list, process_table, process_table_with_caption};
8
+ use super::metadata::extract_metadata_from_line;
9
+ use super::utilities::{collect_environment, extract_braced, extract_env_name};
10
+ use crate::types::{Metadata, Table};
11
+
12
+ /// LaTeX parser state machine.
13
+ ///
14
+ /// Maintains parsing state including metadata, tables, and output as it
15
+ /// processes a LaTeX document line by line.
16
+ pub struct LatexParser<'a> {
17
+ source: &'a str,
18
+ metadata: Metadata,
19
+ tables: Vec<Table>,
20
+ output: String,
21
+ }
22
+
23
+ impl<'a> LatexParser<'a> {
24
+ /// Creates a new LaTeX parser for the given source.
25
+ pub fn new(source: &'a str) -> Self {
26
+ Self {
27
+ source,
28
+ metadata: Metadata::default(),
29
+ tables: Vec::new(),
30
+ output: String::new(),
31
+ }
32
+ }
33
+
34
+ /// Parses the LaTeX document and returns extracted content, metadata, and tables.
35
+ pub fn parse(&mut self) -> (String, Metadata, Vec<Table>) {
36
+ let lines: Vec<&str> = self.source.lines().collect();
37
+ let mut in_document = false;
38
+ let mut skip_until_end = None::<String>;
39
+ let mut i = 0;
40
+
41
+ // Detect plain TeX documents (no \begin{document})
42
+ let is_plain_tex = self.source.contains("\\bye") && !self.source.contains("\\begin{document}");
43
+ if is_plain_tex {
44
+ in_document = true;
45
+ }
46
+
47
+ while i < lines.len() {
48
+ let line = lines[i];
49
+ let trimmed = line.trim();
50
+
51
+ // Handle environments we're skipping
52
+ if let Some(ref env) = skip_until_end {
53
+ if trimmed.contains(&format!("\\end{{{}}}", env)) {
54
+ skip_until_end = None;
55
+ }
56
+ i += 1;
57
+ continue;
58
+ }
59
+
60
+ // Handle plain TeX end marker
61
+ if is_plain_tex && trimmed.contains("\\bye") {
62
+ break;
63
+ }
64
+
65
+ // Extract metadata from preamble
66
+ if !in_document && !is_plain_tex {
67
+ extract_metadata_from_line(trimmed, &mut self.metadata);
68
+ }
69
+
70
+ // Handle \begin{document}
71
+ if !is_plain_tex && trimmed.contains("\\begin{document}") {
72
+ in_document = true;
73
+
74
+ // Handle single-line documents
75
+ if trimmed.contains("\\end{document}") {
76
+ self.process_single_line_document(trimmed);
77
+ break;
78
+ }
79
+
80
+ i += 1;
81
+ continue;
82
+ }
83
+
84
+ // Handle \end{document}
85
+ if !is_plain_tex && trimmed.contains("\\end{document}") {
86
+ break;
87
+ }
88
+
89
+ // Process document content
90
+ if in_document {
91
+ if self.process_environments(&lines, trimmed, &mut i, &mut skip_until_end) {
92
+ continue;
93
+ }
94
+
95
+ self.process_sections_and_content(trimmed, &lines, &mut i);
96
+ }
97
+
98
+ i += 1;
99
+ }
100
+
101
+ let content = self.output.trim().to_string();
102
+ (content, self.metadata.clone(), self.tables.clone())
103
+ }
104
+
105
+ /// Processes a single-line document (both \begin and \end on same line).
106
+ fn process_single_line_document(&mut self, trimmed: &str) {
107
+ let Some(begin_pos) = trimmed.find("\\begin{document}") else {
108
+ return;
109
+ };
110
+ let Some(end_pos) = trimmed.find("\\end{document}") else {
111
+ return;
112
+ };
113
+ let content_between = trimmed[begin_pos + 16..end_pos].trim();
114
+ if !content_between.is_empty() {
115
+ if content_between.starts_with("\\section{") {
116
+ if let Some(title) = extract_braced(content_between, "section") {
117
+ self.output.push_str(&format!("\n# {}\n\n", title));
118
+ }
119
+ } else {
120
+ let processed = process_line(content_between);
121
+ if !processed.is_empty() {
122
+ self.output.push_str(&processed);
123
+ self.output.push('\n');
124
+ }
125
+ }
126
+ }
127
+ }
128
+
129
+ /// Processes LaTeX environments (lists, tables, math).
130
+ ///
131
+ /// Returns true if an environment was processed and the line index was updated.
132
+ fn process_environments(
133
+ &mut self,
134
+ lines: &[&str],
135
+ trimmed: &str,
136
+ i: &mut usize,
137
+ skip_until_end: &mut Option<String>,
138
+ ) -> bool {
139
+ if !trimmed.contains("\\begin{") {
140
+ return false;
141
+ }
142
+
143
+ let Some(env_name) = extract_env_name(trimmed) else {
144
+ return false;
145
+ };
146
+
147
+ match env_name.as_str() {
148
+ "itemize" | "enumerate" | "description" => {
149
+ let (env_content, new_i) = collect_environment(lines, *i, &env_name);
150
+ process_list(&env_content, &env_name, &mut self.output);
151
+ *i = new_i;
152
+ true
153
+ }
154
+ "tabular" => {
155
+ let (env_content, new_i) = collect_environment(lines, *i, "tabular");
156
+ process_table(&env_content, &mut self.output, &mut self.tables);
157
+ *i = new_i;
158
+ true
159
+ }
160
+ "table" => {
161
+ let (env_content, new_i) = collect_environment(lines, *i, "table");
162
+ process_table_with_caption(&env_content, &mut self.output, &mut self.tables);
163
+ *i = new_i;
164
+ true
165
+ }
166
+ "equation" | "align" | "gather" | "multline" => {
167
+ let (env_content, new_i) = collect_environment(lines, *i, &env_name);
168
+ self.output.push_str("$$\\begin{");
169
+ self.output.push_str(&env_name);
170
+ self.output.push_str("}\n");
171
+ self.output.push_str(&env_content);
172
+ self.output.push_str("\\end{");
173
+ self.output.push_str(&env_name);
174
+ self.output.push_str("}$$\n\n");
175
+ *i = new_i;
176
+ true
177
+ }
178
+ _ => {
179
+ *skip_until_end = Some(env_name);
180
+ false
181
+ }
182
+ }
183
+ }
184
+
185
+ /// Processes section headings, display math, and regular content.
186
+ fn process_sections_and_content(&mut self, trimmed: &str, lines: &[&str], i: &mut usize) {
187
+ if trimmed.starts_with("\\section{") {
188
+ if let Some(title) = extract_braced(trimmed, "section") {
189
+ self.output.push_str(&format!("\n# {}\n\n", title));
190
+ }
191
+ } else if trimmed.starts_with("\\subsection{") {
192
+ if let Some(title) = extract_braced(trimmed, "subsection") {
193
+ self.output.push_str(&format!("## {}\n\n", title));
194
+ }
195
+ } else if trimmed.starts_with("\\subsubsection{") {
196
+ if let Some(title) = extract_braced(trimmed, "subsubsection") {
197
+ self.output.push_str(&format!("### {}\n\n", title));
198
+ }
199
+ } else if trimmed.starts_with("\\[") {
200
+ // Display math mode
201
+ self.process_display_math(trimmed, lines, i);
202
+ } else if !trimmed.is_empty() && !trimmed.starts_with("%") {
203
+ // Regular content
204
+ let processed = process_line(trimmed);
205
+ if !processed.is_empty() {
206
+ self.output.push_str(&processed);
207
+ self.output.push('\n');
208
+ }
209
+ }
210
+ }
211
+
212
+ /// Processes display math mode \[...\].
213
+ fn process_display_math(&mut self, trimmed: &str, lines: &[&str], i: &mut usize) {
214
+ let mut math_content = trimmed.to_string();
215
+ if !trimmed.contains("\\]") {
216
+ // Math spans multiple lines
217
+ *i += 1;
218
+ while *i < lines.len() {
219
+ let math_line = lines[*i];
220
+ math_content.push('\n');
221
+ math_content.push_str(math_line);
222
+ if math_line.trim().contains("\\]") {
223
+ break;
224
+ }
225
+ *i += 1;
226
+ }
227
+ }
228
+ self.output.push_str(&math_content);
229
+ self.output.push('\n');
230
+ }
231
+ }
@@ -0,0 +1,126 @@
1
+ //! Utility functions for LaTeX parsing.
2
+ //!
3
+ //! This module contains helper functions for text cleaning, brace extraction,
4
+ //! and other common operations used throughout the LaTeX parser.
5
+
6
+ /// Extracts content from within braces for a given command.
7
+ ///
8
+ /// Example: `\title{Hello World}` with command "title" returns "Hello World"
9
+ pub fn extract_braced(text: &str, command: &str) -> Option<String> {
10
+ let pattern = format!("\\{}{{", command);
11
+ if let Some(start) = text.find(&pattern) {
12
+ let after = &text[start + pattern.len()..];
13
+ let mut depth = 1;
14
+ let mut content = String::new();
15
+
16
+ for ch in after.chars() {
17
+ match ch {
18
+ '{' => {
19
+ depth += 1;
20
+ content.push(ch);
21
+ }
22
+ '}' => {
23
+ depth -= 1;
24
+ if depth == 0 {
25
+ return Some(clean_text(&content));
26
+ }
27
+ content.push(ch);
28
+ }
29
+ _ => content.push(ch),
30
+ }
31
+ }
32
+ }
33
+ None
34
+ }
35
+
36
/// Reads braced content from a character iterator.
///
/// Leading whitespace is consumed first. If the next character is not `{`,
/// `None` is returned and that character is left in the iterator. Otherwise
/// everything up to the matching `}` is consumed and returned without the
/// outer braces; the iterator is left just past the closing brace.
///
/// Nested `{...}` groups are kept intact. A backslash escapes the character
/// after it, so `\{` and `\}` are copied through literally and do not affect
/// nesting. If the input ends before the group is closed, the text read so
/// far is returned.
pub fn read_braced_from_chars(chars: &mut std::iter::Peekable<std::str::Chars>) -> Option<String> {
    // Skip whitespace before opening brace
    while chars.next_if(|c| c.is_whitespace()).is_some() {}

    // Consume the opening brace, or bail out without touching the iterator.
    chars.next_if_eq(&'{')?;

    let mut content = String::new();
    let mut depth = 1usize;

    while let Some(c) = chars.next() {
        match c {
            '\\' => {
                // Copy the escape pair verbatim so an escaped brace is never
                // mistaken for a group delimiter.
                content.push(c);
                if let Some(escaped) = chars.next() {
                    content.push(escaped);
                }
            }
            '{' => {
                depth += 1;
                content.push(c);
            }
            '}' => {
                depth -= 1;
                if depth == 0 {
                    return Some(content);
                }
                content.push(c);
            }
            _ => content.push(c),
        }
    }

    // Unterminated group: return what was read.
    Some(content)
}
77
+
78
/// Extracts environment name from a \begin{} statement.
///
/// Looks at the first `\begin{` on the line and returns the text up to the
/// next `}`. Returns `None` if the line has no `\begin{` or the brace is
/// never closed.
///
/// Example: `\begin{itemize}` returns "itemize"
pub fn extract_env_name(line: &str) -> Option<String> {
    let (_, after_begin) = line.split_once("\\begin{")?;
    let (name, _) = after_begin.split_once('}')?;
    Some(name.to_string())
}
90
+
91
/// Cleans LaTeX text by resolving common escape sequences.
///
/// A `\\` line break becomes a newline, and the escapes `\&`, `\#`, `\_`,
/// `\{`, `\}` and `\%` become the bare character. Replacements are applied
/// in that order, one after another, and the result is trimmed of
/// surrounding whitespace.
pub fn clean_text(text: &str) -> String {
    // Order matters: `\\` must be rewritten before the single-backslash escapes.
    const REPLACEMENTS: [(&str, &str); 7] = [
        ("\\\\", "\n"),
        ("\\&", "&"),
        ("\\#", "#"),
        ("\\_", "_"),
        ("\\{", "{"),
        ("\\}", "}"),
        ("\\%", "%"),
    ];

    let unescaped = REPLACEMENTS
        .iter()
        .fold(text.to_owned(), |acc, &(from, to)| acc.replace(from, to));
    unescaped.trim().to_owned()
}
106
+
107
/// Collects content of an environment from begin to end.
///
/// `start_idx` is the index of the line holding `\begin{env_name}`;
/// collection starts on the following line. Each collected line is appended
/// with a trailing newline. The line that closes the environment is not
/// included, and any text on it is discarded.
///
/// Nested environments of the same name are tracked: each further
/// `\begin{env_name}` must be matched by its own `\end{env_name}` before the
/// outer environment is considered closed, and the nested lines are kept in
/// the returned content.
///
/// Returns the content and the index of the line after \end{environment}.
/// If the environment is never closed, all remaining lines are returned
/// together with an index past the last line.
pub fn collect_environment(lines: &[&str], start_idx: usize, env_name: &str) -> (String, usize) {
    let begin_marker = format!("\\begin{{{}}}", env_name);
    let end_marker = format!("\\end{{{}}}", env_name);

    let mut content = String::new();
    // The opening marker on `start_idx` accounts for the first level.
    let mut depth = 1usize;
    let mut i = start_idx + 1;

    while i < lines.len() {
        let line = lines[i];

        // Walk the markers on this line in order of appearance so that
        // several opens and closes on one line are balanced correctly.
        let mut rest = line;
        loop {
            let next_open = rest.find(begin_marker.as_str());
            let next_close = rest.find(end_marker.as_str());
            match (next_open, next_close) {
                (Some(open), Some(close)) if open < close => {
                    depth += 1;
                    rest = &rest[open + begin_marker.len()..];
                }
                (Some(open), None) => {
                    depth += 1;
                    rest = &rest[open + begin_marker.len()..];
                }
                (_, Some(close)) => {
                    depth -= 1;
                    if depth == 0 {
                        return (content, i + 1);
                    }
                    rest = &rest[close + end_marker.len()..];
                }
                (None, None) => break,
            }
        }

        content.push_str(line);
        content.push('\n');
        i += 1;
    }

    (content, i)
}